diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ff0e6a90e571e23ab899b057a1e7141c24547d9..15f95d2e580c94ec4592b6c9c6bfa5cf16de136c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -187,6 +187,11 @@ option(LLVM_INSTALL_UTILS "Include utility binaries in the 'install' target." OF option(LLVM_INSTALL_TOOLCHAIN_ONLY "Only include toolchain files in the 'install' target." OFF) +# Unfortunatly Clang is too eager to search directories for module maps, which can cause the +# installed version of the maps to be found when building LLVM from source. Therefore we turn off +# the installation by default. See llvm.org/PR31905. +option(LLVM_INSTALL_MODULEMAPS "Install the modulemap files in the 'install' target." OFF) + option(LLVM_USE_FOLDERS "Enable solution folders in Visual Studio. Disable for Express versions." ON) if ( LLVM_USE_FOLDERS ) set_property(GLOBAL PROPERTY USE_FOLDERS ON) @@ -365,6 +370,10 @@ option(LLVM_ENABLE_LLD "Use lld as C and C++ linker." OFF) option(LLVM_ENABLE_PEDANTIC "Compile with pedantic enabled." ON) option(LLVM_ENABLE_WERROR "Fail and stop if a warning is triggered." OFF) +if (MSVC) + option(LLVM_ENABLE_INCREMENTAL_LINK "Link incrementally. Enabling it might produce slower executables." OFF) +endif() + option(LLVM_ENABLE_DUMP "Enable dump functions even when assertions are disabled" OFF) if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) @@ -399,8 +408,8 @@ option(LLVM_USE_OPROFILE option(LLVM_EXTERNALIZE_DEBUGINFO "Generate dSYM files and strip executables and libraries (Darwin Only)" OFF) -option(LLVM_CODESIGNING_IDENTITY - "Sign executables and dylibs with the given identity (Darwin Only)" OFF) +set(LLVM_CODESIGNING_IDENTITY "" CACHE STRING + "Sign executables and dylibs with the given identity or skip if empty (Darwin Only)") # If enabled, verify we are on a platform that supports oprofile. if( LLVM_USE_OPROFILE ) @@ -973,6 +982,20 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) PATTERN ".svn" EXCLUDE ) + if (LLVM_INSTALL_MODULEMAPS) + install(DIRECTORY include/llvm include/llvm-c + DESTINATION include + COMPONENT llvm-headers + FILES_MATCHING + PATTERN "module.modulemap" + ) + install(FILES include/llvm/module.install.modulemap + DESTINATION include/llvm + COMPONENT llvm-headers + RENAME "module.extern.modulemap" + ) + endif(LLVM_INSTALL_MODULEMAPS) + # Installing the headers needs to depend on generating any public # tablegen'd headers. add_custom_target(llvm-headers DEPENDS intrinsics_gen) diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT index 6a0dd0cbadc330bc7ed2cd36a045ab065f616cc9..a6e10fc0e27039fe3147abd5b3655b751e2df5be 100644 --- a/CODE_OWNERS.TXT +++ b/CODE_OWNERS.TXT @@ -71,7 +71,7 @@ D: Loop Strength Reduction, Register allocators N: Andrea Di Biagio E: andrea.dibiagio@sony.com E: andrea.dibiagio@gmail.com -D: llvm-mca +D: MCA, llvm-mca N: Duncan P. N. Exon Smith E: dexonsmith@apple.com @@ -222,3 +222,7 @@ D: C API, OCaml bindings N: Jake Ehrlich E: jakehehrlich@google.com D: llvm-objcopy (tools/llvm-objcopy) + +N: Martin Storsjö +E: martin@martin.st +D: MinGW diff --git a/CREDITS.TXT b/CREDITS.TXT index e279701f57d90bb182b24dfb48d88c4b64443843..0a8154c8842f0830901f1a54b78fb4604bc0f56a 100644 --- a/CREDITS.TXT +++ b/CREDITS.TXT @@ -510,3 +510,7 @@ D: PowerPC Backend Developer N: Zixuan Wu E: wuzish@cn.ibm.com D: PowerPC Backend Developer + +N: Kang Zhang +E: shkzhang@cn.ibm.com +D: PowerPC Backend Developer diff --git a/RELEASE_TESTERS.TXT b/RELEASE_TESTERS.TXT index 462941b35b0e48b088daffe04986a7a5a0c40340..2e5ac1b9078db259ee530a3442aa20a23617f6f1 100644 --- a/RELEASE_TESTERS.TXT +++ b/RELEASE_TESTERS.TXT @@ -13,8 +13,8 @@ O: Ubuntu N: Sylvestre Ledru E: sylvestre@debian.org -T: x86 -O: Debian +T: All supported archs Debian/Ubuntu +O: Debian/Ubuntu packages N: Nikola Smiljanic E: popizdeh@gmail.com @@ -41,7 +41,7 @@ E: hans@chromium.org T: x86 O: Windows -N: Diana Picus -E: diana.picus@linaro.org +N: Diana Picus, Yvan Roux +E: diana.picus@linaro.org, yvan.roux@linaro.org T: ARM, AArch64 O: Linux diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index 18977d9950ff84ed01288fe92633e55efb71cad4..900c35ee4f0c9bb01c39097e8ba4b013b0c73505 100644 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -7,6 +7,7 @@ include(CheckIncludeFile) include(CheckLibraryExists) include(CheckSymbolExists) include(CheckFunctionExists) +include(CheckStructHasMember) include(CheckCCompilerFlag) include(CheckCompilerVersion) @@ -128,7 +129,7 @@ if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*") endif() if(LLVM_ENABLE_TERMINFO) set(HAVE_TERMINFO 0) - foreach(library tinfo terminfo curses ncurses ncursesw) + foreach(library terminfo tinfo curses ncurses ncursesw) string(TOUPPER ${library} library_suffix) check_library_exists(${library} setupterm "" HAVE_TERMINFO_${library_suffix}) if(HAVE_TERMINFO_${library_suffix}) @@ -248,6 +249,11 @@ if( HAVE_DLFCN_H ) endif() endif() +CHECK_STRUCT_HAS_MEMBER("struct stat" st_mtimespec.tv_nsec + "sys/types.h;sys/stat.h" HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC) +CHECK_STRUCT_HAS_MEMBER("struct stat" st_mtim.tv_nsec + "sys/types.h;sys/stat.h" HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC) + check_symbol_exists(__GLIBC__ stdio.h LLVM_USING_GLIBC) if( LLVM_USING_GLIBC ) add_definitions( -D_GNU_SOURCE ) diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index 189971655583a40532272d608d807212198b14be..c5aa961a292b4a2b8ba83c986aa51c78451ec5b3 100644 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -73,7 +73,7 @@ function(add_llvm_symbol_exports target_name export_file) VERBATIM COMMENT "Creating export file for ${target_name}") set_property(TARGET ${target_name} APPEND_STRING PROPERTY - LINK_FLAGS " -Wl,-exported_symbols_list,${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}") + LINK_FLAGS " -Wl,-exported_symbols_list,\"${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}\"") elseif(${CMAKE_SYSTEM_NAME} MATCHES "AIX") set_property(TARGET ${target_name} APPEND_STRING PROPERTY LINK_FLAGS " -Wl,-bE:${export_file}") @@ -93,10 +93,10 @@ function(add_llvm_symbol_exports target_name export_file) COMMENT "Creating export file for ${target_name}") if (${LLVM_LINKER_IS_SOLARISLD}) set_property(TARGET ${target_name} APPEND_STRING PROPERTY - LINK_FLAGS " -Wl,-M,${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}") + LINK_FLAGS " -Wl,-M,\"${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}\"") else() set_property(TARGET ${target_name} APPEND_STRING PROPERTY - LINK_FLAGS " -Wl,--version-script,${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}") + LINK_FLAGS " -Wl,--version-script,\"${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}\"") endif() else() set(native_export_file "${target_name}.def") @@ -372,12 +372,14 @@ endfunction(set_windows_version_resource_properties) # May specify header files for IDE generators. # SONAME # Should set SONAME link flags and create symlinks +# NO_INSTALL_RPATH +# Suppress default RPATH settings in shared libraries. # PLUGIN_TOOL # The tool (i.e. cmake target) that this plugin will link against # ) function(llvm_add_library name) cmake_parse_arguments(ARG - "MODULE;SHARED;STATIC;OBJECT;DISABLE_LLVM_LINK_LLVM_DYLIB;SONAME" + "MODULE;SHARED;STATIC;OBJECT;DISABLE_LLVM_LINK_LLVM_DYLIB;SONAME;NO_INSTALL_RPATH" "OUTPUT_NAME;PLUGIN_TOOL" "ADDITIONAL_HEADERS;DEPENDS;LINK_COMPONENTS;LINK_LIBS;OBJLIBS" ${ARGN}) @@ -448,17 +450,19 @@ function(llvm_add_library name) if(ARG_MODULE) add_library(${name} MODULE ${ALL_FILES}) - llvm_setup_rpath(${name}) elseif(ARG_SHARED) add_windows_version_resource_file(ALL_FILES ${ALL_FILES}) add_library(${name} SHARED ${ALL_FILES}) - - llvm_setup_rpath(${name}) - else() add_library(${name} STATIC ${ALL_FILES}) endif() + if(NOT ARG_NO_INSTALL_RPATH) + if(ARG_MODULE OR ARG_SHARED) + llvm_setup_rpath(${name}) + endif() + endif() + setup_dependency_debugging(${name} ${LLVM_COMMON_DEPENDS}) if(DEFINED windows_resource_file) @@ -708,7 +712,12 @@ endmacro(add_llvm_loadable_module name) macro(add_llvm_executable name) - cmake_parse_arguments(ARG "DISABLE_LLVM_LINK_LLVM_DYLIB;IGNORE_EXTERNALIZE_DEBUGINFO;NO_INSTALL_RPATH" "" "DEPENDS" ${ARGN}) + cmake_parse_arguments(ARG + "DISABLE_LLVM_LINK_LLVM_DYLIB;IGNORE_EXTERNALIZE_DEBUGINFO;NO_INSTALL_RPATH" + "ENTITLEMENTS" + "DEPENDS" + ${ARGN}) + llvm_process_sources( ALL_FILES ${ARG_UNPARSED_ARGUMENTS} ) list(APPEND LLVM_COMMON_DEPENDS ${ARG_DEPENDS}) @@ -787,7 +796,7 @@ macro(add_llvm_executable name) target_link_libraries(${name} PRIVATE ${LLVM_PTHREAD_LIB}) endif() - llvm_codesign(${name}) + llvm_codesign(${name} ENTITLEMENTS ${ARG_ENTITLEMENTS}) endmacro(add_llvm_executable name) function(export_executable_symbols target) @@ -973,47 +982,50 @@ endfunction(canonicalize_tool_name) # Custom add_subdirectory wrapper # Takes in a project name (i.e. LLVM), the subdirectory name, and an optional # path if it differs from the name. -macro(add_llvm_subdirectory project type name) +function(add_llvm_subdirectory project type name) set(add_llvm_external_dir "${ARGN}") if("${add_llvm_external_dir}" STREQUAL "") set(add_llvm_external_dir ${name}) endif() canonicalize_tool_name(${name} nameUPPER) + set(canonical_full_name ${project}_${type}_${nameUPPER}) + get_property(already_processed GLOBAL PROPERTY ${canonical_full_name}_PROCESSED) + if(already_processed) + return() + endif() + set_property(GLOBAL PROPERTY ${canonical_full_name}_PROCESSED YES) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${add_llvm_external_dir}/CMakeLists.txt) # Treat it as in-tree subproject. - option(${project}_${type}_${nameUPPER}_BUILD + option(${canonical_full_name}_BUILD "Whether to build ${name} as part of ${project}" On) mark_as_advanced(${project}_${type}_${name}_BUILD) - if(${project}_${type}_${nameUPPER}_BUILD) + if(${canonical_full_name}_BUILD) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${add_llvm_external_dir} ${add_llvm_external_dir}) - # Don't process it in add_llvm_implicit_projects(). - set(${project}_${type}_${nameUPPER}_BUILD OFF) endif() else() set(LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR "${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}" CACHE PATH "Path to ${name} source directory") - set(${project}_${type}_${nameUPPER}_BUILD_DEFAULT ON) + set(${canonical_full_name}_BUILD_DEFAULT ON) if(NOT LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR OR NOT EXISTS ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}) - set(${project}_${type}_${nameUPPER}_BUILD_DEFAULT OFF) + set(${canonical_full_name}_BUILD_DEFAULT OFF) endif() if("${LLVM_EXTERNAL_${nameUPPER}_BUILD}" STREQUAL "OFF") - set(${project}_${type}_${nameUPPER}_BUILD_DEFAULT OFF) + set(${canonical_full_name}_BUILD_DEFAULT OFF) endif() - option(${project}_${type}_${nameUPPER}_BUILD + option(${canonical_full_name}_BUILD "Whether to build ${name} as part of LLVM" - ${${project}_${type}_${nameUPPER}_BUILD_DEFAULT}) - if (${project}_${type}_${nameUPPER}_BUILD) + ${${canonical_full_name}_BUILD_DEFAULT}) + if (${canonical_full_name}_BUILD) if(EXISTS ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}) add_subdirectory(${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR} ${add_llvm_external_dir}) elseif(NOT "${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}" STREQUAL "") message(WARNING "Nonexistent directory for ${name}: ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}") endif() - # FIXME: It'd be redundant. - set(${project}_${type}_${nameUPPER}_BUILD Off) endif() endif() -endmacro() +endfunction() # Add external project that may want to be built as part of llvm such as Clang, # lld, and Polly. This adds two options. One for the source directory of the @@ -1369,16 +1381,6 @@ function(add_lit_target target comment) message(STATUS "${target} does nothing.") endif() - # Add lit test dependencies. - set(llvm_utils_deps - FileCheck count not - ) - foreach(dep ${llvm_utils_deps}) - if (TARGET ${dep}) - add_dependencies(${target} ${dep}) - endif() - endforeach() - if (ARG_DEPENDS) add_dependencies(${target} ${ARG_DEPENDS}) endif() @@ -1602,6 +1604,13 @@ function(llvm_externalize_debuginfo name) endif() endif() + if(LLVM_EXTERNALIZE_DEBUGINFO_OUTPUT_DIR) + if(APPLE) + set(output_name "$.dSYM") + set(output_path "-o=${LLVM_EXTERNALIZE_DEBUGINFO_OUTPUT_DIR}/${output_name}") + endif() + endif() + if(APPLE) if(CMAKE_CXX_FLAGS MATCHES "-flto" OR CMAKE_CXX_FLAGS_${uppercase_CMAKE_BUILD_TYPE} MATCHES "-flto") @@ -1614,7 +1623,7 @@ function(llvm_externalize_debuginfo name) set(CMAKE_DSYMUTIL xcrun dsymutil) endif() add_custom_command(TARGET ${name} POST_BUILD - COMMAND ${CMAKE_DSYMUTIL} $ + COMMAND ${CMAKE_DSYMUTIL} ${output_path} $ ${strip_command} ) else() @@ -1626,7 +1635,10 @@ function(llvm_externalize_debuginfo name) endif() endfunction() +# Usage: llvm_codesign(name [ENTITLEMENTS file]) function(llvm_codesign name) + cmake_parse_arguments(ARG "" "ENTITLEMENTS" "" ${ARGN}) + if(NOT LLVM_CODESIGNING_IDENTITY) return() endif() @@ -1642,12 +1654,21 @@ function(llvm_codesign name) OUTPUT_VARIABLE CMAKE_CODESIGN_ALLOCATE ) endif() + if(DEFINED ARG_ENTITLEMENTS) + set(pass_entitlements --entitlements ${ARG_ENTITLEMENTS}) + endif() + if(CMAKE_GENERATOR STREQUAL "Xcode") + # Avoid double-signing error: Since output overwrites input, Xcode runs + # the post-build rule even if the actual build-step was skipped. + set(pass_force --force) + endif() + add_custom_command( TARGET ${name} POST_BUILD COMMAND ${CMAKE_COMMAND} -E env CODESIGN_ALLOCATE=${CMAKE_CODESIGN_ALLOCATE} ${CMAKE_CODESIGN} -s ${LLVM_CODESIGNING_IDENTITY} - $ + ${pass_entitlements} ${pass_force} $ ) endif() endfunction() diff --git a/cmake/modules/ChooseMSVCCRT.cmake b/cmake/modules/ChooseMSVCCRT.cmake index 0e6e1aa55254e51480ba489e54c545c7a6accc25..cdd7deec4c84ce1790efb5c80432b73ff327e91b 100644 --- a/cmake/modules/ChooseMSVCCRT.cmake +++ b/cmake/modules/ChooseMSVCCRT.cmake @@ -66,6 +66,15 @@ variables (LLVM_USE_CRT_DEBUG, etc) instead.") get_current_crt(LLVM_USE_CRT_${build} MSVC_CRT_REGEX CMAKE_CXX_FLAGS_${build}) + + # Make /MT the default in Release builds to make them faster + # and avoid the DLL function thunking. + if ((${build} STREQUAL "MINSIZEREL") OR + (${build} STREQUAL "RELEASE") OR + (${build} STREQUAL "RELWITHDEBINFO")) + set(LLVM_USE_CRT_${build} "MT") + endif() + set(LLVM_USE_CRT_${build} "${LLVM_USE_CRT_${build}}" CACHE STRING "Specify VC++ CRT to use for ${build_type} configurations." diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index b590f768244540ffc696a12f73944dadc275fec8..e8b0e27e000d2414a19ca2f481dfb23cae0e02b4 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -138,7 +138,8 @@ endif() # build might work on ELF but fail on MachO/COFF. if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" OR WIN32 OR CYGWIN OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR - ${CMAKE_SYSTEM_NAME} MATCHES "OpenBSD") AND + ${CMAKE_SYSTEM_NAME} MATCHES "OpenBSD" OR + ${CMAKE_SYSTEM_NAME} MATCHES "DragonFly") AND NOT LLVM_USE_SANITIZER) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,defs") endif() @@ -380,6 +381,14 @@ if( MSVC ) # "Enforce type conversion rules". append("/Zc:rvalueCast" CMAKE_CXX_FLAGS) + if (CMAKE_CXX_COMPILER_ID MATCHES "MSVC" AND NOT LLVM_ENABLE_INCREMENTAL_LINK) + foreach(CONFIG RELEASE RELWITHDEBINFO MINSIZEREL) + foreach(FLAG EXE MODULE SHARED STATIC) + string(REGEX REPLACE "[-/](INCREMENTAL:YES|INCREMENTAL:NO|INCREMENTAL)" "/INCREMENTAL:NO" CMAKE_${FLAG}_LINKER_FLAGS_${CONFIG} "${CMAKE_${FLAG}_LINKER_FLAGS_${CONFIG}}") + endforeach() + endforeach() + endif() + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT LLVM_ENABLE_LTO) # clang-cl and cl by default produce non-deterministic binaries because # link.exe /incremental requires a timestamp in the .obj file. clang-cl @@ -512,6 +521,10 @@ if (MSVC) # Update 1. Re-evaluate the usefulness of this diagnostic with Update 2. -wd4592 # Suppress ''var': symbol will be dynamically initialized (implementation limitation) -wd4319 # Suppress ''operator' : zero extending 'type' to 'type' of greater size' + # C4709 is disabled because of a bug with Visual Studio 2017 as of + # v15.8.8. Re-evaluate the usefulness of this diagnostic when the bug + # is fixed. + -wd4709 # Suppress comma operator within array index expression # Ideally, we'd like this warning to be enabled, but MSVC 2013 doesn't # support the 'aligned' attribute in the way that clang sources requires (for diff --git a/cmake/modules/LLVMExternalProjectUtils.cmake b/cmake/modules/LLVMExternalProjectUtils.cmake index f736c3947843fd54479c1cc501064e05b294df04..4d26a30f97becc8c58ce80d5b2374f10b4bbcc40 100644 --- a/cmake/modules/LLVMExternalProjectUtils.cmake +++ b/cmake/modules/LLVMExternalProjectUtils.cmake @@ -162,8 +162,35 @@ function(llvm_ExternalProject_Add name source_dir) -DCMAKE_OBJDUMP=${CMAKE_OBJDUMP} -DCMAKE_STRIP=${CMAKE_STRIP}) set(llvm_config_path ${LLVM_CONFIG_PATH}) + + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + string(REGEX MATCH "[0-9]+\\.[0-9]+(\\.[0-9]+)?" CLANG_VERSION + ${PACKAGE_VERSION}) + set(resource_dir "${LLVM_LIBRARY_DIR}/clang/${CLANG_VERSION}") + set(flag_types ASM C CXX MODULE_LINKER SHARED_LINKER EXE_LINKER) + foreach(type ${flag_types}) + set(${type}_flag -DCMAKE_${type}_FLAGS=-resource-dir=${resource_dir}) + endforeach() + string(REPLACE ";" "|" flag_string "${flag_types}") + foreach(arg ${ARG_CMAKE_ARGS}) + if(arg MATCHES "^-DCMAKE_(${flag_string})_FLAGS") + foreach(type ${flag_types}) + if(arg MATCHES "^-DCMAKE_${type}_FLAGS") + string(REGEX REPLACE "^-DCMAKE_${type}_FLAGS=(.*)$" "\\1" flag_value "${arg}") + set(${type}_flag "${${type}_flag} ${flag_value}") + endif() + endforeach() + else() + list(APPEND cmake_args ${arg}) + endif() + endforeach() + foreach(type ${flag_types}) + list(APPEND cmake_args ${${type}_flag}) + endforeach() + endif() else() set(llvm_config_path "$") + set(cmake_args ${ARG_CMAKE_ARGS}) endif() ExternalProject_Add(${name} @@ -187,7 +214,7 @@ function(llvm_ExternalProject_Add name source_dir) -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} -DCMAKE_EXPORT_COMPILE_COMMANDS=1 - ${ARG_CMAKE_ARGS} + ${cmake_args} ${PASSTHROUGH_VARIABLES} INSTALL_COMMAND "" STEP_TARGETS configure build diff --git a/cmake/modules/TableGen.cmake b/cmake/modules/TableGen.cmake index d1afcb42f9de7de87e421fb8b5265e2e25e39372..2d942435222ee472d75b4e535c147ae4510e255b 100644 --- a/cmake/modules/TableGen.cmake +++ b/cmake/modules/TableGen.cmake @@ -158,6 +158,12 @@ macro(add_tablegen target project) llvm_ExternalProject_BuildCmd(tblgen_build_cmd ${target} ${LLVM_NATIVE_BUILD} CONFIGURATION Release) + # Create an artificial dependency between tablegen projects, because they + # compile the same dependencies, thus using the same build folders. + # FIXME: A proper fix requires sequentially chaining tablegens. + if (NOT ${project} STREQUAL LLVM AND TARGET ${project}-tablegen-host) + add_dependencies(${project}-tablegen-host LLVM-tablegen-host) + endif() add_custom_command(OUTPUT ${${project}_TABLEGEN_EXE} COMMAND ${tblgen_build_cmd} DEPENDS CONFIGURE_LLVM_NATIVE ${target} diff --git a/docs/AMDGPU/AMDGPUAsmGFX7.rst b/docs/AMDGPU/AMDGPUAsmGFX7.rst new file mode 100644 index 0000000000000000000000000000000000000000..58c13b5556b436293f03db6b4c76a20e65ea5e4b --- /dev/null +++ b/docs/AMDGPU/AMDGPUAsmGFX7.rst @@ -0,0 +1,1411 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +============================ +Syntax of GFX7 Instructions +============================ + +.. contents:: + :local: + +Notation +======== + +Notation used in this document is explained :ref:`here`. + +Introduction +============ + +An overview of generic syntax and other features of AMDGPU instructions may be found :ref:`in this document`. + +Instructions +============ + + +DS +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + ds_add_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_add_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_add_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_and_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_append :ref:`vdst` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_f32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_f64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_f64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_condxchg32_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_consume :ref:`vdst` :ref:`ds_offset16` :ref:`gds` + ds_dec_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_dec_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_dec_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_dec_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_dec_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_dec_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_barrier :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_init :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_br :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_p :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_release_all :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_v :ref:`ds_offset16` :ref:`gds` + ds_inc_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_inc_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_inc_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_inc_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_inc_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_inc_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_f32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_f64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_i32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_i64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_f64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_i32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_i64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_f32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_f64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_i32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_i64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_f32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_f64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_i32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_i64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_f64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_i32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_i64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_f32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_f64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_i32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_i64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_mskor_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_mskor_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_mskor_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_mskor_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_nop + ds_or_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_or_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_ordered_count :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read2_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read2_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read2st64_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read2st64_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read_b128 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_b32 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_b64 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_b96 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_i16 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_i8 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_u16 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_u8 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_rsub_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_rsub_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_rsub_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_rsub_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_rsub_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_rsub_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_sub_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_sub_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_swizzle_b32 :ref:`vdst`, :ref:`vaddr` :ref:`sw_offset16` :ref:`gds` + ds_wrap_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_write2_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write2_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write2st64_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write2st64_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write_b128 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b16 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b8 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b96 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_write_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_wrxchg2_rtn_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg2_rtn_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg2st64_rtn_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg2st64_rtn_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_wrxchg_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_xor_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + +EXP +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **SRC3** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + exp :ref:`tgt`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vsrc2`, :ref:`vsrc3` :ref:`done` :ref:`compr` :ref:`vm` + +FLAT +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + flat_atomic_add :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_add_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_and :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_and_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_cmpswap :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`::ref:`b32x2` :ref:`glc` :ref:`slc` + flat_atomic_cmpswap_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`::ref:`b64x2` :ref:`glc` :ref:`slc` + flat_atomic_dec :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`glc` :ref:`slc` + flat_atomic_dec_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`glc` :ref:`slc` + flat_atomic_fcmpswap :ref:`vdst`::ref:`opt`::ref:`f32`, :ref:`vaddr`, :ref:`vdata`::ref:`f32x2` :ref:`glc` :ref:`slc` + flat_atomic_fcmpswap_x2 :ref:`vdst`::ref:`opt`::ref:`f64`, :ref:`vaddr`, :ref:`vdata`::ref:`f64x2` :ref:`glc` :ref:`slc` + flat_atomic_fmax :ref:`vdst`::ref:`opt`::ref:`f32`, :ref:`vaddr`, :ref:`vdata`::ref:`f32` :ref:`glc` :ref:`slc` + flat_atomic_fmax_x2 :ref:`vdst`::ref:`opt`::ref:`f64`, :ref:`vaddr`, :ref:`vdata`::ref:`f64` :ref:`glc` :ref:`slc` + flat_atomic_fmin :ref:`vdst`::ref:`opt`::ref:`f32`, :ref:`vaddr`, :ref:`vdata`::ref:`f32` :ref:`glc` :ref:`slc` + flat_atomic_fmin_x2 :ref:`vdst`::ref:`opt`::ref:`f64`, :ref:`vaddr`, :ref:`vdata`::ref:`f64` :ref:`glc` :ref:`slc` + flat_atomic_inc :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`glc` :ref:`slc` + flat_atomic_inc_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`glc` :ref:`slc` + flat_atomic_or :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_or_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_smax :ref:`vdst`::ref:`opt`::ref:`s32`, :ref:`vaddr`, :ref:`vdata`::ref:`s32` :ref:`glc` :ref:`slc` + flat_atomic_smax_x2 :ref:`vdst`::ref:`opt`::ref:`s64`, :ref:`vaddr`, :ref:`vdata`::ref:`s64` :ref:`glc` :ref:`slc` + flat_atomic_smin :ref:`vdst`::ref:`opt`::ref:`s32`, :ref:`vaddr`, :ref:`vdata`::ref:`s32` :ref:`glc` :ref:`slc` + flat_atomic_smin_x2 :ref:`vdst`::ref:`opt`::ref:`s64`, :ref:`vaddr`, :ref:`vdata`::ref:`s64` :ref:`glc` :ref:`slc` + flat_atomic_sub :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_sub_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_swap :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_swap_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_umax :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`glc` :ref:`slc` + flat_atomic_umax_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`glc` :ref:`slc` + flat_atomic_umin :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`glc` :ref:`slc` + flat_atomic_umin_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`glc` :ref:`slc` + flat_atomic_xor :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_xor_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_load_dword :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_dwordx2 :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_dwordx3 :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_dwordx4 :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_sbyte :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_sshort :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_ubyte :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_ushort :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_store_byte :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_store_dword :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_store_dwordx2 :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_store_dwordx3 :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_store_dwordx4 :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_store_short :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + +MIMG +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + image_atomic_add :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_and :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_cmpswap :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_dec :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_inc :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_or :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_smax :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_smin :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_sub :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_swap :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_umax :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_umin :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_xor :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_c_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_get_lod :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_get_resinfo :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_load :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_load_mip :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_load_mip_pck :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_load_mip_pck_sgn :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_load_pck :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_load_pck_sgn :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_cd :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_cd_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_cd_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_cd_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_d :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_d_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_d_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_d_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_c_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_cd :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_cd_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_cd_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_cd_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_d :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_d_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_d_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_d_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_store :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_store_mip :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_store_mip_pck :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_store_pck :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + +MUBUF +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **SRC3** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + buffer_atomic_add :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_add_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_and :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_and_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_cmpswap :ref:`vdata`::ref:`dst`::ref:`b32x2`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_cmpswap_x2 :ref:`vdata`::ref:`dst`::ref:`b64x2`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_dec :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_dec_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_inc :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_inc_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_or :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_or_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smax :ref:`vdata`::ref:`dst`::ref:`s32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smax_x2 :ref:`vdata`::ref:`dst`::ref:`s64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smin :ref:`vdata`::ref:`dst`::ref:`s32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smin_x2 :ref:`vdata`::ref:`dst`::ref:`s64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_sub :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_sub_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_swap :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_swap_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umax :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umax_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umin :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umin_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_xor :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_xor_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_dword :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_dwordx2 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_dwordx3 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_dwordx4 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_x :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_format_xy :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_xyz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_xyzw :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_sbyte :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_sshort :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_ubyte :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_ushort :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_store_byte :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dword :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dwordx2 :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dwordx3 :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dwordx4 :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_xy :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_short :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_wbinvl1 + buffer_wbinvl1_vol + +SMRD +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_buffer_load_dword :ref:`sdst`, :ref:`sbase`, :ref:`soffset` + s_buffer_load_dwordx16 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` + s_buffer_load_dwordx2 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` + s_buffer_load_dwordx4 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` + s_buffer_load_dwordx8 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` + s_dcache_inv + s_dcache_inv_vol + s_load_dword :ref:`sdst`, :ref:`sbase`, :ref:`soffset` + s_load_dwordx16 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` + s_load_dwordx2 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` + s_load_dwordx4 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` + s_load_dwordx8 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` + s_memtime :ref:`sdst` + +SOP1 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_abs_i32 :ref:`sdst`, :ref:`ssrc` + s_and_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_andn2_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_bcnt0_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_bcnt0_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_bcnt1_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_bcnt1_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_bitset0_b32 :ref:`sdst`, :ref:`ssrc` + s_bitset0_b64 :ref:`sdst`, :ref:`ssrc`::ref:`b32` + s_bitset1_b32 :ref:`sdst`, :ref:`ssrc` + s_bitset1_b64 :ref:`sdst`, :ref:`ssrc`::ref:`b32` + s_brev_b32 :ref:`sdst`, :ref:`ssrc` + s_brev_b64 :ref:`sdst`, :ref:`ssrc` + s_cbranch_join :ref:`ssrc` + s_cmov_b32 :ref:`sdst`, :ref:`ssrc` + s_cmov_b64 :ref:`sdst`, :ref:`ssrc` + s_ff0_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_ff0_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_ff1_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_ff1_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32_i64 :ref:`sdst`, :ref:`ssrc` + s_getpc_b64 :ref:`sdst` + s_mov_b32 :ref:`sdst`, :ref:`ssrc` + s_mov_b64 :ref:`sdst`, :ref:`ssrc` + s_mov_fed_b32 :ref:`sdst`, :ref:`ssrc` + s_movreld_b32 :ref:`sdst`, :ref:`ssrc` + s_movreld_b64 :ref:`sdst`, :ref:`ssrc` + s_movrels_b32 :ref:`sdst`, :ref:`ssrc` + s_movrels_b64 :ref:`sdst`, :ref:`ssrc` + s_nand_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_nor_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_not_b32 :ref:`sdst`, :ref:`ssrc` + s_not_b64 :ref:`sdst`, :ref:`ssrc` + s_or_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_orn2_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_quadmask_b32 :ref:`sdst`, :ref:`ssrc` + s_quadmask_b64 :ref:`sdst`, :ref:`ssrc` + s_rfe_b64 :ref:`ssrc` + s_setpc_b64 :ref:`ssrc` + s_sext_i32_i16 :ref:`sdst`, :ref:`ssrc` + s_sext_i32_i8 :ref:`sdst`, :ref:`ssrc` + s_swappc_b64 :ref:`sdst`, :ref:`ssrc` + s_wqm_b32 :ref:`sdst`, :ref:`ssrc` + s_wqm_b64 :ref:`sdst`, :ref:`ssrc` + s_xnor_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_xor_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + +SOP2 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_absdiff_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_addc_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_and_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_and_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_andn2_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_andn2_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_ashr_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_ashr_i64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfe_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfe_i64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfe_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfe_u64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfm_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfm_b64 :ref:`sdst`, :ref:`ssrc0`::ref:`b32`, :ref:`ssrc1`::ref:`b32` + s_cbranch_g_fork :ref:`ssrc0`, :ref:`ssrc1` + s_cselect_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_cselect_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_lshl_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_lshr_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_lshr_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_max_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_max_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_min_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_min_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_mul_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nand_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nand_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_or_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_or_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_orn2_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_orn2_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_sub_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_sub_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_subb_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xnor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xnor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + +SOPC +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_bitcmp0_b32 :ref:`ssrc0`, :ref:`ssrc1` + s_bitcmp0_b64 :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bitcmp1_b32 :ref:`ssrc0`, :ref:`ssrc1` + s_bitcmp1_b64 :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_cmp_eq_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_eq_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ge_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ge_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_gt_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_gt_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_le_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_le_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lt_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lt_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_setvskip :ref:`ssrc0`, :ref:`ssrc1` + +SOPK +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_addk_i32 :ref:`sdst`, :ref:`imm16` + s_cbranch_i_fork :ref:`ssrc`, :ref:`label` + s_cmovk_i32 :ref:`sdst`, :ref:`imm16` + s_cmpk_eq_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_eq_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_ge_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_ge_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_gt_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_gt_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_le_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_le_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lg_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lg_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lt_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lt_u32 :ref:`ssrc`, :ref:`imm16` + s_getreg_b32 :ref:`sdst`, :ref:`hwreg` + s_movk_i32 :ref:`sdst`, :ref:`imm16` + s_mulk_i32 :ref:`sdst`, :ref:`imm16` + s_setreg_b32 :ref:`hwreg`, :ref:`ssrc` + s_setreg_imm32_b32 :ref:`hwreg`, :ref:`imm32` + +SOPP +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **SRC** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_barrier + s_branch :ref:`label` + s_cbranch_cdbgsys :ref:`label` + s_cbranch_cdbgsys_and_user :ref:`label` + s_cbranch_cdbgsys_or_user :ref:`label` + s_cbranch_cdbguser :ref:`label` + s_cbranch_execnz :ref:`label` + s_cbranch_execz :ref:`label` + s_cbranch_scc0 :ref:`label` + s_cbranch_scc1 :ref:`label` + s_cbranch_vccnz :ref:`label` + s_cbranch_vccz :ref:`label` + s_decperflevel :ref:`imm16` + s_endpgm + s_icache_inv + s_incperflevel :ref:`imm16` + s_nop :ref:`imm16` + s_sendmsg :ref:`msg` + s_sendmsghalt :ref:`msg` + s_sethalt :ref:`imm16` + s_setkill :ref:`imm16` + s_setprio :ref:`imm16` + s_sleep :ref:`imm16` + s_trap :ref:`imm16` + s_ttracedata + s_waitcnt :ref:`waitcnt` + +VINTRP +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_interp_mov_f32 :ref:`vdst`, :ref:`param`::ref:`b32`, :ref:`attr`::ref:`b32` + v_interp_p1_f32 :ref:`vdst`, :ref:`vsrc`, :ref:`attr`::ref:`b32` + v_interp_p2_f32 :ref:`vdst`, :ref:`vsrc`, :ref:`attr`::ref:`b32` + +VOP1 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_bfrev_b32 :ref:`vdst`, :ref:`src` + v_ceil_f32 :ref:`vdst`, :ref:`src` + v_ceil_f64 :ref:`vdst`, :ref:`src` + v_clrexcp + v_cos_f32 :ref:`vdst`, :ref:`src` + v_cvt_f16_f32 :ref:`vdst`, :ref:`src` + v_cvt_f32_f16 :ref:`vdst`, :ref:`src` + v_cvt_f32_f64 :ref:`vdst`, :ref:`src` + v_cvt_f32_i32 :ref:`vdst`, :ref:`src` + v_cvt_f32_u32 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte0 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte1 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte2 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte3 :ref:`vdst`, :ref:`src` + v_cvt_f64_f32 :ref:`vdst`, :ref:`src` + v_cvt_f64_i32 :ref:`vdst`, :ref:`src` + v_cvt_f64_u32 :ref:`vdst`, :ref:`src` + v_cvt_flr_i32_f32 :ref:`vdst`, :ref:`src` + v_cvt_i32_f32 :ref:`vdst`, :ref:`src` + v_cvt_i32_f64 :ref:`vdst`, :ref:`src` + v_cvt_off_f32_i4 :ref:`vdst`, :ref:`src` + v_cvt_rpi_i32_f32 :ref:`vdst`, :ref:`src` + v_cvt_u32_f32 :ref:`vdst`, :ref:`src` + v_cvt_u32_f64 :ref:`vdst`, :ref:`src` + v_exp_f32 :ref:`vdst`, :ref:`src` + v_exp_legacy_f32 :ref:`vdst`, :ref:`src` + v_ffbh_i32 :ref:`vdst`, :ref:`src` + v_ffbh_u32 :ref:`vdst`, :ref:`src` + v_ffbl_b32 :ref:`vdst`, :ref:`src` + v_floor_f32 :ref:`vdst`, :ref:`src` + v_floor_f64 :ref:`vdst`, :ref:`src` + v_fract_f32 :ref:`vdst`, :ref:`src` + v_fract_f64 :ref:`vdst`, :ref:`src` + v_frexp_exp_i32_f32 :ref:`vdst`, :ref:`src` + v_frexp_exp_i32_f64 :ref:`vdst`, :ref:`src` + v_frexp_mant_f32 :ref:`vdst`, :ref:`src` + v_frexp_mant_f64 :ref:`vdst`, :ref:`src` + v_log_clamp_f32 :ref:`vdst`, :ref:`src` + v_log_f32 :ref:`vdst`, :ref:`src` + v_log_legacy_f32 :ref:`vdst`, :ref:`src` + v_mov_b32 :ref:`vdst`, :ref:`src` + v_mov_fed_b32 :ref:`vdst`, :ref:`src` + v_movreld_b32 :ref:`vdst`, :ref:`src` + v_movrels_b32 :ref:`vdst`, :ref:`vsrc` + v_movrelsd_b32 :ref:`vdst`, :ref:`vsrc` + v_nop + v_not_b32 :ref:`vdst`, :ref:`src` + v_rcp_clamp_f32 :ref:`vdst`, :ref:`src` + v_rcp_clamp_f64 :ref:`vdst`, :ref:`src` + v_rcp_f32 :ref:`vdst`, :ref:`src` + v_rcp_f64 :ref:`vdst`, :ref:`src` + v_rcp_iflag_f32 :ref:`vdst`, :ref:`src` + v_rcp_legacy_f32 :ref:`vdst`, :ref:`src` + v_readfirstlane_b32 :ref:`sdst`, :ref:`vsrc` + v_rndne_f32 :ref:`vdst`, :ref:`src` + v_rndne_f64 :ref:`vdst`, :ref:`src` + v_rsq_clamp_f32 :ref:`vdst`, :ref:`src` + v_rsq_clamp_f64 :ref:`vdst`, :ref:`src` + v_rsq_f32 :ref:`vdst`, :ref:`src` + v_rsq_f64 :ref:`vdst`, :ref:`src` + v_rsq_legacy_f32 :ref:`vdst`, :ref:`src` + v_sin_f32 :ref:`vdst`, :ref:`src` + v_sqrt_f32 :ref:`vdst`, :ref:`src` + v_sqrt_f64 :ref:`vdst`, :ref:`src` + v_trunc_f32 :ref:`vdst`, :ref:`src` + v_trunc_f64 :ref:`vdst`, :ref:`src` + +VOP2 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST0** **DST1** **SRC0** **SRC1** **SRC2** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_add_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_add_i32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_addc_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_and_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_ashr_i32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`::ref:`u32` + v_ashrrev_i32 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`vsrc1` + v_bcnt_u32_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_bfm_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_cndmask_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_cvt_pk_i16_i32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_cvt_pk_u16_u32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_cvt_pkaccum_u8_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`::ref:`u32` + v_cvt_pknorm_i16_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_cvt_pknorm_u16_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_cvt_pkrtz_f16_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_ldexp_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`::ref:`i32` + v_lshl_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`::ref:`u32` + v_lshlrev_b32 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`vsrc1` + v_lshr_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`::ref:`u32` + v_lshrrev_b32 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`vsrc1` + v_mac_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mac_legacy_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_madak_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`, :ref:`imm32` + v_madmk_f32 :ref:`vdst`, :ref:`src0`, :ref:`imm32`, :ref:`vsrc2` + v_max_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_i32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_legacy_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_u32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mbcnt_hi_u32_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mbcnt_lo_u32_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_i32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_legacy_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_u32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_hi_i32_i24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_hi_u32_u24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_i32_i24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_legacy_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_u32_u24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_or_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_readlane_b32 :ref:`sdst`, :ref:`vsrc0`, :ref:`ssrc1` + v_sub_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_sub_i32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_subb_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_subbrev_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_subrev_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_subrev_i32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_writelane_b32 :ref:`vdst`, :ref:`ssrc0`, :ref:`ssrc1` + v_xor_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + +VOP3 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST0** **DST1** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_add_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_add_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_add_i32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_addc_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_alignbit_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_alignbyte_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_and_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_ashr_i32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1`::ref:`u32` + v_ashr_i64 :ref:`vdst`, :ref:`src0`, :ref:`src1`::ref:`u32` + v_ashrrev_i32_e64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_bcnt_u32_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_bfe_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`::ref:`u32`, :ref:`src2`::ref:`u32` + v_bfe_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_bfi_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_bfm_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_bfrev_b32_e64 :ref:`vdst`, :ref:`src` + v_ceil_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_ceil_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_clrexcp_e64 + v_cmp_class_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmp_class_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmp_eq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_eq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_eq_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_f_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_f_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_ge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_ge_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_gt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_gt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_le_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_le_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_lg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_lt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_lt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_lt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_neq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_neq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_nge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_nge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_ngt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_ngt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_nle_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_nle_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_nlg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_nlg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_nlt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_nlt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_o_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_o_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_t_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_tru_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_tru_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_u_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmp_u_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_eq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_eq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_f_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_f_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_ge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_ge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_gt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_gt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_le_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_le_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_lg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_lg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_lt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_lt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_neq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_neq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_nge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_nge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_ngt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_ngt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_nle_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_nle_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_nlg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_nlg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_nlt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_nlt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_o_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_o_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_tru_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_tru_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_u_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmps_u_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_eq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_eq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_f_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_f_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_ge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_ge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_gt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_gt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_le_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_le_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_lg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_lg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_lt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_lt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_neq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_neq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_nge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_nge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_ngt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_ngt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_nle_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_nle_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_nlg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_nlg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_nlt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_nlt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_o_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_o_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_tru_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_tru_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_u_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpsx_u_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_class_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmpx_class_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmpx_eq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_eq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_eq_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_f_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_f_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_ge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_ge_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_gt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_gt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_le_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_le_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_lg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_lt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_lt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_lt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_neq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_neq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_nge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_nge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_ngt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_ngt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_nle_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_nle_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_nlg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_nlg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_nlt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_nlt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_o_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_o_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_t_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_tru_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_tru_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_u_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cmpx_u_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cndmask_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_cos_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubeid_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubema_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubesc_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubetc_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f16_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_f32_f16_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f32_i32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_u32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_ubyte0_e64 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte1_e64 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte2_e64 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte3_e64 :ref:`vdst`, :ref:`src` + v_cvt_f64_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f64_i32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f64_u32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_flr_i32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_i32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_i32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_off_f32_i4_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_pk_i16_i32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_cvt_pk_u16_u32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_cvt_pk_u8_f32 :ref:`vdst`, :ref:`src0`, :ref:`src1`::ref:`u32`, :ref:`src2`::ref:`u32` + v_cvt_pkaccum_u8_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`u32` + v_cvt_pknorm_i16_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cvt_pknorm_u16_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cvt_pkrtz_f16_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cvt_rpi_i32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_u32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_u32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_div_fixup_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_fixup_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_fmas_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_fmas_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_scale_f32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_div_scale_f64 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_exp_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_exp_legacy_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_ffbh_i32_e64 :ref:`vdst`, :ref:`src` + v_ffbh_u32_e64 :ref:`vdst`, :ref:`src` + v_ffbl_b32_e64 :ref:`vdst`, :ref:`src` + v_floor_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_floor_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_fma_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_fma_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_fract_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_fract_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_frexp_exp_i32_f32_e64 :ref:`vdst`, :ref:`src` + v_frexp_exp_i32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_frexp_mant_f32_e64 :ref:`vdst`, :ref:`src` + v_frexp_mant_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_ldexp_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`i32` :ref:`clamp` :ref:`omod` + v_ldexp_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`i32` :ref:`clamp` :ref:`omod` + v_lerp_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`b32`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b32` + v_log_clamp_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_log_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_log_legacy_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_lshl_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1`::ref:`u32` + v_lshl_b64 :ref:`vdst`, :ref:`src0`, :ref:`src1`::ref:`u32` + v_lshlrev_b32_e64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_lshr_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1`::ref:`u32` + v_lshr_b64 :ref:`vdst`, :ref:`src0`, :ref:`src1`::ref:`u32` + v_lshrrev_b32_e64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_mac_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mac_legacy_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mad_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_mad_i32_i24 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`i32` + v_mad_i64_i32 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`i64` + v_mad_legacy_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_mad_u32_u24 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`u32` + v_mad_u64_u32 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`u64` + v_max3_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_max3_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_max3_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_max_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_max_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_max_i32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_max_legacy_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_max_u32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mbcnt_hi_u32_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mbcnt_lo_u32_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_med3_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_med3_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_med3_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_min3_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_min3_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_min3_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_min_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_min_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_min_i32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_min_legacy_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_min_u32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mov_b32_e64 :ref:`vdst`, :ref:`src` + v_mov_fed_b32_e64 :ref:`vdst`, :ref:`src` + v_movreld_b32_e64 :ref:`vdst`, :ref:`src` + v_movrels_b32_e64 :ref:`vdst`, :ref:`vsrc` + v_movrelsd_b32_e64 :ref:`vdst`, :ref:`vsrc` + v_mqsad_pk_u16_u8 :ref:`vdst`::ref:`b64`, :ref:`src0`::ref:`b64`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b64` + v_mqsad_u32_u8 :ref:`vdst`::ref:`b128`, :ref:`src0`::ref:`b64`, :ref:`src1`::ref:`b32`, :ref:`vsrc2`::ref:`b128` + v_msad_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`b32`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b32` + v_mul_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mul_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mul_hi_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_hi_i32_i24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_hi_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_hi_u32_u24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_i32_i24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_legacy_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mul_lo_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_lo_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_u32_u24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mullit_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_nop_e64 + v_not_b32_e64 :ref:`vdst`, :ref:`src` + v_or_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_qsad_pk_u16_u8 :ref:`vdst`::ref:`b64`, :ref:`src0`::ref:`b64`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b64` + v_rcp_clamp_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rcp_clamp_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rcp_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rcp_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rcp_iflag_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rcp_legacy_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rndne_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rndne_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rsq_clamp_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rsq_clamp_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rsq_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rsq_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rsq_legacy_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sad_hi_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`u8x4`, :ref:`src1`::ref:`u8x4`, :ref:`src2`::ref:`u32` + v_sad_u16 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`u16x2`, :ref:`src1`::ref:`u16x2`, :ref:`src2`::ref:`u32` + v_sad_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_sad_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`u8x4`, :ref:`src1`::ref:`u8x4`, :ref:`src2`::ref:`u32` + v_sin_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sqrt_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sqrt_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sub_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_sub_i32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_subb_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_subbrev_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_subrev_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_subrev_i32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_trig_preop_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`u32` :ref:`clamp` :ref:`omod` + v_trunc_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_trunc_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_xor_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + +VOPC +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_cmp_class_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmp_class_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmp_eq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_neq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_neq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ngt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ngt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nle_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nle_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_o_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_o_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_tru_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_tru_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_u_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_u_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_eq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_eq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_f_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_f_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_ge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_ge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_gt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_gt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_le_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_le_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_lg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_lg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_lt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_lt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_neq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_neq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_nge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_nge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_ngt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_ngt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_nle_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_nle_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_nlg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_nlg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_nlt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_nlt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_o_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_o_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_tru_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_tru_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_u_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmps_u_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_eq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_eq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_f_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_f_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_ge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_ge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_gt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_gt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_le_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_le_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_lg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_lg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_lt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_lt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_neq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_neq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_nge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_nge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_ngt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_ngt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_nle_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_nle_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_nlg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_nlg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_nlt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_nlt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_o_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_o_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_tru_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_tru_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_u_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpsx_u_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_class_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmpx_class_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmpx_eq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_neq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_neq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ngt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ngt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nle_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nle_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_o_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_o_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_tru_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_tru_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_u_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_u_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + +.. |---| unicode:: U+02014 .. em dash + + +.. toctree:: + :hidden: + + gfx7_attr + gfx7_bimm16 + gfx7_bimm32 + gfx7_fimm32 + gfx7_hwreg + gfx7_label + gfx7_msg + gfx7_param + gfx7_simm16 + gfx7_tgt + gfx7_uimm16 + gfx7_waitcnt + gfx7_addr_buf + gfx7_addr_ds + gfx7_addr_flat + gfx7_addr_mimg + gfx7_base_smem_addr + gfx7_base_smem_buf + gfx7_data_buf_atomic128 + gfx7_data_buf_atomic32 + gfx7_data_buf_atomic64 + gfx7_data_mimg_atomic_cmp + gfx7_data_mimg_atomic_reg + gfx7_data_mimg_store + gfx7_dst_buf_128 + gfx7_dst_buf_64 + gfx7_dst_buf_96 + gfx7_dst_buf_lds + gfx7_dst_flat_atomic32 + gfx7_dst_flat_atomic64 + gfx7_dst_mimg_gather4 + gfx7_dst_mimg_regular + gfx7_offset_buf + gfx7_offset_smem + gfx7_rsrc_buf + gfx7_rsrc_mimg + gfx7_samp_mimg + gfx7_sdst128_0 + gfx7_sdst256_0 + gfx7_sdst32_0 + gfx7_sdst32_1 + gfx7_sdst32_2 + gfx7_sdst512_0 + gfx7_sdst64_0 + gfx7_sdst64_1 + gfx7_src32_0 + gfx7_src32_1 + gfx7_src32_2 + gfx7_src32_3 + gfx7_src64_0 + gfx7_src64_1 + gfx7_src64_2 + gfx7_src_exp + gfx7_ssrc32_0 + gfx7_ssrc32_1 + gfx7_ssrc32_2 + gfx7_ssrc32_3 + gfx7_ssrc32_4 + gfx7_ssrc64_0 + gfx7_ssrc64_1 + gfx7_ssrc64_2 + gfx7_ssrc64_3 + gfx7_vcc_64 + gfx7_vdata128_0 + gfx7_vdata32_0 + gfx7_vdata64_0 + gfx7_vdata96_0 + gfx7_vdst128_0 + gfx7_vdst32_0 + gfx7_vdst64_0 + gfx7_vdst96_0 + gfx7_vsrc128_0 + gfx7_vsrc32_0 + gfx7_vsrc64_0 + gfx7_mod + gfx7_opt + gfx7_ret + gfx7_type_dev diff --git a/docs/AMDGPU/AMDGPUAsmGFX8.rst b/docs/AMDGPU/AMDGPUAsmGFX8.rst new file mode 100644 index 0000000000000000000000000000000000000000..9dc78e10089125b7263a21ac0fee4660fac3bdf4 --- /dev/null +++ b/docs/AMDGPU/AMDGPUAsmGFX8.rst @@ -0,0 +1,1846 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +============================ +Syntax of GFX8 Instructions +============================ + +.. contents:: + :local: + +Notation +======== + +Notation used in this document is explained :ref:`here`. + +Introduction +============ + +An overview of generic syntax and other features of AMDGPU instructions may be found :ref:`in this document`. + +Instructions +============ + + +DS +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + ds_add_f32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_rtn_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_src2_f32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_add_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_add_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_add_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_and_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_append :ref:`vdst` :ref:`ds_offset16` :ref:`gds` + ds_bpermute_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` + ds_cmpst_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_f32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_f64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_f64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_condxchg32_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_consume :ref:`vdst` :ref:`ds_offset16` :ref:`gds` + ds_dec_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_dec_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_dec_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_dec_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_dec_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_dec_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_barrier :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_init :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_br :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_p :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_release_all :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_v :ref:`ds_offset16` :ref:`gds` + ds_inc_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_inc_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_inc_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_inc_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_inc_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_inc_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_f32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_f64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_i32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_i64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_f64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_i32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_i64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_f32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_f64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_i32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_i64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_f32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_f64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_i32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_i64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_f64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_i32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_i64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_f32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_f64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_i32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_i64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_mskor_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_mskor_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_mskor_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_mskor_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_nop + ds_or_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_or_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_ordered_count :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_permute_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` + ds_read2_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read2_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read2st64_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read2st64_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read_b128 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_b32 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_b64 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_b96 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_i16 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_i8 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_u16 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_u8 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_rsub_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_rsub_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_rsub_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_rsub_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_rsub_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_rsub_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_sub_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_sub_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_swizzle_b32 :ref:`vdst`, :ref:`vaddr` :ref:`sw_offset16` :ref:`gds` + ds_wrap_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_write2_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write2_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write2st64_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write2st64_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write_b128 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b16 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b8 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b96 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_write_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_wrxchg2_rtn_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg2_rtn_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg2st64_rtn_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg2st64_rtn_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_wrxchg_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_xor_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + +EXP +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **SRC3** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + exp :ref:`tgt`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vsrc2`, :ref:`vsrc3` :ref:`done` :ref:`compr` :ref:`vm` + +FLAT +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + flat_atomic_add :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_add_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_and :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_and_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_cmpswap :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`::ref:`b32x2` :ref:`glc` :ref:`slc` + flat_atomic_cmpswap_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`::ref:`b64x2` :ref:`glc` :ref:`slc` + flat_atomic_dec :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`glc` :ref:`slc` + flat_atomic_dec_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`glc` :ref:`slc` + flat_atomic_inc :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`glc` :ref:`slc` + flat_atomic_inc_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`glc` :ref:`slc` + flat_atomic_or :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_or_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_smax :ref:`vdst`::ref:`opt`::ref:`s32`, :ref:`vaddr`, :ref:`vdata`::ref:`s32` :ref:`glc` :ref:`slc` + flat_atomic_smax_x2 :ref:`vdst`::ref:`opt`::ref:`s64`, :ref:`vaddr`, :ref:`vdata`::ref:`s64` :ref:`glc` :ref:`slc` + flat_atomic_smin :ref:`vdst`::ref:`opt`::ref:`s32`, :ref:`vaddr`, :ref:`vdata`::ref:`s32` :ref:`glc` :ref:`slc` + flat_atomic_smin_x2 :ref:`vdst`::ref:`opt`::ref:`s64`, :ref:`vaddr`, :ref:`vdata`::ref:`s64` :ref:`glc` :ref:`slc` + flat_atomic_sub :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_sub_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_swap :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_swap_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_umax :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`glc` :ref:`slc` + flat_atomic_umax_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`glc` :ref:`slc` + flat_atomic_umin :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`glc` :ref:`slc` + flat_atomic_umin_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`glc` :ref:`slc` + flat_atomic_xor :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_atomic_xor_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_load_dword :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_dwordx2 :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_dwordx3 :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_dwordx4 :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_sbyte :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_sshort :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_ubyte :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_load_ushort :ref:`vdst`, :ref:`vaddr` :ref:`glc` :ref:`slc` + flat_store_byte :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_store_dword :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_store_dwordx2 :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_store_dwordx3 :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_store_dwordx4 :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + flat_store_short :ref:`vaddr`, :ref:`vdata` :ref:`glc` :ref:`slc` + +MIMG +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + image_atomic_add :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_and :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_cmpswap :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_dec :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_inc :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_or :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_smax :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_smin :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_sub :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_swap :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_umax :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_umin :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_atomic_xor :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_gather4 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_get_lod :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_get_resinfo :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_load :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_load_mip :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_load_mip_pck :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_load_mip_pck_sgn :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_load_pck :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_load_pck_sgn :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_sample :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cd :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cd_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cd_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cd_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_d :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_d_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_d_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_d_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cd :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cd_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cd_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cd_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_d :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_d_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_d_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_d_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_store :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_store_mip :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_store_mip_pck :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + image_store_pck :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` + +MUBUF +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **SRC3** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + buffer_atomic_add :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_add_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_and :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_and_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_cmpswap :ref:`vdata`::ref:`dst`::ref:`b32x2`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_cmpswap_x2 :ref:`vdata`::ref:`dst`::ref:`b64x2`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_dec :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_dec_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_inc :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_inc_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_or :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_or_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smax :ref:`vdata`::ref:`dst`::ref:`s32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smax_x2 :ref:`vdata`::ref:`dst`::ref:`s64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smin :ref:`vdata`::ref:`dst`::ref:`s32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smin_x2 :ref:`vdata`::ref:`dst`::ref:`s64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_sub :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_sub_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_swap :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_swap_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umax :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umax_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umin :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umin_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_xor :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_xor_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_dword :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_dwordx2 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_dwordx3 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_dwordx4 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_d16_x :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_d16_xy :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_d16_xyz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_d16_xyzw :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_x :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_format_xy :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_xyz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_xyzw :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_sbyte :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_sshort :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_ubyte :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_ushort :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_store_byte :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dword :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dwordx2 :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dwordx3 :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dwordx4 :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_d16_x :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_d16_xy :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_d16_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_d16_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_xy :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_lds_dword :ref:`srsrc`, :ref:`soffset` :ref:`buf_offset12` :ref:`lds` :ref:`glc` :ref:`slc` + buffer_store_short :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_wbinvl1 + buffer_wbinvl1_vol + +SMEM +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_atc_probe :ref:`imm3`, :ref:`sbase`, :ref:`soffset` + s_atc_probe_buffer :ref:`imm3`, :ref:`sbase`, :ref:`soffset` + s_buffer_load_dword :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_load_dwordx16 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_load_dwordx2 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_load_dwordx4 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_load_dwordx8 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_store_dword :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_store_dwordx2 :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_store_dwordx4 :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_dcache_inv + s_dcache_inv_vol + s_dcache_wb + s_dcache_wb_vol + s_load_dword :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_load_dwordx16 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_load_dwordx2 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_load_dwordx4 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_load_dwordx8 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_memrealtime :ref:`sdst` + s_memtime :ref:`sdst` + s_store_dword :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_store_dwordx2 :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_store_dwordx4 :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + +SOP1 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_abs_i32 :ref:`sdst`, :ref:`ssrc` + s_and_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_andn2_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_bcnt0_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_bcnt0_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_bcnt1_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_bcnt1_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_bitset0_b32 :ref:`sdst`, :ref:`ssrc` + s_bitset0_b64 :ref:`sdst`, :ref:`ssrc`::ref:`b32` + s_bitset1_b32 :ref:`sdst`, :ref:`ssrc` + s_bitset1_b64 :ref:`sdst`, :ref:`ssrc`::ref:`b32` + s_brev_b32 :ref:`sdst`, :ref:`ssrc` + s_brev_b64 :ref:`sdst`, :ref:`ssrc` + s_cbranch_join :ref:`ssrc` + s_cmov_b32 :ref:`sdst`, :ref:`ssrc` + s_cmov_b64 :ref:`sdst`, :ref:`ssrc` + s_ff0_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_ff0_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_ff1_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_ff1_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32_i64 :ref:`sdst`, :ref:`ssrc` + s_getpc_b64 :ref:`sdst` + s_mov_b32 :ref:`sdst`, :ref:`ssrc` + s_mov_b64 :ref:`sdst`, :ref:`ssrc` + s_mov_fed_b32 :ref:`sdst`, :ref:`ssrc` + s_movreld_b32 :ref:`sdst`, :ref:`ssrc` + s_movreld_b64 :ref:`sdst`, :ref:`ssrc` + s_movrels_b32 :ref:`sdst`, :ref:`ssrc` + s_movrels_b64 :ref:`sdst`, :ref:`ssrc` + s_nand_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_nor_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_not_b32 :ref:`sdst`, :ref:`ssrc` + s_not_b64 :ref:`sdst`, :ref:`ssrc` + s_or_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_orn2_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_quadmask_b32 :ref:`sdst`, :ref:`ssrc` + s_quadmask_b64 :ref:`sdst`, :ref:`ssrc` + s_rfe_b64 :ref:`ssrc` + s_set_gpr_idx_idx :ref:`ssrc` + s_setpc_b64 :ref:`ssrc` + s_sext_i32_i16 :ref:`sdst`, :ref:`ssrc` + s_sext_i32_i8 :ref:`sdst`, :ref:`ssrc` + s_swappc_b64 :ref:`sdst`, :ref:`ssrc` + s_wqm_b32 :ref:`sdst`, :ref:`ssrc` + s_wqm_b64 :ref:`sdst`, :ref:`ssrc` + s_xnor_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_xor_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + +SOP2 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_absdiff_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_addc_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_and_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_and_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_andn2_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_andn2_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_ashr_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_ashr_i64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfe_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfe_i64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfe_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfe_u64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfm_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfm_b64 :ref:`sdst`, :ref:`ssrc0`::ref:`b32`, :ref:`ssrc1`::ref:`b32` + s_cbranch_g_fork :ref:`ssrc0`, :ref:`ssrc1` + s_cselect_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_cselect_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_lshl_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_lshr_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_lshr_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_max_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_max_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_min_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_min_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_mul_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nand_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nand_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_or_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_or_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_orn2_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_orn2_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_rfe_restore_b64 :ref:`ssrc0`, :ref:`ssrc1`::ref:`b32` + s_sub_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_sub_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_subb_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xnor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xnor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + +SOPC +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_bitcmp0_b32 :ref:`ssrc0`, :ref:`ssrc1` + s_bitcmp0_b64 :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bitcmp1_b32 :ref:`ssrc0`, :ref:`ssrc1` + s_bitcmp1_b64 :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_cmp_eq_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_eq_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_eq_u64 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ge_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ge_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_gt_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_gt_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_le_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_le_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_u64 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lt_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lt_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_set_gpr_idx_on :ref:`ssrc`, :ref:`imm4` + s_setvskip :ref:`ssrc0`, :ref:`ssrc1` + +SOPK +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_addk_i32 :ref:`sdst`, :ref:`imm16` + s_cbranch_i_fork :ref:`ssrc`, :ref:`label` + s_cmovk_i32 :ref:`sdst`, :ref:`imm16` + s_cmpk_eq_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_eq_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_ge_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_ge_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_gt_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_gt_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_le_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_le_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lg_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lg_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lt_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lt_u32 :ref:`ssrc`, :ref:`imm16` + s_getreg_b32 :ref:`sdst`, :ref:`hwreg` + s_movk_i32 :ref:`sdst`, :ref:`imm16` + s_mulk_i32 :ref:`sdst`, :ref:`imm16` + s_setreg_b32 :ref:`hwreg`, :ref:`ssrc` + s_setreg_imm32_b32 :ref:`hwreg`, :ref:`imm32` + +SOPP +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **SRC** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_barrier + s_branch :ref:`label` + s_cbranch_cdbgsys :ref:`label` + s_cbranch_cdbgsys_and_user :ref:`label` + s_cbranch_cdbgsys_or_user :ref:`label` + s_cbranch_cdbguser :ref:`label` + s_cbranch_execnz :ref:`label` + s_cbranch_execz :ref:`label` + s_cbranch_scc0 :ref:`label` + s_cbranch_scc1 :ref:`label` + s_cbranch_vccnz :ref:`label` + s_cbranch_vccz :ref:`label` + s_decperflevel :ref:`imm16` + s_endpgm + s_endpgm_saved + s_icache_inv + s_incperflevel :ref:`imm16` + s_nop :ref:`imm16` + s_sendmsg :ref:`msg` + s_sendmsghalt :ref:`msg` + s_set_gpr_idx_mode :ref:`imm4` + s_set_gpr_idx_off + s_sethalt :ref:`imm16` + s_setkill :ref:`imm16` + s_setprio :ref:`imm16` + s_sleep :ref:`imm16` + s_trap :ref:`imm16` + s_ttracedata + s_waitcnt :ref:`waitcnt` + s_wakeup + +VINTRP +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_interp_mov_f32 :ref:`vdst`, :ref:`param`::ref:`b32`, :ref:`attr`::ref:`b32` + v_interp_p1_f32 :ref:`vdst`, :ref:`vsrc`, :ref:`attr`::ref:`b32` + v_interp_p2_f32 :ref:`vdst`, :ref:`vsrc`, :ref:`attr`::ref:`b32` + +VOP1 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_bfrev_b32 :ref:`vdst`, :ref:`src` + v_bfrev_b32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_bfrev_b32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ceil_f16 :ref:`vdst`, :ref:`src` + v_ceil_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ceil_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ceil_f32 :ref:`vdst`, :ref:`src` + v_ceil_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ceil_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ceil_f64 :ref:`vdst`, :ref:`src` + v_clrexcp + v_cos_f16 :ref:`vdst`, :ref:`src` + v_cos_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cos_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cos_f32 :ref:`vdst`, :ref:`src` + v_cos_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cos_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f16_f32 :ref:`vdst`, :ref:`src` + v_cvt_f16_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f16_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f16_i16 :ref:`vdst`, :ref:`src` + v_cvt_f16_i16_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f16_i16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f16_u16 :ref:`vdst`, :ref:`src` + v_cvt_f16_u16_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f16_u16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_f16 :ref:`vdst`, :ref:`src` + v_cvt_f32_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_f64 :ref:`vdst`, :ref:`src` + v_cvt_f32_i32 :ref:`vdst`, :ref:`src` + v_cvt_f32_i32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_i32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_u32 :ref:`vdst`, :ref:`src` + v_cvt_f32_u32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_u32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_ubyte0 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte0_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_ubyte0_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_ubyte1 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte1_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_ubyte1_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_ubyte2 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte2_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_ubyte2_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_ubyte3 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte3_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_ubyte3_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f64_f32 :ref:`vdst`, :ref:`src` + v_cvt_f64_i32 :ref:`vdst`, :ref:`src` + v_cvt_f64_u32 :ref:`vdst`, :ref:`src` + v_cvt_flr_i32_f32 :ref:`vdst`, :ref:`src` + v_cvt_flr_i32_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_flr_i32_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_i16_f16 :ref:`vdst`, :ref:`src` + v_cvt_i16_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_i16_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_i32_f32 :ref:`vdst`, :ref:`src` + v_cvt_i32_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_i32_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_i32_f64 :ref:`vdst`, :ref:`src` + v_cvt_off_f32_i4 :ref:`vdst`, :ref:`src` + v_cvt_off_f32_i4_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_off_f32_i4_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_rpi_i32_f32 :ref:`vdst`, :ref:`src` + v_cvt_rpi_i32_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_rpi_i32_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_u16_f16 :ref:`vdst`, :ref:`src` + v_cvt_u16_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_u16_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_u32_f32 :ref:`vdst`, :ref:`src` + v_cvt_u32_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_u32_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_u32_f64 :ref:`vdst`, :ref:`src` + v_exp_f16 :ref:`vdst`, :ref:`src` + v_exp_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_exp_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_exp_f32 :ref:`vdst`, :ref:`src` + v_exp_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_exp_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_exp_legacy_f32 :ref:`vdst`, :ref:`src` + v_exp_legacy_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_exp_legacy_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ffbh_i32 :ref:`vdst`, :ref:`src` + v_ffbh_i32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ffbh_i32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ffbh_u32 :ref:`vdst`, :ref:`src` + v_ffbh_u32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ffbh_u32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ffbl_b32 :ref:`vdst`, :ref:`src` + v_ffbl_b32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ffbl_b32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_floor_f16 :ref:`vdst`, :ref:`src` + v_floor_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_floor_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_floor_f32 :ref:`vdst`, :ref:`src` + v_floor_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_floor_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_floor_f64 :ref:`vdst`, :ref:`src` + v_fract_f16 :ref:`vdst`, :ref:`src` + v_fract_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_fract_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_fract_f32 :ref:`vdst`, :ref:`src` + v_fract_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_fract_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_fract_f64 :ref:`vdst`, :ref:`src` + v_frexp_exp_i16_f16 :ref:`vdst`, :ref:`src` + v_frexp_exp_i16_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_frexp_exp_i16_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_frexp_exp_i32_f32 :ref:`vdst`, :ref:`src` + v_frexp_exp_i32_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_frexp_exp_i32_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_frexp_exp_i32_f64 :ref:`vdst`, :ref:`src` + v_frexp_mant_f16 :ref:`vdst`, :ref:`src` + v_frexp_mant_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_frexp_mant_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_frexp_mant_f32 :ref:`vdst`, :ref:`src` + v_frexp_mant_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_frexp_mant_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_frexp_mant_f64 :ref:`vdst`, :ref:`src` + v_log_f16 :ref:`vdst`, :ref:`src` + v_log_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_log_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_log_f32 :ref:`vdst`, :ref:`src` + v_log_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_log_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_log_legacy_f32 :ref:`vdst`, :ref:`src` + v_log_legacy_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_log_legacy_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_mov_b32 :ref:`vdst`, :ref:`src` + v_mov_b32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mov_b32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_mov_fed_b32 :ref:`vdst`, :ref:`src` + v_mov_fed_b32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mov_fed_b32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_movreld_b32 :ref:`vdst`, :ref:`src` + v_movrels_b32 :ref:`vdst`, :ref:`vsrc` + v_movrelsd_b32 :ref:`vdst`, :ref:`vsrc` + v_nop + v_not_b32 :ref:`vdst`, :ref:`src` + v_not_b32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_not_b32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rcp_f16 :ref:`vdst`, :ref:`src` + v_rcp_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rcp_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rcp_f32 :ref:`vdst`, :ref:`src` + v_rcp_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rcp_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rcp_f64 :ref:`vdst`, :ref:`src` + v_rcp_iflag_f32 :ref:`vdst`, :ref:`src` + v_rcp_iflag_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rcp_iflag_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_readfirstlane_b32 :ref:`sdst`, :ref:`vsrc` + v_rndne_f16 :ref:`vdst`, :ref:`src` + v_rndne_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rndne_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rndne_f32 :ref:`vdst`, :ref:`src` + v_rndne_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rndne_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rndne_f64 :ref:`vdst`, :ref:`src` + v_rsq_f16 :ref:`vdst`, :ref:`src` + v_rsq_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rsq_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rsq_f32 :ref:`vdst`, :ref:`src` + v_rsq_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rsq_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rsq_f64 :ref:`vdst`, :ref:`src` + v_sin_f16 :ref:`vdst`, :ref:`src` + v_sin_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sin_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_sin_f32 :ref:`vdst`, :ref:`src` + v_sin_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sin_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_sqrt_f16 :ref:`vdst`, :ref:`src` + v_sqrt_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sqrt_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_sqrt_f32 :ref:`vdst`, :ref:`src` + v_sqrt_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sqrt_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_sqrt_f64 :ref:`vdst`, :ref:`src` + v_trunc_f16 :ref:`vdst`, :ref:`src` + v_trunc_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_trunc_f16_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_trunc_f32 :ref:`vdst`, :ref:`src` + v_trunc_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_trunc_f32_sdwa :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_trunc_f64 :ref:`vdst`, :ref:`src` + +VOP2 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST0** **DST1** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_add_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_add_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_add_f16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_add_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_add_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_add_f32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_add_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_add_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_add_u16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_add_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_add_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_add_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_addc_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_addc_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vcc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_addc_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_and_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_and_b32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_and_b32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_ashrrev_i16 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`vsrc1` + v_ashrrev_i16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u16`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ashrrev_i16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`::ref:`u16`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_ashrrev_i32 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`vsrc1` + v_ashrrev_i32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u32`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ashrrev_i32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`::ref:`u32`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_cndmask_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_cndmask_b32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vcc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cndmask_b32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_ldexp_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`::ref:`i16` + v_ldexp_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`i16` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ldexp_f16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m`::ref:`i16` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_lshlrev_b16 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`vsrc1` + v_lshlrev_b16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u16`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_lshlrev_b16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`::ref:`u16`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_lshlrev_b32 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`vsrc1` + v_lshlrev_b32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u32`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_lshlrev_b32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`::ref:`u32`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_lshrrev_b16 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`vsrc1` + v_lshrrev_b16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u16`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_lshrrev_b16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`::ref:`u16`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_lshrrev_b32 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`vsrc1` + v_lshrrev_b32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u32`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_lshrrev_b32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`::ref:`u32`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mac_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mac_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mac_f16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mac_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mac_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mac_f32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_madak_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`, :ref:`imm32` + v_madak_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`, :ref:`imm32` + v_madmk_f16 :ref:`vdst`, :ref:`src0`, :ref:`imm32`, :ref:`vsrc2` + v_madmk_f32 :ref:`vdst`, :ref:`src0`, :ref:`imm32`, :ref:`vsrc2` + v_max_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_f16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_max_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_f32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_max_i16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_i16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_i16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_max_i32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_i32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_i32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_max_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_u16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_max_u32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_u32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_u32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_f16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_f32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_i16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_i16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_i16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_i32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_i32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_i32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_u16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_u32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_u32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_u32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_f16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_f32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_hi_i32_i24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_hi_i32_i24_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_hi_i32_i24_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_hi_u32_u24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_hi_u32_u24_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_hi_u32_u24_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_i32_i24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_i32_i24_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_i32_i24_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_legacy_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_legacy_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_legacy_f32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_lo_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_lo_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_lo_u16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_u32_u24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_u32_u24_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_u32_u24_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_or_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_or_b32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_or_b32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_sub_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_sub_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sub_f16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_sub_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_sub_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sub_f32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_sub_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_sub_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sub_u16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_sub_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_sub_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sub_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subb_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_subb_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vcc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subb_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subbrev_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_subbrev_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vcc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subbrev_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subrev_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_subrev_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subrev_f16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subrev_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_subrev_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subrev_f32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subrev_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_subrev_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subrev_u16_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subrev_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_subrev_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subrev_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_xor_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_xor_b32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_xor_b32_sdwa :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + +VOP3 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST0** **DST1** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_add_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_add_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_add_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_add_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_add_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_addc_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_alignbit_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_alignbyte_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_and_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_ashrrev_i16_e64 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`src1` + v_ashrrev_i32_e64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_ashrrev_i64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_bcnt_u32_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_bfe_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`::ref:`u32`, :ref:`src2`::ref:`u32` + v_bfe_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_bfi_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_bfm_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_bfrev_b32_e64 :ref:`vdst`, :ref:`src` + v_ceil_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_ceil_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_ceil_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_clrexcp_e64 + v_cmp_class_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmp_class_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmp_class_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmp_eq_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_eq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_eq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_eq_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_f_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_f_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_f_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ge_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_gt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_gt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_gt_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_le_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_le_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_le_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lg_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lt_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_neq_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_neq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_neq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nge_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ngt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ngt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ngt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nle_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nle_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nle_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlg_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_o_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_o_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_o_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_t_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_tru_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_tru_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_tru_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_u_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_u_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_u_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_class_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmpx_class_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmpx_class_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmpx_eq_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_eq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_eq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_eq_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_f_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_f_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_f_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ge_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_gt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_gt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_gt_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_le_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_le_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_le_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lg_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lt_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_neq_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_neq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_neq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nge_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ngt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ngt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ngt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nle_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nle_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nle_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlg_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_o_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_o_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_o_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_t_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_tru_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_tru_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_tru_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_u_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_u_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_u_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cndmask_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_cos_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_cos_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubeid_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubema_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubesc_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubetc_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f16_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f16_i16_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` + v_cvt_f16_u16_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` + v_cvt_f32_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f32_i32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_u32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_ubyte0_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_ubyte1_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_ubyte2_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_ubyte3_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f64_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f64_i32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f64_u32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_flr_i32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_i16_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_i32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_i32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_off_f32_i4_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_pk_i16_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_cvt_pk_u16_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_cvt_pk_u8_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`u32`, :ref:`src2`::ref:`u32` + v_cvt_pkaccum_u8_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`u32` + v_cvt_pknorm_i16_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cvt_pknorm_u16_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cvt_pkrtz_f16_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cvt_rpi_i32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_u16_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_u32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_u32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_div_fixup_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` + v_div_fixup_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_fixup_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_fmas_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_fmas_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_scale_f32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_div_scale_f64 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_exp_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_exp_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_exp_legacy_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_ffbh_i32_e64 :ref:`vdst`, :ref:`src` + v_ffbh_u32_e64 :ref:`vdst`, :ref:`src` + v_ffbl_b32_e64 :ref:`vdst`, :ref:`src` + v_floor_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_floor_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_floor_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_fma_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` + v_fma_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_fma_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_fract_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_fract_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_fract_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_frexp_exp_i16_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_frexp_exp_i32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_frexp_exp_i32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_frexp_mant_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_frexp_mant_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_frexp_mant_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_interp_mov_f32_e64 :ref:`vdst`, :ref:`param`::ref:`b32`, :ref:`attr`::ref:`b32` :ref:`clamp` :ref:`omod` + v_interp_p1_f32_e64 :ref:`vdst`, :ref:`vsrc`::ref:`m`, :ref:`attr`::ref:`b32` :ref:`clamp` :ref:`omod` + v_interp_p1ll_f16 :ref:`vdst`::ref:`f32`, :ref:`vsrc`::ref:`m`::ref:`f32`, :ref:`attr`::ref:`b32` :ref:`high` :ref:`clamp` :ref:`omod` + v_interp_p1lv_f16 :ref:`vdst`::ref:`f32`, :ref:`vsrc0`::ref:`m`::ref:`f32`, :ref:`attr`::ref:`b32`, :ref:`vsrc2`::ref:`m`::ref:`f16x2` :ref:`high` :ref:`clamp` :ref:`omod` + v_interp_p2_f16 :ref:`vdst`, :ref:`vsrc0`::ref:`m`::ref:`f32`, :ref:`attr`::ref:`b32`, :ref:`vsrc2`::ref:`m`::ref:`f32` :ref:`high` :ref:`clamp` + v_interp_p2_f32_e64 :ref:`vdst`, :ref:`vsrc`::ref:`m`, :ref:`attr`::ref:`b32` :ref:`clamp` :ref:`omod` + v_ldexp_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`i16` :ref:`clamp` + v_ldexp_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`i32` :ref:`clamp` :ref:`omod` + v_ldexp_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`i32` :ref:`clamp` :ref:`omod` + v_lerp_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`b32`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b32` + v_log_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_log_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_log_legacy_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_lshlrev_b16_e64 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`src1` + v_lshlrev_b32_e64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_lshlrev_b64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_lshrrev_b16_e64 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`src1` + v_lshrrev_b32_e64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_lshrrev_b64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_mac_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_mac_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mad_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` + v_mad_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_mad_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`clamp` + v_mad_i32_i24 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`i32` :ref:`clamp` + v_mad_i64_i32 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`i64` :ref:`clamp` + v_mad_legacy_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_mad_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`clamp` + v_mad_u32_u24 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`u32` :ref:`clamp` + v_mad_u64_u32 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`u64` :ref:`clamp` + v_max3_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_max3_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_max3_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_max_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_max_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_max_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_max_i16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_max_i32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_max_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_max_u32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mbcnt_hi_u32_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mbcnt_lo_u32_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_med3_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_med3_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_med3_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_min3_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_min3_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_min3_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_min_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_min_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_min_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_min_i16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_min_i32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_min_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_min_u32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mov_b32_e64 :ref:`vdst`, :ref:`src` + v_mov_fed_b32_e64 :ref:`vdst`, :ref:`src` + v_movreld_b32_e64 :ref:`vdst`, :ref:`src` + v_movrels_b32_e64 :ref:`vdst`, :ref:`vsrc` + v_movrelsd_b32_e64 :ref:`vdst`, :ref:`vsrc` + v_mqsad_pk_u16_u8 :ref:`vdst`::ref:`b64`, :ref:`src0`::ref:`b64`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b64` :ref:`clamp` + v_mqsad_u32_u8 :ref:`vdst`::ref:`b128`, :ref:`src0`::ref:`b64`, :ref:`src1`::ref:`b32`, :ref:`vsrc2`::ref:`b128` :ref:`clamp` + v_msad_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`b32`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b32` :ref:`clamp` + v_mul_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_mul_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mul_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mul_hi_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_hi_i32_i24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_hi_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_hi_u32_u24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_i32_i24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_legacy_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mul_lo_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_lo_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_u32_u24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_nop_e64 + v_not_b32_e64 :ref:`vdst`, :ref:`src` + v_or_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_perm_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_qsad_pk_u16_u8 :ref:`vdst`::ref:`b64`, :ref:`src0`::ref:`b64`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b64` :ref:`clamp` + v_rcp_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_rcp_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rcp_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rcp_iflag_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_readlane_b32 :ref:`sdst`, :ref:`vsrc0`, :ref:`ssrc1` + v_rndne_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_rndne_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rndne_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rsq_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_rsq_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rsq_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sad_hi_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`u8x4`, :ref:`src1`::ref:`u8x4`, :ref:`src2`::ref:`u32` :ref:`clamp` + v_sad_u16 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`u16x2`, :ref:`src1`::ref:`u16x2`, :ref:`src2`::ref:`u32` :ref:`clamp` + v_sad_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`clamp` + v_sad_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`u8x4`, :ref:`src1`::ref:`u8x4`, :ref:`src2`::ref:`u32` :ref:`clamp` + v_sin_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_sin_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sqrt_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_sqrt_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sqrt_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sub_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_sub_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_sub_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_sub_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_subb_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_subbrev_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_subrev_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_subrev_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_subrev_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_subrev_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_trig_preop_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`u32` :ref:`clamp` :ref:`omod` + v_trunc_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_trunc_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_trunc_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_writelane_b32 :ref:`vdst`, :ref:`ssrc0`, :ref:`ssrc1` + v_xor_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + +VOPC +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_cmp_class_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmp_class_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m`::ref:`b32` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_class_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmp_class_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m`::ref:`b32` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_class_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmp_eq_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lg_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lg_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lg_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ne_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ne_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ne_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ne_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_neq_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_neq_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_neq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_neq_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_neq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nge_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nge_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nge_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ngt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ngt_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ngt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ngt_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ngt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nle_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nle_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nle_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nle_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nle_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlg_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlg_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nlg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlg_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nlg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlt_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nlt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlt_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nlt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_o_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_o_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_o_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_o_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_o_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_t_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_t_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_t_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_t_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_tru_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_tru_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_tru_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_tru_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_tru_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_u_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_u_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_u_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_u_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_u_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_class_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmpx_class_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m`::ref:`b32` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_class_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmpx_class_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m`::ref:`b32` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_class_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmpx_eq_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lg_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lg_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lg_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ne_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ne_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ne_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ne_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_neq_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_neq_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_neq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_neq_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_neq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nge_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nge_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nge_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ngt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ngt_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ngt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ngt_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ngt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nle_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nle_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nle_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nle_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nle_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlg_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlg_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nlg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlg_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nlg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlt_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nlt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlt_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nlt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_o_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_o_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_o_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_o_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_o_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_i16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_t_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_i32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_t_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_u16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_t_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_u32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_t_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_tru_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_tru_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_tru_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_tru_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_tru_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_u_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_u_f16_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_u_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_u_f32_sdwa :ref:`vcc`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_u_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + +.. |---| unicode:: U+02014 .. em dash + + +.. toctree:: + :hidden: + + gfx8_attr + gfx8_bimm16 + gfx8_bimm32 + gfx8_fimm16 + gfx8_fimm32 + gfx8_hwreg + gfx8_imm4 + gfx8_label + gfx8_msg + gfx8_param + gfx8_perm_smem + gfx8_simm16 + gfx8_tgt + gfx8_uimm16 + gfx8_waitcnt + gfx8_addr_buf + gfx8_addr_ds + gfx8_addr_flat + gfx8_addr_mimg + gfx8_base_smem_addr + gfx8_base_smem_buf + gfx8_data_buf_atomic128 + gfx8_data_buf_atomic32 + gfx8_data_buf_atomic64 + gfx8_data_buf_d16_128 + gfx8_data_buf_d16_32 + gfx8_data_buf_d16_64 + gfx8_data_buf_d16_96 + gfx8_data_mimg_atomic_cmp + gfx8_data_mimg_atomic_reg + gfx8_data_mimg_store + gfx8_data_mimg_store_d16 + gfx8_dst_buf_128 + gfx8_dst_buf_64 + gfx8_dst_buf_96 + gfx8_dst_buf_d16_128 + gfx8_dst_buf_d16_32 + gfx8_dst_buf_d16_64 + gfx8_dst_buf_d16_96 + gfx8_dst_buf_lds + gfx8_dst_flat_atomic32 + gfx8_dst_flat_atomic64 + gfx8_dst_mimg_gather4 + gfx8_dst_mimg_regular + gfx8_dst_mimg_regular_d16 + gfx8_offset_buf + gfx8_offset_smem_load + gfx8_offset_smem_store + gfx8_rsrc_buf + gfx8_rsrc_mimg + gfx8_samp_mimg + gfx8_sdata128_0 + gfx8_sdata32_0 + gfx8_sdata64_0 + gfx8_sdst128_0 + gfx8_sdst256_0 + gfx8_sdst32_0 + gfx8_sdst32_1 + gfx8_sdst32_2 + gfx8_sdst512_0 + gfx8_sdst64_0 + gfx8_sdst64_1 + gfx8_src32_0 + gfx8_src32_1 + gfx8_src64_0 + gfx8_src64_1 + gfx8_src_exp + gfx8_ssrc32_0 + gfx8_ssrc32_1 + gfx8_ssrc32_2 + gfx8_ssrc32_3 + gfx8_ssrc32_4 + gfx8_ssrc64_0 + gfx8_ssrc64_1 + gfx8_ssrc64_2 + gfx8_ssrc64_3 + gfx8_vcc_64 + gfx8_vdata128_0 + gfx8_vdata32_0 + gfx8_vdata64_0 + gfx8_vdata96_0 + gfx8_vdst128_0 + gfx8_vdst32_0 + gfx8_vdst64_0 + gfx8_vdst96_0 + gfx8_vsrc128_0 + gfx8_vsrc32_0 + gfx8_vsrc64_0 + gfx8_mod_dpp_sdwa_abs_neg + gfx8_mod_sdwa_sext + gfx8_mod_vop3_abs_neg + gfx8_opt + gfx8_ret + gfx8_type_dev diff --git a/docs/AMDGPU/AMDGPUAsmGFX9.rst b/docs/AMDGPU/AMDGPUAsmGFX9.rst new file mode 100644 index 0000000000000000000000000000000000000000..71052d0cf6e4ca1d96874f84ecc652165945e743 --- /dev/null +++ b/docs/AMDGPU/AMDGPUAsmGFX9.rst @@ -0,0 +1,2102 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +============================ +Syntax of GFX9 Instructions +============================ + +.. contents:: + :local: + +Notation +======== + +Notation used in this document is explained :ref:`here`. + +Introduction +============ + +An overview of generic syntax and other features of AMDGPU instructions may be found :ref:`in this document`. + +Instructions +============ + + +DS +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + ds_add_f32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_rtn_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_src2_f32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_add_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_add_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_add_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_add_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_and_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_and_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_append :ref:`vdst` :ref:`ds_offset16` :ref:`gds` + ds_bpermute_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` + ds_cmpst_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_f32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_f64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_cmpst_rtn_f64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_condxchg32_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_consume :ref:`vdst` :ref:`ds_offset16` :ref:`gds` + ds_dec_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_dec_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_dec_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_dec_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_dec_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_dec_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_barrier :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_init :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_br :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_p :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_release_all :ref:`ds_offset16` :ref:`gds` + ds_gws_sema_v :ref:`ds_offset16` :ref:`gds` + ds_inc_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_inc_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_inc_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_inc_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_inc_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_inc_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_f32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_f64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_i32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_i64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_f64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_i32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_i64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_f32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_f64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_i32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_i64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_max_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_max_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_f32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_f64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_i32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_i64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_f64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_i32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_i64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_f32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_f64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_i32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_i64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_min_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_min_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_mskor_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_mskor_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_mskor_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_mskor_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_nop + ds_or_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_or_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_or_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_ordered_count :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_permute_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` + ds_read2_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read2_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read2st64_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read2st64_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_read_b128 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_b32 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_b64 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_b96 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_i16 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_i8 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_i8_d16 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_i8_d16_hi :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_u16 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_u16_d16 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_u16_d16_hi :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_u8 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_u8_d16 :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_read_u8_d16_hi :ref:`vdst`, :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_rsub_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_rsub_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_rsub_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_rsub_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_rsub_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_rsub_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_rtn_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_rtn_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_src2_u32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_sub_src2_u64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_sub_u32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_sub_u64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_swizzle_b32 :ref:`vdst`, :ref:`vaddr` :ref:`sw_offset16` :ref:`gds` + ds_wrap_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset16` :ref:`gds` + ds_write2_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write2_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write2st64_b32 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write2st64_b64 :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_write_b128 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b16 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b16_d16_hi :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b8 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b8_d16_hi :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_b96 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_write_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_write_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_wrxchg2_rtn_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg2_rtn_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg2st64_rtn_b32 :ref:`vdst`::ref:`b32x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg2st64_rtn_b64 :ref:`vdst`::ref:`b64x2`, :ref:`vaddr`, :ref:`vdata0`, :ref:`vdata1` :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` + ds_wrxchg_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_wrxchg_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_b32 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_b64 :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_rtn_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_rtn_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vdata` :ref:`ds_offset16` :ref:`gds` + ds_xor_src2_b32 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + ds_xor_src2_b64 :ref:`vaddr` :ref:`ds_offset16` :ref:`gds` + +EXP +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **SRC3** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + exp :ref:`tgt`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vsrc2`, :ref:`vsrc3` :ref:`done` :ref:`compr` :ref:`vm` + +FLAT +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + flat_atomic_add :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_add_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_and :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_and_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_cmpswap :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`::ref:`b32x2` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_cmpswap_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`::ref:`b64x2` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_dec :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_dec_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_inc :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_inc_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_or :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_or_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_smax :ref:`vdst`::ref:`opt`::ref:`s32`, :ref:`vaddr`, :ref:`vdata`::ref:`s32` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_smax_x2 :ref:`vdst`::ref:`opt`::ref:`s64`, :ref:`vaddr`, :ref:`vdata`::ref:`s64` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_smin :ref:`vdst`::ref:`opt`::ref:`s32`, :ref:`vaddr`, :ref:`vdata`::ref:`s32` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_smin_x2 :ref:`vdst`::ref:`opt`::ref:`s64`, :ref:`vaddr`, :ref:`vdata`::ref:`s64` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_sub :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_sub_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_swap :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_swap_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_umax :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_umax_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_umin :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_umin_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_xor :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_atomic_xor_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_dword :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_dwordx2 :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_dwordx3 :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_dwordx4 :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_sbyte :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_sbyte_d16 :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_sbyte_d16_hi :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_short_d16 :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_short_d16_hi :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_sshort :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_ubyte :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_ubyte_d16 :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_ubyte_d16_hi :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_load_ushort :ref:`vdst`, :ref:`vaddr` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_store_byte :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_store_byte_d16_hi :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_store_dword :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_store_dwordx2 :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_store_dwordx3 :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_store_dwordx4 :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_store_short :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + flat_store_short_d16_hi :ref:`vaddr`, :ref:`vdata` :ref:`flat_offset12` :ref:`glc` :ref:`slc` + global_atomic_add :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_add_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_and :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_and_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_cmpswap :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`::ref:`b32x2`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_cmpswap_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`::ref:`b64x2`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_dec :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_dec_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_inc :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_inc_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_or :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_or_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_smax :ref:`vdst`::ref:`opt`::ref:`s32`, :ref:`vaddr`, :ref:`vdata`::ref:`s32`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_smax_x2 :ref:`vdst`::ref:`opt`::ref:`s64`, :ref:`vaddr`, :ref:`vdata`::ref:`s64`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_smin :ref:`vdst`::ref:`opt`::ref:`s32`, :ref:`vaddr`, :ref:`vdata`::ref:`s32`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_smin_x2 :ref:`vdst`::ref:`opt`::ref:`s64`, :ref:`vaddr`, :ref:`vdata`::ref:`s64`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_sub :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_sub_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_swap :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_swap_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_umax :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_umax_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_umin :ref:`vdst`::ref:`opt`::ref:`u32`, :ref:`vaddr`, :ref:`vdata`::ref:`u32`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_umin_x2 :ref:`vdst`::ref:`opt`::ref:`u64`, :ref:`vaddr`, :ref:`vdata`::ref:`u64`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_xor :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_atomic_xor_x2 :ref:`vdst`::ref:`opt`, :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_dword :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_dwordx2 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_dwordx3 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_dwordx4 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_sbyte :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_sbyte_d16 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_sbyte_d16_hi :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_short_d16 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_short_d16_hi :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_sshort :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_ubyte :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_ubyte_d16 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_ubyte_d16_hi :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_load_ushort :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_store_byte :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_store_byte_d16_hi :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_store_dword :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_store_dwordx2 :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_store_dwordx3 :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_store_dwordx4 :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_store_short :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + global_store_short_d16_hi :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_dword :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_dwordx2 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_dwordx3 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_dwordx4 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_sbyte :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_sbyte_d16 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_sbyte_d16_hi :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_short_d16 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_short_d16_hi :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_sshort :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_ubyte :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_ubyte_d16 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_ubyte_d16_hi :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_load_ushort :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_store_byte :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_store_byte_d16_hi :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_store_dword :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_store_dwordx2 :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_store_dwordx3 :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_store_dwordx4 :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_store_short :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + scratch_store_short_d16_hi :ref:`vaddr`, :ref:`vdata`, :ref:`saddr` :ref:`flat_offset13` :ref:`glc` :ref:`slc` + +MIMG +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + image_atomic_add :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_and :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_cmpswap :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_dec :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_inc :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_or :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_smax :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_smin :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_sub :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_swap :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_umax :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_umin :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_atomic_xor :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_gather4 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_c_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_gather4_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_get_lod :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_get_resinfo :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_load :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_load_mip :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_load_mip_pck :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_load_mip_pck_sgn :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_load_pck :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_load_pck_sgn :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_sample :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_b :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_b_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_b_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_b_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cd :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cd_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cd_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cd_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_d :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_d_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_d_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_d_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_c_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cd :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cd_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cd_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cd_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_d :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_d_cl :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_d_cl_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_d_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_l :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_l_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_lz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_lz_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_sample_o :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`ssamp` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_store :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_store_mip :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` :ref:`d16` + image_store_mip_pck :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + image_store_pck :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc` :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`a16` :ref:`lwe` :ref:`da` + +MUBUF +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **SRC3** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + buffer_atomic_add :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_add_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_and :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_and_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_cmpswap :ref:`vdata`::ref:`dst`::ref:`b32x2`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_cmpswap_x2 :ref:`vdata`::ref:`dst`::ref:`b64x2`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_dec :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_dec_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_inc :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_inc_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_or :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_or_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smax :ref:`vdata`::ref:`dst`::ref:`s32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smax_x2 :ref:`vdata`::ref:`dst`::ref:`s64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smin :ref:`vdata`::ref:`dst`::ref:`s32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_smin_x2 :ref:`vdata`::ref:`dst`::ref:`s64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_sub :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_sub_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_swap :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_swap_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umax :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umax_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umin :ref:`vdata`::ref:`dst`::ref:`u32`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_umin_x2 :ref:`vdata`::ref:`dst`::ref:`u64`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_xor :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_atomic_xor_x2 :ref:`vdata`::ref:`dst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_dword :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_dwordx2 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_dwordx3 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_dwordx4 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_d16_hi_x :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_d16_x :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_d16_xy :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_d16_xyz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_d16_xyzw :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_x :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_format_xy :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_xyz :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_format_xyzw :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_sbyte :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_sbyte_d16 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_sbyte_d16_hi :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_short_d16 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_short_d16_hi :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_sshort :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_ubyte :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_load_ubyte_d16 :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_ubyte_d16_hi :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_load_ushort :ref:`vdst`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` + buffer_store_byte :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_byte_d16_hi :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dword :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dwordx2 :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dwordx3 :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_dwordx4 :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_d16_hi_x :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_d16_x :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_d16_xy :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_d16_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_d16_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_xy :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_format_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_lds_dword :ref:`srsrc`, :ref:`soffset` :ref:`buf_offset12` :ref:`lds` + buffer_store_short :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_store_short_d16_hi :ref:`vdata`, :ref:`vaddr`, :ref:`srsrc`, :ref:`soffset` :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` + buffer_wbinvl1 + buffer_wbinvl1_vol + +SMEM +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_atc_probe :ref:`imm3`, :ref:`sbase`, :ref:`soffset` + s_atc_probe_buffer :ref:`imm3`, :ref:`sbase`, :ref:`soffset` + s_atomic_add :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_add_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_and :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_and_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_cmpswap :ref:`sdata`::ref:`dst`::ref:`b32x2`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_cmpswap_x2 :ref:`sdata`::ref:`dst`::ref:`b64x2`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_dec :ref:`sdata`::ref:`dst`::ref:`u32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_dec_x2 :ref:`sdata`::ref:`dst`::ref:`u64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_inc :ref:`sdata`::ref:`dst`::ref:`u32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_inc_x2 :ref:`sdata`::ref:`dst`::ref:`u64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_or :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_or_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_smax :ref:`sdata`::ref:`dst`::ref:`s32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_smax_x2 :ref:`sdata`::ref:`dst`::ref:`s64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_smin :ref:`sdata`::ref:`dst`::ref:`s32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_smin_x2 :ref:`sdata`::ref:`dst`::ref:`s64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_sub :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_sub_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_swap :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_swap_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_umax :ref:`sdata`::ref:`dst`::ref:`u32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_umax_x2 :ref:`sdata`::ref:`dst`::ref:`u64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_umin :ref:`sdata`::ref:`dst`::ref:`u32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_umin_x2 :ref:`sdata`::ref:`dst`::ref:`u64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_xor :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_atomic_xor_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_add :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_add_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_and :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_and_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_cmpswap :ref:`sdata`::ref:`dst`::ref:`b32x2`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_cmpswap_x2 :ref:`sdata`::ref:`dst`::ref:`b64x2`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_dec :ref:`sdata`::ref:`dst`::ref:`u32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_dec_x2 :ref:`sdata`::ref:`dst`::ref:`u64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_inc :ref:`sdata`::ref:`dst`::ref:`u32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_inc_x2 :ref:`sdata`::ref:`dst`::ref:`u64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_or :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_or_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_smax :ref:`sdata`::ref:`dst`::ref:`s32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_smax_x2 :ref:`sdata`::ref:`dst`::ref:`s64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_smin :ref:`sdata`::ref:`dst`::ref:`s32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_smin_x2 :ref:`sdata`::ref:`dst`::ref:`s64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_sub :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_sub_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_swap :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_swap_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_umax :ref:`sdata`::ref:`dst`::ref:`u32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_umax_x2 :ref:`sdata`::ref:`dst`::ref:`u64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_umin :ref:`sdata`::ref:`dst`::ref:`u32`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_umin_x2 :ref:`sdata`::ref:`dst`::ref:`u64`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_xor :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_atomic_xor_x2 :ref:`sdata`::ref:`dst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_load_dword :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_load_dwordx16 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_load_dwordx2 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_load_dwordx4 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_load_dwordx8 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_store_dword :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_store_dwordx2 :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_buffer_store_dwordx4 :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_dcache_discard :ref:`sbase`, :ref:`soffset` + s_dcache_discard_x2 :ref:`sbase`, :ref:`soffset` + s_dcache_inv + s_dcache_inv_vol + s_dcache_wb + s_dcache_wb_vol + s_load_dword :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_load_dwordx16 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_load_dwordx2 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_load_dwordx4 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_load_dwordx8 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_memrealtime :ref:`sdst` + s_memtime :ref:`sdst` + s_scratch_load_dword :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_scratch_load_dwordx2 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_scratch_load_dwordx4 :ref:`sdst`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_scratch_store_dword :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_scratch_store_dwordx2 :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_scratch_store_dwordx4 :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_store_dword :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_store_dwordx2 :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + s_store_dwordx4 :ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`glc` + +SOP1 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_abs_i32 :ref:`sdst`, :ref:`ssrc` + s_and_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_andn1_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_andn1_wrexec_b64 :ref:`sdst`, :ref:`ssrc` + s_andn2_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_andn2_wrexec_b64 :ref:`sdst`, :ref:`ssrc` + s_bcnt0_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_bcnt0_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_bcnt1_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_bcnt1_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_bitreplicate_b64_b32 :ref:`sdst`, :ref:`ssrc` + s_bitset0_b32 :ref:`sdst`, :ref:`ssrc` + s_bitset0_b64 :ref:`sdst`, :ref:`ssrc`::ref:`b32` + s_bitset1_b32 :ref:`sdst`, :ref:`ssrc` + s_bitset1_b64 :ref:`sdst`, :ref:`ssrc`::ref:`b32` + s_brev_b32 :ref:`sdst`, :ref:`ssrc` + s_brev_b64 :ref:`sdst`, :ref:`ssrc` + s_cbranch_join :ref:`ssrc` + s_cmov_b32 :ref:`sdst`, :ref:`ssrc` + s_cmov_b64 :ref:`sdst`, :ref:`ssrc` + s_ff0_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_ff0_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_ff1_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_ff1_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32_b32 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32_b64 :ref:`sdst`, :ref:`ssrc` + s_flbit_i32_i64 :ref:`sdst`, :ref:`ssrc` + s_getpc_b64 :ref:`sdst` + s_mov_b32 :ref:`sdst`, :ref:`ssrc` + s_mov_b64 :ref:`sdst`, :ref:`ssrc` + s_mov_fed_b32 :ref:`sdst`, :ref:`ssrc` + s_movreld_b32 :ref:`sdst`, :ref:`ssrc` + s_movreld_b64 :ref:`sdst`, :ref:`ssrc` + s_movrels_b32 :ref:`sdst`, :ref:`ssrc` + s_movrels_b64 :ref:`sdst`, :ref:`ssrc` + s_nand_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_nor_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_not_b32 :ref:`sdst`, :ref:`ssrc` + s_not_b64 :ref:`sdst`, :ref:`ssrc` + s_or_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_orn1_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_orn2_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_quadmask_b32 :ref:`sdst`, :ref:`ssrc` + s_quadmask_b64 :ref:`sdst`, :ref:`ssrc` + s_rfe_b64 :ref:`ssrc` + s_set_gpr_idx_idx :ref:`ssrc` + s_setpc_b64 :ref:`ssrc` + s_sext_i32_i16 :ref:`sdst`, :ref:`ssrc` + s_sext_i32_i8 :ref:`sdst`, :ref:`ssrc` + s_swappc_b64 :ref:`sdst`, :ref:`ssrc` + s_wqm_b32 :ref:`sdst`, :ref:`ssrc` + s_wqm_b64 :ref:`sdst`, :ref:`ssrc` + s_xnor_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + s_xor_saveexec_b64 :ref:`sdst`, :ref:`ssrc` + +SOP2 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_absdiff_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_addc_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_and_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_and_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_andn2_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_andn2_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_ashr_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_ashr_i64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfe_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfe_i64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfe_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfe_u64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bfm_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfm_b64 :ref:`sdst`, :ref:`ssrc0`::ref:`b32`, :ref:`ssrc1`::ref:`b32` + s_cbranch_g_fork :ref:`ssrc0`, :ref:`ssrc1` + s_cselect_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_cselect_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl1_add_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl2_add_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl3_add_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl4_add_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_lshl_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_lshr_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_lshr_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_max_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_max_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_min_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_min_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_mul_hi_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_mul_hi_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_mul_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nand_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nand_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_or_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_or_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_orn2_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_orn2_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_pack_hh_b32_b16 :ref:`sdst`, :ref:`ssrc0`::ref:`b16x2`, :ref:`ssrc1`::ref:`b16x2` + s_pack_lh_b32_b16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`::ref:`b16x2` + s_pack_ll_b32_b16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_rfe_restore_b64 :ref:`ssrc0`, :ref:`ssrc1`::ref:`b32` + s_sub_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_sub_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_subb_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xnor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xnor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + +SOPC +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_bitcmp0_b32 :ref:`ssrc0`, :ref:`ssrc1` + s_bitcmp0_b64 :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_bitcmp1_b32 :ref:`ssrc0`, :ref:`ssrc1` + s_bitcmp1_b64 :ref:`ssrc0`, :ref:`ssrc1`::ref:`u32` + s_cmp_eq_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_eq_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_eq_u64 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ge_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ge_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_gt_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_gt_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_le_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_le_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_u64 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lt_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lt_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_set_gpr_idx_on :ref:`ssrc`, :ref:`imm4` + s_setvskip :ref:`ssrc0`, :ref:`ssrc1` + +SOPK +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_addk_i32 :ref:`sdst`, :ref:`imm16` + s_call_b64 :ref:`sdst`, :ref:`label` + s_cbranch_i_fork :ref:`ssrc`, :ref:`label` + s_cmovk_i32 :ref:`sdst`, :ref:`imm16` + s_cmpk_eq_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_eq_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_ge_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_ge_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_gt_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_gt_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_le_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_le_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lg_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lg_u32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lt_i32 :ref:`ssrc`, :ref:`imm16` + s_cmpk_lt_u32 :ref:`ssrc`, :ref:`imm16` + s_getreg_b32 :ref:`sdst`, :ref:`hwreg` + s_movk_i32 :ref:`sdst`, :ref:`imm16` + s_mulk_i32 :ref:`sdst`, :ref:`imm16` + s_setreg_b32 :ref:`hwreg`, :ref:`ssrc` + s_setreg_imm32_b32 :ref:`hwreg`, :ref:`imm32` + +SOPP +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **SRC** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_barrier + s_branch :ref:`label` + s_cbranch_cdbgsys :ref:`label` + s_cbranch_cdbgsys_and_user :ref:`label` + s_cbranch_cdbgsys_or_user :ref:`label` + s_cbranch_cdbguser :ref:`label` + s_cbranch_execnz :ref:`label` + s_cbranch_execz :ref:`label` + s_cbranch_scc0 :ref:`label` + s_cbranch_scc1 :ref:`label` + s_cbranch_vccnz :ref:`label` + s_cbranch_vccz :ref:`label` + s_decperflevel :ref:`imm16` + s_endpgm + s_endpgm_ordered_ps_done + s_endpgm_saved + s_icache_inv + s_incperflevel :ref:`imm16` + s_nop :ref:`imm16` + s_sendmsg :ref:`msg` + s_sendmsghalt :ref:`msg` + s_set_gpr_idx_mode :ref:`imm4` + s_set_gpr_idx_off + s_sethalt :ref:`imm16` + s_setkill :ref:`imm16` + s_setprio :ref:`imm16` + s_sleep :ref:`imm16` + s_trap :ref:`imm16` + s_ttracedata + s_waitcnt :ref:`waitcnt` + s_wakeup + +VINTRP +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_interp_mov_f32 :ref:`vdst`, :ref:`param`::ref:`b32`, :ref:`attr`::ref:`b32` + v_interp_p1_f32 :ref:`vdst`, :ref:`vsrc`, :ref:`attr`::ref:`b32` + v_interp_p2_f32 :ref:`vdst`, :ref:`vsrc`, :ref:`attr`::ref:`b32` + +VOP1 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_bfrev_b32 :ref:`vdst`, :ref:`src` + v_bfrev_b32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_bfrev_b32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ceil_f16 :ref:`vdst`, :ref:`src` + v_ceil_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ceil_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ceil_f32 :ref:`vdst`, :ref:`src` + v_ceil_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ceil_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ceil_f64 :ref:`vdst`, :ref:`src` + v_clrexcp + v_cos_f16 :ref:`vdst`, :ref:`src` + v_cos_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cos_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cos_f32 :ref:`vdst`, :ref:`src` + v_cos_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cos_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f16_f32 :ref:`vdst`, :ref:`src` + v_cvt_f16_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f16_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f16_i16 :ref:`vdst`, :ref:`src` + v_cvt_f16_i16_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f16_i16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f16_u16 :ref:`vdst`, :ref:`src` + v_cvt_f16_u16_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f16_u16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_f16 :ref:`vdst`, :ref:`src` + v_cvt_f32_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_f64 :ref:`vdst`, :ref:`src` + v_cvt_f32_i32 :ref:`vdst`, :ref:`src` + v_cvt_f32_i32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_i32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_u32 :ref:`vdst`, :ref:`src` + v_cvt_f32_u32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_u32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_ubyte0 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte0_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_ubyte0_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_ubyte1 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte1_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_ubyte1_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_ubyte2 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte2_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_ubyte2_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f32_ubyte3 :ref:`vdst`, :ref:`src` + v_cvt_f32_ubyte3_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_f32_ubyte3_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_f64_f32 :ref:`vdst`, :ref:`src` + v_cvt_f64_i32 :ref:`vdst`, :ref:`src` + v_cvt_f64_u32 :ref:`vdst`, :ref:`src` + v_cvt_flr_i32_f32 :ref:`vdst`, :ref:`src` + v_cvt_flr_i32_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_flr_i32_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_i16_f16 :ref:`vdst`, :ref:`src` + v_cvt_i16_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_i16_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_i32_f32 :ref:`vdst`, :ref:`src` + v_cvt_i32_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_i32_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_i32_f64 :ref:`vdst`, :ref:`src` + v_cvt_norm_i16_f16 :ref:`vdst`, :ref:`src` + v_cvt_norm_i16_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_norm_i16_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_norm_u16_f16 :ref:`vdst`, :ref:`src` + v_cvt_norm_u16_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_norm_u16_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_off_f32_i4 :ref:`vdst`, :ref:`src` + v_cvt_off_f32_i4_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_off_f32_i4_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_rpi_i32_f32 :ref:`vdst`, :ref:`src` + v_cvt_rpi_i32_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_rpi_i32_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_u16_f16 :ref:`vdst`, :ref:`src` + v_cvt_u16_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_u16_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_u32_f32 :ref:`vdst`, :ref:`src` + v_cvt_u32_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cvt_u32_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_cvt_u32_f64 :ref:`vdst`, :ref:`src` + v_exp_f16 :ref:`vdst`, :ref:`src` + v_exp_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_exp_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_exp_f32 :ref:`vdst`, :ref:`src` + v_exp_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_exp_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_exp_legacy_f32 :ref:`vdst`, :ref:`src` + v_exp_legacy_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_exp_legacy_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ffbh_i32 :ref:`vdst`, :ref:`src` + v_ffbh_i32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ffbh_i32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ffbh_u32 :ref:`vdst`, :ref:`src` + v_ffbh_u32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ffbh_u32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_ffbl_b32 :ref:`vdst`, :ref:`src` + v_ffbl_b32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ffbl_b32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_floor_f16 :ref:`vdst`, :ref:`src` + v_floor_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_floor_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_floor_f32 :ref:`vdst`, :ref:`src` + v_floor_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_floor_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_floor_f64 :ref:`vdst`, :ref:`src` + v_fract_f16 :ref:`vdst`, :ref:`src` + v_fract_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_fract_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_fract_f32 :ref:`vdst`, :ref:`src` + v_fract_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_fract_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_fract_f64 :ref:`vdst`, :ref:`src` + v_frexp_exp_i16_f16 :ref:`vdst`, :ref:`src` + v_frexp_exp_i16_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_frexp_exp_i16_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_frexp_exp_i32_f32 :ref:`vdst`, :ref:`src` + v_frexp_exp_i32_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_frexp_exp_i32_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_frexp_exp_i32_f64 :ref:`vdst`, :ref:`src` + v_frexp_mant_f16 :ref:`vdst`, :ref:`src` + v_frexp_mant_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_frexp_mant_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_frexp_mant_f32 :ref:`vdst`, :ref:`src` + v_frexp_mant_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_frexp_mant_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_frexp_mant_f64 :ref:`vdst`, :ref:`src` + v_log_f16 :ref:`vdst`, :ref:`src` + v_log_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_log_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_log_f32 :ref:`vdst`, :ref:`src` + v_log_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_log_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_log_legacy_f32 :ref:`vdst`, :ref:`src` + v_log_legacy_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_log_legacy_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_mov_b32 :ref:`vdst`, :ref:`src` + v_mov_b32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mov_b32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_mov_fed_b32 :ref:`vdst`, :ref:`src` + v_mov_fed_b32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mov_fed_b32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_nop + v_not_b32 :ref:`vdst`, :ref:`src` + v_not_b32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_not_b32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rcp_f16 :ref:`vdst`, :ref:`src` + v_rcp_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rcp_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rcp_f32 :ref:`vdst`, :ref:`src` + v_rcp_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rcp_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rcp_f64 :ref:`vdst`, :ref:`src` + v_rcp_iflag_f32 :ref:`vdst`, :ref:`src` + v_rcp_iflag_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rcp_iflag_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_readfirstlane_b32 :ref:`sdst`, :ref:`vsrc` + v_rndne_f16 :ref:`vdst`, :ref:`src` + v_rndne_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rndne_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rndne_f32 :ref:`vdst`, :ref:`src` + v_rndne_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rndne_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rndne_f64 :ref:`vdst`, :ref:`src` + v_rsq_f16 :ref:`vdst`, :ref:`src` + v_rsq_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rsq_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rsq_f32 :ref:`vdst`, :ref:`src` + v_rsq_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_rsq_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_rsq_f64 :ref:`vdst`, :ref:`src` + v_sat_pk_u8_i16 :ref:`vdst`, :ref:`src` + v_sat_pk_u8_i16_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sat_pk_u8_i16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_screen_partition_4se_b32 :ref:`vdst`, :ref:`src` + v_screen_partition_4se_b32_dpp :ref:`vdst`, :ref:`vsrc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_screen_partition_4se_b32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_sin_f16 :ref:`vdst`, :ref:`src` + v_sin_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sin_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_sin_f32 :ref:`vdst`, :ref:`src` + v_sin_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sin_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_sqrt_f16 :ref:`vdst`, :ref:`src` + v_sqrt_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sqrt_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_sqrt_f32 :ref:`vdst`, :ref:`src` + v_sqrt_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sqrt_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_sqrt_f64 :ref:`vdst`, :ref:`src` + v_swap_b32 :ref:`vdst`, :ref:`vsrc` + v_trunc_f16 :ref:`vdst`, :ref:`src` + v_trunc_f16_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_trunc_f16_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_trunc_f32 :ref:`vdst`, :ref:`src` + v_trunc_f32_dpp :ref:`vdst`, :ref:`vsrc`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_trunc_f32_sdwa :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` + v_trunc_f64 :ref:`vdst`, :ref:`src` + +VOP2 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST0** **DST1** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_add_co_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_add_co_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_add_co_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_add_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_add_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_add_f16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_add_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_add_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_add_f32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_add_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_add_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_add_u16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_add_u32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_add_u32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_add_u32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_addc_co_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_addc_co_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vcc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_addc_co_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_and_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_and_b32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_and_b32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_ashrrev_i16 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`vsrc1` + v_ashrrev_i16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u16`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ashrrev_i16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`::ref:`u16`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_ashrrev_i32 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`vsrc1` + v_ashrrev_i32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u32`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ashrrev_i32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`::ref:`u32`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_cndmask_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_cndmask_b32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vcc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_cndmask_b32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_ldexp_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`::ref:`i16` + v_ldexp_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`i16` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_ldexp_f16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`::ref:`i16` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_lshlrev_b16 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`vsrc1` + v_lshlrev_b16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u16`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_lshlrev_b16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`::ref:`u16`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_lshlrev_b32 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`vsrc1` + v_lshlrev_b32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u32`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_lshlrev_b32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`::ref:`u32`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_lshrrev_b16 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`vsrc1` + v_lshrrev_b16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u16`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_lshrrev_b16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`::ref:`u16`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_lshrrev_b32 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`vsrc1` + v_lshrrev_b32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`u32`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_lshrrev_b32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`::ref:`u32`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mac_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mac_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mac_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mac_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_madak_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`, :ref:`imm32` + v_madak_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1`, :ref:`imm32` + v_madmk_f16 :ref:`vdst`, :ref:`src0`, :ref:`imm32`, :ref:`vsrc2` + v_madmk_f32 :ref:`vdst`, :ref:`src0`, :ref:`imm32`, :ref:`vsrc2` + v_max_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_f16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_max_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_f32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_max_i16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_i16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_i16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_max_i32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_i32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_i32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_max_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_u16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_max_u32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_max_u32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_max_u32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_f16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_f32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_i16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_i16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_i16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_i32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_i32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_i32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_u16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_min_u32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_min_u32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_min_u32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_f16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_f32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_hi_i32_i24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_hi_i32_i24_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_hi_i32_i24_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_hi_u32_u24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_hi_u32_u24_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_hi_u32_u24_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_i32_i24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_i32_i24_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_i32_i24_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_legacy_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_legacy_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_legacy_f32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_lo_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_lo_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_lo_u16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_mul_u32_u24 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_mul_u32_u24_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_mul_u32_u24_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_or_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_or_b32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_or_b32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_sub_co_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_sub_co_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sub_co_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_sub_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_sub_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sub_f16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_sub_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_sub_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sub_f32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_sub_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_sub_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sub_u16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_sub_u32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_sub_u32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_sub_u32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subb_co_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_subb_co_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vcc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subb_co_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subbrev_co_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`, :ref:`vcc` + v_subbrev_co_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vcc` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subbrev_co_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subrev_co_u32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_subrev_co_u32_dpp :ref:`vdst`, :ref:`vcc`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subrev_co_u32_sdwa :ref:`vdst`, :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subrev_f16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_subrev_f16_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subrev_f16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subrev_f32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_subrev_f32_dpp :ref:`vdst`, :ref:`vsrc0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subrev_f32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subrev_u16 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_subrev_u16_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subrev_u16_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_subrev_u32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_subrev_u32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_subrev_u32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + v_xor_b32 :ref:`vdst`, :ref:`src0`, :ref:`vsrc1` + v_xor_b32_dpp :ref:`vdst`, :ref:`vsrc0`, :ref:`vsrc1` :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` + v_xor_b32_sdwa :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` + +VOP3 +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST0** **DST1** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_add3_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_add_co_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_add_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_add_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_add_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_add_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`vop3_op_sel` :ref:`clamp` + v_add_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_add_lshl_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_add_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_add_u32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_addc_co_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_alignbit_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_alignbyte_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_and_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_and_or_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_ashrrev_i16_e64 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`src1` + v_ashrrev_i32_e64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_ashrrev_i64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_bcnt_u32_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_bfe_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`::ref:`u32`, :ref:`src2`::ref:`u32` + v_bfe_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_bfi_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_bfm_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_bfrev_b32_e64 :ref:`vdst`, :ref:`src` + v_ceil_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_ceil_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_ceil_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_clrexcp_e64 + v_cmp_class_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmp_class_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmp_class_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmp_eq_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_eq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_eq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_eq_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_eq_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_f_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_f_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_f_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_f_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ge_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ge_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_gt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_gt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_gt_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_gt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_le_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_le_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_le_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_le_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lg_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_lt_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_lt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_ne_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_neq_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_neq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_neq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nge_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ngt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ngt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_ngt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nle_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nle_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nle_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlg_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_nlt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_o_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_o_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_o_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_t_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_t_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmp_tru_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_tru_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_tru_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_u_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_u_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmp_u_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_class_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmpx_class_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmpx_class_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`b32` + v_cmpx_eq_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_eq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_eq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_eq_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_eq_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_f_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_f_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_f_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_f_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ge_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ge_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_gt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_gt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_gt_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_gt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_le_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_le_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_le_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_le_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lg_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_lt_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_lt_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_ne_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_neq_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_neq_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_neq_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nge_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nge_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nge_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ngt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ngt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_ngt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nle_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nle_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nle_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlg_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlg_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlg_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlt_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlt_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_nlt_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_o_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_o_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_o_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_t_i16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_i32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_i64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_u16_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_u32_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_t_u64_e64 :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_cmpx_tru_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_tru_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_tru_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_u_f16_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_u_f32_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cmpx_u_f64_e64 :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_cndmask_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_cos_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_cos_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubeid_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubema_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubesc_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cubetc_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f16_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f16_i16_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` + v_cvt_f16_u16_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` + v_cvt_f32_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f32_i32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_u32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_ubyte0_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_ubyte1_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_ubyte2_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f32_ubyte3_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f64_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_cvt_f64_i32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_f64_u32_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_flr_i32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_i16_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_cvt_i32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_cvt_i32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_cvt_norm_i16_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_norm_u16_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_off_f32_i4_e64 :ref:`vdst`, :ref:`src` :ref:`clamp` :ref:`omod` + v_cvt_pk_i16_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_cvt_pk_u16_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_cvt_pk_u8_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`u32`, :ref:`src2`::ref:`u32` + v_cvt_pkaccum_u8_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`u32` + v_cvt_pknorm_i16_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`vop3_op_sel` + v_cvt_pknorm_i16_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cvt_pknorm_u16_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`vop3_op_sel` + v_cvt_pknorm_u16_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cvt_pkrtz_f16_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` + v_cvt_rpi_i32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_cvt_u16_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_cvt_u32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_cvt_u32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_div_fixup_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`vop3_op_sel` :ref:`clamp` + v_div_fixup_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_fixup_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_fixup_legacy_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` + v_div_fmas_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_fmas_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_div_scale_f32 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_div_scale_f64 :ref:`vdst`, :ref:`vcc`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_exp_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_exp_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_exp_legacy_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_ffbh_i32_e64 :ref:`vdst`, :ref:`src` + v_ffbh_u32_e64 :ref:`vdst`, :ref:`src` + v_ffbl_b32_e64 :ref:`vdst`, :ref:`src` + v_floor_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_floor_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_floor_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_fma_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`vop3_op_sel` :ref:`clamp` + v_fma_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_fma_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_fma_legacy_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` + v_fract_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_fract_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_fract_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_frexp_exp_i16_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_frexp_exp_i32_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_frexp_exp_i32_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` + v_frexp_mant_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_frexp_mant_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_frexp_mant_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_interp_mov_f32_e64 :ref:`vdst`, :ref:`param`::ref:`b32`, :ref:`attr`::ref:`b32` :ref:`clamp` :ref:`omod` + v_interp_p1_f32_e64 :ref:`vdst`, :ref:`vsrc`::ref:`m`, :ref:`attr`::ref:`b32` :ref:`clamp` :ref:`omod` + v_interp_p1ll_f16 :ref:`vdst`::ref:`f32`, :ref:`vsrc`::ref:`m`::ref:`f32`, :ref:`attr`::ref:`b32` :ref:`high` :ref:`clamp` :ref:`omod` + v_interp_p1lv_f16 :ref:`vdst`::ref:`f32`, :ref:`vsrc0`::ref:`m`::ref:`f32`, :ref:`attr`::ref:`b32`, :ref:`vsrc2`::ref:`m`::ref:`f16x2` :ref:`high` :ref:`clamp` :ref:`omod` + v_interp_p2_f16 :ref:`vdst`, :ref:`vsrc0`::ref:`m`::ref:`f32`, :ref:`attr`::ref:`b32`, :ref:`vsrc2`::ref:`m`::ref:`f32` :ref:`high` :ref:`clamp` + v_interp_p2_f32_e64 :ref:`vdst`, :ref:`vsrc`::ref:`m`, :ref:`attr`::ref:`b32` :ref:`clamp` :ref:`omod` + v_interp_p2_legacy_f16 :ref:`vdst`, :ref:`vsrc0`::ref:`m`::ref:`f32`, :ref:`attr`::ref:`b32`, :ref:`vsrc2`::ref:`m`::ref:`f32` :ref:`high` :ref:`clamp` + v_ldexp_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`i16` :ref:`clamp` + v_ldexp_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`i32` :ref:`clamp` :ref:`omod` + v_ldexp_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`i32` :ref:`clamp` :ref:`omod` + v_lerp_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`b32`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b32` + v_log_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_log_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_log_legacy_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_lshl_add_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_lshl_or_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`::ref:`u32`, :ref:`src2` + v_lshlrev_b16_e64 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`src1` + v_lshlrev_b32_e64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_lshlrev_b64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_lshrrev_b16_e64 :ref:`vdst`, :ref:`src0`::ref:`u16`, :ref:`src1` + v_lshrrev_b32_e64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_lshrrev_b64 :ref:`vdst`, :ref:`src0`::ref:`u32`, :ref:`src1` + v_mac_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_mac_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mad_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`vop3_op_sel` :ref:`clamp` + v_mad_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_mad_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`vop3_op_sel` :ref:`clamp` + v_mad_i32_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`i32` :ref:`vop3_op_sel` :ref:`clamp` + v_mad_i32_i24 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`i32` :ref:`clamp` + v_mad_i64_i32 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`i64` :ref:`clamp` + v_mad_legacy_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` + v_mad_legacy_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_mad_legacy_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`clamp` + v_mad_legacy_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`clamp` + v_mad_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`vop3_op_sel` :ref:`clamp` + v_mad_u32_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`u32` :ref:`vop3_op_sel` :ref:`clamp` + v_mad_u32_u24 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`u32` :ref:`clamp` + v_mad_u64_u32 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`::ref:`u64` :ref:`clamp` + v_max3_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`vop3_op_sel` :ref:`clamp` + v_max3_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_max3_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`vop3_op_sel` + v_max3_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_max3_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`vop3_op_sel` + v_max3_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_max_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_max_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_max_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_max_i16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_max_i32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_max_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_max_u32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mbcnt_hi_u32_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mbcnt_lo_u32_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_med3_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`vop3_op_sel` :ref:`clamp` + v_med3_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_med3_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`vop3_op_sel` + v_med3_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_med3_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`vop3_op_sel` + v_med3_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_min3_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`vop3_op_sel` :ref:`clamp` + v_min3_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m` :ref:`clamp` :ref:`omod` + v_min3_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`vop3_op_sel` + v_min3_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_min3_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`vop3_op_sel` + v_min3_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_min_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_min_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_min_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_min_i16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_min_i32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_min_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_min_u32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mov_b32_e64 :ref:`vdst`, :ref:`src` + v_mov_fed_b32_e64 :ref:`vdst`, :ref:`src` + v_mqsad_pk_u16_u8 :ref:`vdst`::ref:`b64`, :ref:`src0`::ref:`b64`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b64` :ref:`clamp` + v_mqsad_u32_u8 :ref:`vdst`::ref:`b128`, :ref:`src0`::ref:`b64`, :ref:`src1`::ref:`b32`, :ref:`vsrc2`::ref:`b128` :ref:`clamp` + v_msad_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`b32`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b32` :ref:`clamp` + v_mul_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_mul_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mul_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mul_hi_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_hi_i32_i24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_hi_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_hi_u32_u24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_i32_i24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_legacy_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_mul_lo_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_lo_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_mul_u32_u24_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_nop_e64 + v_not_b32_e64 :ref:`vdst`, :ref:`src` + v_or3_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_or_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pack_b32_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`vop3_op_sel` + v_perm_b32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_qsad_pk_u16_u8 :ref:`vdst`::ref:`b64`, :ref:`src0`::ref:`b64`, :ref:`src1`::ref:`b32`, :ref:`src2`::ref:`b64` :ref:`clamp` + v_rcp_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_rcp_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rcp_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rcp_iflag_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_readlane_b32 :ref:`sdst`, :ref:`vsrc0`, :ref:`ssrc1` + v_rndne_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_rndne_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rndne_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rsq_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_rsq_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_rsq_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sad_hi_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`u8x4`, :ref:`src1`::ref:`u8x4`, :ref:`src2`::ref:`u32` :ref:`clamp` + v_sad_u16 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`u16x2`, :ref:`src1`::ref:`u16x2`, :ref:`src2`::ref:`u32` :ref:`clamp` + v_sad_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`clamp` + v_sad_u8 :ref:`vdst`::ref:`u32`, :ref:`src0`::ref:`u8x4`, :ref:`src1`::ref:`u8x4`, :ref:`src2`::ref:`u32` :ref:`clamp` + v_sat_pk_u8_i16_e64 :ref:`vdst`, :ref:`src` + v_screen_partition_4se_b32_e64 :ref:`vdst`, :ref:`src` + v_sin_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_sin_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sqrt_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_sqrt_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sqrt_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_sub_co_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_sub_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_sub_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_sub_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`vop3_op_sel` :ref:`clamp` + v_sub_i32 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_sub_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_sub_u32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_subb_co_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_subbrev_co_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1`, :ref:`ssrc2` + v_subrev_co_u32_e64 :ref:`vdst`, :ref:`sdst`, :ref:`src0`, :ref:`src1` + v_subrev_f16_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` + v_subrev_f32_e64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m` :ref:`clamp` :ref:`omod` + v_subrev_u16_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_subrev_u32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_trig_preop_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`u32` :ref:`clamp` :ref:`omod` + v_trunc_f16_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` + v_trunc_f32_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_trunc_f64_e64 :ref:`vdst`, :ref:`src`::ref:`m` :ref:`clamp` :ref:`omod` + v_writelane_b32 :ref:`vdst`, :ref:`ssrc0`, :ref:`ssrc1` + v_xad_u32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_xor_b32_e64 :ref:`vdst`, :ref:`src0`, :ref:`src1` + +VOP3P +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_mad_mix_f32 :ref:`vdst`, :ref:`src0`::ref:`fx`, :ref:`src1`::ref:`fx`, :ref:`src2`::ref:`fx` :ref:`mad_mix_op_sel` :ref:`mad_mix_op_sel_hi` :ref:`clamp` + v_mad_mixhi_f16 :ref:`vdst`, :ref:`src0`::ref:`fx`, :ref:`src1`::ref:`fx`, :ref:`src2`::ref:`fx` :ref:`mad_mix_op_sel` :ref:`mad_mix_op_sel_hi` :ref:`clamp` + v_mad_mixlo_f16 :ref:`vdst`, :ref:`src0`::ref:`fx`, :ref:`src1`::ref:`fx`, :ref:`src2`::ref:`fx` :ref:`mad_mix_op_sel` :ref:`mad_mix_op_sel_hi` :ref:`clamp` + v_pk_add_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` :ref:`neg_lo` :ref:`neg_hi` :ref:`clamp` + v_pk_add_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` + v_pk_add_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` + v_pk_ashrrev_i16 :ref:`vdst`, :ref:`src0`::ref:`u16x2`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` + v_pk_fma_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`op_sel` :ref:`op_sel_hi` :ref:`neg_lo` :ref:`neg_hi` :ref:`clamp` + v_pk_lshlrev_b16 :ref:`vdst`, :ref:`src0`::ref:`u16x2`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` + v_pk_lshrrev_b16 :ref:`vdst`, :ref:`src0`::ref:`u16x2`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` + v_pk_mad_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` + v_pk_mad_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` + v_pk_max_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` :ref:`neg_lo` :ref:`neg_hi` :ref:`clamp` + v_pk_max_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` + v_pk_max_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` + v_pk_min_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` :ref:`neg_lo` :ref:`neg_hi` :ref:`clamp` + v_pk_min_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` + v_pk_min_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` + v_pk_mul_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` :ref:`neg_lo` :ref:`neg_hi` :ref:`clamp` + v_pk_mul_lo_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` + v_pk_sub_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` + v_pk_sub_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1` :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` + +VOPC +----------------------- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_cmp_class_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmp_class_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`::ref:`b32` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_class_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmp_class_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`::ref:`b32` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_class_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmp_eq_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_eq_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_eq_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_f_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_f_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ge_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ge_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_gt_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_gt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_le_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_le_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lg_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lg_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lg_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_lt_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_lt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ne_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ne_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ne_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ne_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ne_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_neq_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_neq_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_neq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_neq_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_neq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nge_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nge_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nge_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ngt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ngt_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ngt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_ngt_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_ngt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nle_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nle_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nle_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nle_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nle_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlg_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlg_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nlg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlg_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nlg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlt_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nlt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_nlt_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_nlt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_o_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_o_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_o_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_o_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_o_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_t_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_t_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_t_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_t_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_t_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_tru_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_tru_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_tru_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_tru_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_tru_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_u_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_u_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_u_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmp_u_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmp_u_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_class_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmpx_class_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`::ref:`b32` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_class_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmpx_class_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`::ref:`b32` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_class_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1`::ref:`b32` + v_cmpx_eq_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_eq_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_eq_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_f_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_f_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ge_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ge_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_gt_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_gt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_le_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_le_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lg_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lg_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lg_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_lt_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_lt_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ne_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ne_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ne_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ne_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ne_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_neq_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_neq_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_neq_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_neq_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_neq_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nge_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nge_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nge_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nge_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nge_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ngt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ngt_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ngt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_ngt_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_ngt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nle_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nle_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nle_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nle_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nle_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlg_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlg_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nlg_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlg_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nlg_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlt_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlt_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nlt_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_nlt_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_nlt_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_o_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_o_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_o_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_o_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_o_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_i16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_i16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_t_i32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_i32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_t_i64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_u16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_u16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_t_u32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_t_u32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_t_u64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_tru_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_tru_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_tru_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_tru_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_tru_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_u_f16 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_u_f16_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_u_f32 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + v_cmpx_u_f32_sdwa :ref:`vcc`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`src0_sel` :ref:`src1_sel` + v_cmpx_u_f64 :ref:`vcc`, :ref:`src0`, :ref:`vsrc1` + +.. |---| unicode:: U+02014 .. em dash + + +.. toctree:: + :hidden: + + gfx9_attr + gfx9_bimm16 + gfx9_bimm32 + gfx9_fimm16 + gfx9_fimm32 + gfx9_hwreg + gfx9_imm4 + gfx9_label + gfx9_msg + gfx9_param + gfx9_perm_smem + gfx9_simm16 + gfx9_tgt + gfx9_uimm16 + gfx9_waitcnt + gfx9_addr_buf + gfx9_addr_ds + gfx9_addr_flat + gfx9_addr_mimg + gfx9_base_smem_addr + gfx9_base_smem_buf + gfx9_base_smem_scratch + gfx9_data_buf_atomic128 + gfx9_data_buf_atomic32 + gfx9_data_buf_atomic64 + gfx9_data_mimg_atomic_cmp + gfx9_data_mimg_atomic_reg + gfx9_data_mimg_store + gfx9_data_mimg_store_d16 + gfx9_data_smem_atomic128 + gfx9_data_smem_atomic32 + gfx9_data_smem_atomic64 + gfx9_dst_buf_128 + gfx9_dst_buf_32 + gfx9_dst_buf_64 + gfx9_dst_buf_96 + gfx9_dst_buf_lds + gfx9_dst_flat_atomic32 + gfx9_dst_flat_atomic64 + gfx9_dst_mimg_gather4 + gfx9_dst_mimg_regular + gfx9_dst_mimg_regular_d16 + gfx9_offset_buf + gfx9_offset_smem_buf + gfx9_offset_smem_plain + gfx9_rsrc_buf + gfx9_rsrc_mimg + gfx9_saddr_flat_global + gfx9_saddr_flat_scratch + gfx9_samp_mimg + gfx9_sdata128_0 + gfx9_sdata32_0 + gfx9_sdata64_0 + gfx9_sdst128_0 + gfx9_sdst256_0 + gfx9_sdst32_0 + gfx9_sdst32_1 + gfx9_sdst32_2 + gfx9_sdst512_0 + gfx9_sdst64_0 + gfx9_sdst64_1 + gfx9_src32_0 + gfx9_src32_1 + gfx9_src64_0 + gfx9_src64_1 + gfx9_src_exp + gfx9_ssrc32_0 + gfx9_ssrc32_1 + gfx9_ssrc32_2 + gfx9_ssrc32_3 + gfx9_ssrc32_4 + gfx9_ssrc64_0 + gfx9_ssrc64_1 + gfx9_ssrc64_2 + gfx9_ssrc64_3 + gfx9_vaddr_flat_global + gfx9_vaddr_flat_scratch + gfx9_vcc_64 + gfx9_vdata128_0 + gfx9_vdata32_0 + gfx9_vdata64_0 + gfx9_vdata96_0 + gfx9_vdst128_0 + gfx9_vdst32_0 + gfx9_vdst64_0 + gfx9_vdst96_0 + gfx9_vsrc128_0 + gfx9_vsrc32_0 + gfx9_vsrc64_0 + gfx9_mad_type_dev + gfx9_mod_dpp_sdwa_abs_neg + gfx9_mod_sdwa_sext + gfx9_mod_vop3_abs_neg + gfx9_opt + gfx9_ret + gfx9_type_dev diff --git a/docs/AMDGPU/gfx7_addr_buf.rst b/docs/AMDGPU/gfx7_addr_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..22dc7d35338586c44dd3c7a6b632c459e835ea9a --- /dev/null +++ b/docs/AMDGPU/gfx7_addr_buf.rst @@ -0,0 +1,24 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_addr_buf: + +vaddr +=========================== + +This is an optional operand which may specify a 64-bit address, offset and/or index. + +*Size:* 0, 1 or 2 dwords. Size is controlled by modifiers :ref:`addr64`, :ref:`offen` and :ref:`idxen`: + +* If only :ref:`addr64` is specified, this operand supplies a 64-bit address. Size is 2 dwords. +* If only :ref:`idxen` is specified, this operand supplies an index. Size is 1 dword. +* If only :ref:`offen` is specified, this operand supplies an offset. Size is 1 dword. +* If both :ref:`idxen` and :ref:`offen` are specified, index is in the first register and offset is in the second. Size is 2 dwords. +* If none of these modifiers are specified, this operand must be set to :ref:`off`. +* All other combinations of these modifiers are illegal. + +*Operands:* :ref:`v`, :ref:`off` diff --git a/docs/AMDGPU/gfx7_addr_ds.rst b/docs/AMDGPU/gfx7_addr_ds.rst new file mode 100644 index 0000000000000000000000000000000000000000..c9cab7d45ec8ab27a6f27e89373ccb7795c9d0c1 --- /dev/null +++ b/docs/AMDGPU/gfx7_addr_ds.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_addr_ds: + +vaddr +=========================== + +An offset from the start of GDS/LDS memory. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_addr_flat.rst b/docs/AMDGPU/gfx7_addr_flat.rst new file mode 100644 index 0000000000000000000000000000000000000000..93deea6d7a2fee503aa131ff53eaea07794f3a51 --- /dev/null +++ b/docs/AMDGPU/gfx7_addr_flat.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_addr_flat: + +vaddr +=========================== + +A 64-bit flat address. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_addr_mimg.rst b/docs/AMDGPU/gfx7_addr_mimg.rst new file mode 100644 index 0000000000000000000000000000000000000000..76eb4846f1ae137d170ed385e298f590f3d58e55 --- /dev/null +++ b/docs/AMDGPU/gfx7_addr_mimg.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_addr_mimg: + +vaddr +=========================== + +Image address which includes from one to four dimensional coordinates and other data used to locate a position in the image. + +*Size:* 1, 2, 3, 4, 8 or 16 dwords. Actual size depends on opcode and specific image being handled. + + Note 1. Image format and dimensions are encoded in the image resource constant but not in the instruction. + + Note 2. Actually image address size may vary from 1 to 13 dwords, but assembler currently supports a limited range of register sequences. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_attr.rst b/docs/AMDGPU/gfx7_attr.rst new file mode 100644 index 0000000000000000000000000000000000000000..219b77414ab1a9f189a047c45a59e7184246a112 --- /dev/null +++ b/docs/AMDGPU/gfx7_attr.rst @@ -0,0 +1,30 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_attr: + +attr +=========================== + +Interpolation attribute and channel: + + ============== =================================== + Syntax Description + ============== =================================== + attr{0..32}.x Attribute 0..32 with *x* channel. + attr{0..32}.y Attribute 0..32 with *y* channel. + attr{0..32}.z Attribute 0..32 with *z* channel. + attr{0..32}.w Attribute 0..32 with *w* channel. + ============== =================================== + +Examples: + +.. parsed-literal:: + + v_interp_p1_f32 v1, v0, attr0.x + v_interp_p1_f32 v1, v0, attr32.w + diff --git a/docs/AMDGPU/gfx7_base_smem_addr.rst b/docs/AMDGPU/gfx7_base_smem_addr.rst new file mode 100644 index 0000000000000000000000000000000000000000..9cc3cc71fdf7eb7ca939caeefdae1faf8c9ccf3b --- /dev/null +++ b/docs/AMDGPU/gfx7_base_smem_addr.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_base_smem_addr: + +sbase +=========================== + +A 64-bit base address for scalar memory operations. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx7_base_smem_buf.rst b/docs/AMDGPU/gfx7_base_smem_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..416cac715c71b36f86eb6a705f2ddf83824e6d5d --- /dev/null +++ b/docs/AMDGPU/gfx7_base_smem_buf.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_base_smem_buf: + +sbase +=========================== + +A 128-bit buffer resource constant for scalar memory operations which provides a base address, a size and a stride. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx7_bimm16.rst b/docs/AMDGPU/gfx7_bimm16.rst new file mode 100644 index 0000000000000000000000000000000000000000..eb43f9b36a9d533137b3e828a50a122b8eef159d --- /dev/null +++ b/docs/AMDGPU/gfx7_bimm16.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_bimm16: + +imm16 +=========================== + +An :ref:`integer_number`. The value is truncated to 16 bits. + diff --git a/docs/AMDGPU/gfx7_bimm32.rst b/docs/AMDGPU/gfx7_bimm32.rst new file mode 100644 index 0000000000000000000000000000000000000000..4d8f89d7ae39aa2ae59f3cc3abe81bca485ec5e9 --- /dev/null +++ b/docs/AMDGPU/gfx7_bimm32.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_bimm32: + +imm32 +=========================== + +An :ref:`integer_number`. The value is truncated to 32 bits. + diff --git a/docs/AMDGPU/gfx7_data_buf_atomic128.rst b/docs/AMDGPU/gfx7_data_buf_atomic128.rst new file mode 100644 index 0000000000000000000000000000000000000000..33ff26c6c594de28b49b609c205a24f44d38bb35 --- /dev/null +++ b/docs/AMDGPU/gfx7_data_buf_atomic128.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_data_buf_atomic128: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 4 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_data_buf_atomic32.rst b/docs/AMDGPU/gfx7_data_buf_atomic32.rst new file mode 100644 index 0000000000000000000000000000000000000000..df4a6e42ad457f94d7c5c0bbc92649f59474b4bb --- /dev/null +++ b/docs/AMDGPU/gfx7_data_buf_atomic32.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_data_buf_atomic32: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 1 dword by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_data_buf_atomic64.rst b/docs/AMDGPU/gfx7_data_buf_atomic64.rst new file mode 100644 index 0000000000000000000000000000000000000000..4892e41631f4bfe07db0e4326e93ca86d6f24aea --- /dev/null +++ b/docs/AMDGPU/gfx7_data_buf_atomic64.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_data_buf_atomic64: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 2 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_data_mimg_atomic_cmp.rst b/docs/AMDGPU/gfx7_data_mimg_atomic_cmp.rst new file mode 100644 index 0000000000000000000000000000000000000000..82c3337aeb2bde11d64c304dd4db24b4c5de6bb2 --- /dev/null +++ b/docs/AMDGPU/gfx7_data_mimg_atomic_cmp.rst @@ -0,0 +1,27 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_data_mimg_atomic_cmp: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* depends on :ref:`dmask` and :ref:`tfe`: + +* :ref:`dmask` may specify 2 data elements for 32-bit-per-pixel surfaces or 4 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword. +* :ref:`tfe` adds 1 dword if specified. + + Note. The surface data format is indicated in the image resource constant but not in the instruction. + + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_data_mimg_atomic_reg.rst b/docs/AMDGPU/gfx7_data_mimg_atomic_reg.rst new file mode 100644 index 0000000000000000000000000000000000000000..729548dcb87b5718980c61d407f1da3601315828 --- /dev/null +++ b/docs/AMDGPU/gfx7_data_mimg_atomic_reg.rst @@ -0,0 +1,26 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_data_mimg_atomic_reg: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* depends on :ref:`dmask` and :ref:`tfe`: + +* :ref:`dmask` may specify 1 data element for 32-bit-per-pixel surfaces or 2 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword. +* :ref:`tfe` adds 1 dword if specified. + + Note. The surface data format is indicated in the image resource constant but not in the instruction. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_data_mimg_store.rst b/docs/AMDGPU/gfx7_data_mimg_store.rst new file mode 100644 index 0000000000000000000000000000000000000000..1858547a7c0c81a0ff24acb5973c8347e3e1c8fe --- /dev/null +++ b/docs/AMDGPU/gfx7_data_mimg_store.rst @@ -0,0 +1,18 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_data_mimg_store: + +vdata +=========================== + +Image data to store by an *image_store* instruction. + +*Size:* depends on :ref:`dmask` which may specify from 1 to 4 data elements. Each data element occupies 1 dword. + + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_dst_buf_128.rst b/docs/AMDGPU/gfx7_dst_buf_128.rst new file mode 100644 index 0000000000000000000000000000000000000000..701616ed3201edd9d5cd94724ce42acc7016ce30 --- /dev/null +++ b/docs/AMDGPU/gfx7_dst_buf_128.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_dst_buf_128: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +*Size:* 4 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_dst_buf_64.rst b/docs/AMDGPU/gfx7_dst_buf_64.rst new file mode 100644 index 0000000000000000000000000000000000000000..de62a568bf5ff58c03727073e2f6358304840d91 --- /dev/null +++ b/docs/AMDGPU/gfx7_dst_buf_64.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_dst_buf_64: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +*Size:* 2 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_dst_buf_96.rst b/docs/AMDGPU/gfx7_dst_buf_96.rst new file mode 100644 index 0000000000000000000000000000000000000000..1abfcc07aecf289ec3ff81db3e4a5a76eb194956 --- /dev/null +++ b/docs/AMDGPU/gfx7_dst_buf_96.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_dst_buf_96: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +*Size:* 3 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_dst_buf_lds.rst b/docs/AMDGPU/gfx7_dst_buf_lds.rst new file mode 100644 index 0000000000000000000000000000000000000000..435f6bb874b4cb85c205fe872f0f96d983436215 --- /dev/null +++ b/docs/AMDGPU/gfx7_dst_buf_lds.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_dst_buf_lds: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +If :ref:`lds` is specified, this operand is ignored by H/W and data are stored directly into LDS. + +*Size:* 1 dword by default. :ref:`tfe` adds 1 dword if specified. + + Note that :ref:`tfe` and :ref:`lds` cannot be used together. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_dst_flat_atomic32.rst b/docs/AMDGPU/gfx7_dst_flat_atomic32.rst new file mode 100644 index 0000000000000000000000000000000000000000..4a85656ca86deb2e08e2cdbf6b17a23dc22b063d --- /dev/null +++ b/docs/AMDGPU/gfx7_dst_flat_atomic32.rst @@ -0,0 +1,19 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_dst_flat_atomic32: + +vdst +=========================== + +Data returned by a 32-bit atomic flat instruction. + +This is an optional operand. It must be used if and only if :ref:`glc` is specified. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_dst_flat_atomic64.rst b/docs/AMDGPU/gfx7_dst_flat_atomic64.rst new file mode 100644 index 0000000000000000000000000000000000000000..cb1fddd931871cf7cacbcec3476e7ee1c72e5bd4 --- /dev/null +++ b/docs/AMDGPU/gfx7_dst_flat_atomic64.rst @@ -0,0 +1,19 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_dst_flat_atomic64: + +vdst +=========================== + +Data returned by a 64-bit atomic flat instruction. + +This is an optional operand. It must be used if and only if :ref:`glc` is specified. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_dst_mimg_gather4.rst b/docs/AMDGPU/gfx7_dst_mimg_gather4.rst new file mode 100644 index 0000000000000000000000000000000000000000..17fcff5246a363a55a383ddd0774af4a38f7167f --- /dev/null +++ b/docs/AMDGPU/gfx7_dst_mimg_gather4.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_dst_mimg_gather4: + +vdst +=========================== + +Image data to load by an *image_gather4* instruction. + +*Size:* 4 data elements by default. Each data element occupies 1 dword. :ref:`tfe` adds one more dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_dst_mimg_regular.rst b/docs/AMDGPU/gfx7_dst_mimg_regular.rst new file mode 100644 index 0000000000000000000000000000000000000000..aa165b81e62136fb002cf2005fd8eece5b4352f2 --- /dev/null +++ b/docs/AMDGPU/gfx7_dst_mimg_regular.rst @@ -0,0 +1,20 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_dst_mimg_regular: + +vdst +=========================== + +Image data to load by an image instruction. + +*Size:* depends on :ref:`dmask` and :ref:`tfe`: + +* :ref:`dmask` may specify from 1 to 4 data elements. Each data element occupies 1 dword. +* :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_fimm32.rst b/docs/AMDGPU/gfx7_fimm32.rst new file mode 100644 index 0000000000000000000000000000000000000000..70c81891e9784d81415f5712c8f23e55336027bc --- /dev/null +++ b/docs/AMDGPU/gfx7_fimm32.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_fimm32: + +imm32 +=========================== + +An :ref:`integer_number` or a :ref:`floating-point_number`. The value is converted to *f32* as described :ref:`here`. + diff --git a/docs/AMDGPU/gfx7_hwreg.rst b/docs/AMDGPU/gfx7_hwreg.rst new file mode 100644 index 0000000000000000000000000000000000000000..1e2d96417c33985188f2bfd25e6af485390364cc --- /dev/null +++ b/docs/AMDGPU/gfx7_hwreg.rst @@ -0,0 +1,60 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_hwreg: + +hwreg +=========================== + +Bits of a hardware register being accessed. + +The bits of this operand have the following meaning: + + ============ =================================== + Bits Description + ============ =================================== + 5:0 Register *id*. + 10:6 First bit *offset* (0..31). + 15:11 *Size* in bits (1..32). + ============ =================================== + +This operand may be specified as a positive 16-bit :ref:`integer_number` or using the syntax described below. + + ==================================== ============================================================================ + Syntax Description + ==================================== ============================================================================ + hwreg({0..63}) All bits of a register indicated by its *id*. + hwreg(<*name*>) All bits of a register indicated by its *name*. + hwreg({0..63}, {0..31}, {1..32}) Register bits indicated by register *id*, first bit *offset* and *size*. + hwreg(<*name*>, {0..31}, {1..32}) Register bits indicated by register *name*, first bit *offset* and *size*. + ==================================== ============================================================================ + +Register *id*, *offset* and *size* must be specified as positive :ref:`integer numbers`. + +Defined register *names* include: + + =================== ========================================== + Name Description + =================== ========================================== + HW_REG_MODE Shader writeable mode bits. + HW_REG_STATUS Shader read-only status. + HW_REG_TRAPSTS Trap status. + HW_REG_HW_ID Id of wave, simd, compute unit, etc. + HW_REG_GPR_ALLOC Per-wave SGPR and VGPR allocation. + HW_REG_LDS_ALLOC Per-wave LDS allocation. + HW_REG_IB_STS Counters of outstanding instructions. + =================== ========================================== + +Examples: + +.. parsed-literal:: + + s_getreg_b32 s2, 0x6 + s_getreg_b32 s2, hwreg(15) + s_getreg_b32 s2, hwreg(51, 1, 31) + s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) + diff --git a/docs/AMDGPU/gfx7_label.rst b/docs/AMDGPU/gfx7_label.rst new file mode 100644 index 0000000000000000000000000000000000000000..ed2f3a41666771a1e80b7d57e4e7875ab5b9095a --- /dev/null +++ b/docs/AMDGPU/gfx7_label.rst @@ -0,0 +1,30 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_label: + +label +=========================== + +A branch target which is a 16-bit signed integer treated as a PC-relative dword offset. + +This operand may be specified as: + +* An :ref:`integer_number`. The number is truncated to 16 bits. +* An :ref:`absolute_expression` which must start with an :ref:`integer_number`. The value of the expression is truncated to 16 bits. +* A :ref:`symbol` (for example, a label). The value is handled as a 16-bit PC-relative dword offset to be resolved by a linker. + +Examples: + +.. parsed-literal:: + + offset = 30 + s_branch loop_end + s_branch 2 + offset + s_branch 32 + loop_end: + diff --git a/docs/AMDGPU/gfx7_mod.rst b/docs/AMDGPU/gfx7_mod.rst new file mode 100644 index 0000000000000000000000000000000000000000..fcaa6caf9247cf8624688df4b32242f8100ddddd --- /dev/null +++ b/docs/AMDGPU/gfx7_mod.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_mod: + +m +=========================== + +This operand may be used with floating point operand modifiers :ref:`abs` and :ref:`neg`. + diff --git a/docs/AMDGPU/gfx7_msg.rst b/docs/AMDGPU/gfx7_msg.rst new file mode 100644 index 0000000000000000000000000000000000000000..5476053ccc1cdab76b2d1f7b5bad1e755eb7b6c7 --- /dev/null +++ b/docs/AMDGPU/gfx7_msg.rst @@ -0,0 +1,72 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_msg: + +msg +=========================== + +A 16-bit message code. The bits of this operand have the following meaning: + + ============ ====================================================== + Bits Description + ============ ====================================================== + 3:0 Message *type*. + 6:4 Optional *operation*. + 9:7 Optional *parameters*. + 15:10 Unused. + ============ ====================================================== + +This operand may be specified as a positive 16-bit :ref:`integer_number` or using the syntax described below: + + ======================================== ======================================================================== + Syntax Description + ======================================== ======================================================================== + sendmsg(<*type*>) A message identified by its *type*. + sendmsg(<*type*>, <*op*>) A message identified by its *type* and *operation*. + sendmsg(<*type*>, <*op*>, <*stream*>) A message identified by its *type* and *operation* with a stream *id*. + ======================================== ======================================================================== + +*Type* may be specified using message *name* or message *id*. + +*Op* may be specified using operation *name* or operation *id*. + +Stream *id* is an integer in the range 0..3. + +Message *id*, operation *id* and stream *id* must be specified as positive :ref:`integer numbers`. + +Each message type supports specific operations: + + ================= ========== ============================== ============ ========== + Message name Message Id Supported Operations Operation Id Stream Id + ================= ========== ============================== ============ ========== + MSG_INTERRUPT 1 \- \- \- + MSG_GS 2 GS_OP_CUT 1 Optional + \ GS_OP_EMIT 2 Optional + \ GS_OP_EMIT_CUT 3 Optional + MSG_GS_DONE 3 GS_OP_NOP 0 \- + \ GS_OP_CUT 1 Optional + \ GS_OP_EMIT 2 Optional + \ GS_OP_EMIT_CUT 3 Optional + MSG_SYSMSG 15 SYSMSG_OP_ECC_ERR_INTERRUPT 1 \- + \ SYSMSG_OP_REG_RD 2 \- + \ SYSMSG_OP_HOST_TRAP_ACK 3 \- + \ SYSMSG_OP_TTRACE_PC 4 \- + ================= ========== ============================== ============ ========== + +Examples: + +.. parsed-literal:: + + s_sendmsg 0x12 + s_sendmsg sendmsg(MSG_INTERRUPT) + s_sendmsg sendmsg(2, GS_OP_CUT) + s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT) + s_sendmsg sendmsg(MSG_GS, 2) + s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_EMIT_CUT, 1) + s_sendmsg sendmsg(MSG_SYSMSG, SYSMSG_OP_TTRACE_PC) + diff --git a/docs/AMDGPU/gfx7_offset_buf.rst b/docs/AMDGPU/gfx7_offset_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..c36df06a7bbfbe2a8f57092925e4315eca401220 --- /dev/null +++ b/docs/AMDGPU/gfx7_offset_buf.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_offset_buf: + +soffset +=========================== + +An unsigned byte offset. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx7_offset_smem.rst b/docs/AMDGPU/gfx7_offset_smem.rst new file mode 100644 index 0000000000000000000000000000000000000000..85ed5f186737624ec5262e547f55b182acf05d9a --- /dev/null +++ b/docs/AMDGPU/gfx7_offset_smem.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_offset_smem: + +soffset +=========================== + +An unsigned offset added to the base address to get memory address. + +* If offset is specified as a register, it supplies an unsigned byte offset but 2 lsb's are ignored. +* If offset is specified as an :ref:`uimm32`, it supplies a 32-bit unsigned byte offset but 2 lsb's are ignored. +* If offset is specified as an :ref:`uimm8`, it supplies an 8-bit unsigned dword offset. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`uimm8`, :ref:`uimm32` diff --git a/docs/AMDGPU/gfx7_opt.rst b/docs/AMDGPU/gfx7_opt.rst new file mode 100644 index 0000000000000000000000000000000000000000..1a48733dda976a0f0ae7bc7e942f04c658766949 --- /dev/null +++ b/docs/AMDGPU/gfx7_opt.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_opt: + +opt +=========================== + +This is an optional operand. It must be used if and only if :ref:`glc` is specified. + diff --git a/docs/AMDGPU/gfx7_param.rst b/docs/AMDGPU/gfx7_param.rst new file mode 100644 index 0000000000000000000000000000000000000000..13e533b3b288a4c18bcd67268f9c0d539c739eac --- /dev/null +++ b/docs/AMDGPU/gfx7_param.rst @@ -0,0 +1,22 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_param: + +param +=========================== + +Interpolation parameter to read: + + ============ =================================== + Syntax Description + ============ =================================== + p0 Parameter *P0*. + p10 Parameter *P10*. + p20 Parameter *P20*. + ============ =================================== + diff --git a/docs/AMDGPU/gfx7_ret.rst b/docs/AMDGPU/gfx7_ret.rst new file mode 100644 index 0000000000000000000000000000000000000000..25301c2cde8ffdeebbb98374b5a199d17194e2e9 --- /dev/null +++ b/docs/AMDGPU/gfx7_ret.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_ret: + +dst +=========================== + +This is an input operand. It may optionally serve as a destination if :ref:`glc` is specified. + diff --git a/docs/AMDGPU/gfx7_rsrc_buf.rst b/docs/AMDGPU/gfx7_rsrc_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..7ebcebc68efeb1602b0f3619d668392bd4a1df76 --- /dev/null +++ b/docs/AMDGPU/gfx7_rsrc_buf.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_rsrc_buf: + +srsrc +=========================== + +Buffer resource constant which defines the address and characteristics of the buffer in memory. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx7_rsrc_mimg.rst b/docs/AMDGPU/gfx7_rsrc_mimg.rst new file mode 100644 index 0000000000000000000000000000000000000000..b0e40fe09cc61e033cda66e61fecfbdbe45c23c9 --- /dev/null +++ b/docs/AMDGPU/gfx7_rsrc_mimg.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_rsrc_mimg: + +srsrc +=========================== + +Image resource constant which defines the location of the image buffer in memory, its dimensions, tiling, and data format. + +*Size:* 8 dwords by default, 4 dwords if :ref:`r128` is specified. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx7_samp_mimg.rst b/docs/AMDGPU/gfx7_samp_mimg.rst new file mode 100644 index 0000000000000000000000000000000000000000..738cad415fef6723f936de1ea361a23a8110b587 --- /dev/null +++ b/docs/AMDGPU/gfx7_samp_mimg.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_samp_mimg: + +ssamp +=========================== + +Sampler constant used to specify filtering options applied to the image data after it is read. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx7_sdst128_0.rst b/docs/AMDGPU/gfx7_sdst128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..735a30cefd62173a69062256666e80bc4a3f3fb8 --- /dev/null +++ b/docs/AMDGPU/gfx7_sdst128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_sdst128_0: + +sdst +=========================== + +Instruction output. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx7_sdst256_0.rst b/docs/AMDGPU/gfx7_sdst256_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..c4e6f9fcf6627f34487021b4ddd6eb8f12eab2d9 --- /dev/null +++ b/docs/AMDGPU/gfx7_sdst256_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_sdst256_0: + +sdst +=========================== + +Instruction output. + +*Size:* 8 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx7_sdst32_0.rst b/docs/AMDGPU/gfx7_sdst32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..183d89f9e906e307aa219ec72aabadc807e3243f --- /dev/null +++ b/docs/AMDGPU/gfx7_sdst32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_sdst32_0: + +sdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx7_sdst32_1.rst b/docs/AMDGPU/gfx7_sdst32_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..2e49e20f9d5c9cbded58a479aaddcfafc623058a --- /dev/null +++ b/docs/AMDGPU/gfx7_sdst32_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_sdst32_1: + +sdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec` diff --git a/docs/AMDGPU/gfx7_sdst32_2.rst b/docs/AMDGPU/gfx7_sdst32_2.rst new file mode 100644 index 0000000000000000000000000000000000000000..8212e5d7a6120360569d3a4c1ed26f3a146a6a31 --- /dev/null +++ b/docs/AMDGPU/gfx7_sdst32_2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_sdst32_2: + +sdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`trap` diff --git a/docs/AMDGPU/gfx7_sdst512_0.rst b/docs/AMDGPU/gfx7_sdst512_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..d8c64b1e395c9d307c2971c71f2ed6be7de0911b --- /dev/null +++ b/docs/AMDGPU/gfx7_sdst512_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_sdst512_0: + +sdst +=========================== + +Instruction output. + +*Size:* 16 dwords. + +*Operands:* :ref:`s` diff --git a/docs/AMDGPU/gfx7_sdst64_0.rst b/docs/AMDGPU/gfx7_sdst64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..af608801b390130c27305246f0cf97fef913b035 --- /dev/null +++ b/docs/AMDGPU/gfx7_sdst64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_sdst64_0: + +sdst +=========================== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx7_sdst64_1.rst b/docs/AMDGPU/gfx7_sdst64_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..207df73932cae9d9157fef5c03a444486fec7ec9 --- /dev/null +++ b/docs/AMDGPU/gfx7_sdst64_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_sdst64_1: + +sdst +=========================== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`exec` diff --git a/docs/AMDGPU/gfx7_simm16.rst b/docs/AMDGPU/gfx7_simm16.rst new file mode 100644 index 0000000000000000000000000000000000000000..66e560ecec8c368706814af1afc3c5e543148099 --- /dev/null +++ b/docs/AMDGPU/gfx7_simm16.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_simm16: + +imm16 +=========================== + +An :ref:`integer_number`. The value is truncated to 16 bits and then sign-extended to 32 bits. + diff --git a/docs/AMDGPU/gfx7_src32_0.rst b/docs/AMDGPU/gfx7_src32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..22ff73ddf47d9991073d2a3dccfbb74ac28ddaf5 --- /dev/null +++ b/docs/AMDGPU/gfx7_src32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_src32_0: + +src +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx7_src32_1.rst b/docs/AMDGPU/gfx7_src32_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..00594597ff17fcbe5f5b800f5cfe875544875008 --- /dev/null +++ b/docs/AMDGPU/gfx7_src32_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_src32_1: + +src +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec`, :ref:`iconst` diff --git a/docs/AMDGPU/gfx7_src32_2.rst b/docs/AMDGPU/gfx7_src32_2.rst new file mode 100644 index 0000000000000000000000000000000000000000..b939c45603489b58cc41f0a0d68324e82776f8fc --- /dev/null +++ b/docs/AMDGPU/gfx7_src32_2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_src32_2: + +src +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx7_src32_3.rst b/docs/AMDGPU/gfx7_src32_3.rst new file mode 100644 index 0000000000000000000000000000000000000000..83aa9ca7199261e8b68f3d786d6ee9b2a709b872 --- /dev/null +++ b/docs/AMDGPU/gfx7_src32_3.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_src32_3: + +src +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec` diff --git a/docs/AMDGPU/gfx7_src64_0.rst b/docs/AMDGPU/gfx7_src64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..a19b6ee8bd5aa94e23265431669b3b76600f39b7 --- /dev/null +++ b/docs/AMDGPU/gfx7_src64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_src64_0: + +src +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx7_src64_1.rst b/docs/AMDGPU/gfx7_src64_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..c81864c9573fb4cef9e0a8f9c77761e8be5dd5dd --- /dev/null +++ b/docs/AMDGPU/gfx7_src64_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_src64_1: + +src +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx7_src64_2.rst b/docs/AMDGPU/gfx7_src64_2.rst new file mode 100644 index 0000000000000000000000000000000000000000..189245e548df362bfedf88eec48ababd25e797fc --- /dev/null +++ b/docs/AMDGPU/gfx7_src64_2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_src64_2: + +src +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`exec`, :ref:`iconst` diff --git a/docs/AMDGPU/gfx7_src_exp.rst b/docs/AMDGPU/gfx7_src_exp.rst new file mode 100644 index 0000000000000000000000000000000000000000..32f71a88b645329dbd2e1a2a3c1a2389773a4a83 --- /dev/null +++ b/docs/AMDGPU/gfx7_src_exp.rst @@ -0,0 +1,28 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_src_exp: + +vsrc +=========================== + +Data to copy to export buffers. This is an optional operand. Must be specified as :ref:`off` if not used. + +:ref:`compr` modifier indicates use of compressed (16-bit) data. This limits number of source operands from 4 to 2: + +* src0 and src1 must specify the first register (or :ref:`off`). +* src2 and src3 must specify the second register (or :ref:`off`). + +An example: + +.. parsed-literal:: + + exp mrtz v3, v3, off, off compr + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`off` diff --git a/docs/AMDGPU/gfx7_ssrc32_0.rst b/docs/AMDGPU/gfx7_ssrc32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..843db249713caf0177c44d131fb031cf2635b5ad --- /dev/null +++ b/docs/AMDGPU/gfx7_ssrc32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_ssrc32_0: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx7_ssrc32_1.rst b/docs/AMDGPU/gfx7_ssrc32_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..6e626d8367329ad3121a7087101bdb1c0b0aed78 --- /dev/null +++ b/docs/AMDGPU/gfx7_ssrc32_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_ssrc32_1: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec` diff --git a/docs/AMDGPU/gfx7_ssrc32_2.rst b/docs/AMDGPU/gfx7_ssrc32_2.rst new file mode 100644 index 0000000000000000000000000000000000000000..c7ff032f880039dce64731464db3cdcd9bb46993 --- /dev/null +++ b/docs/AMDGPU/gfx7_ssrc32_2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_ssrc32_2: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx7_ssrc32_3.rst b/docs/AMDGPU/gfx7_ssrc32_3.rst new file mode 100644 index 0000000000000000000000000000000000000000..68a24153b6887fb11244b8ca9414821822fa83cc --- /dev/null +++ b/docs/AMDGPU/gfx7_ssrc32_3.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_ssrc32_3: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec`, :ref:`iconst`, :ref:`literal` diff --git a/docs/AMDGPU/gfx7_ssrc32_4.rst b/docs/AMDGPU/gfx7_ssrc32_4.rst new file mode 100644 index 0000000000000000000000000000000000000000..669ae4e7e2d03a0724b42ee71c0e178794464e51 --- /dev/null +++ b/docs/AMDGPU/gfx7_ssrc32_4.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_ssrc32_4: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`iconst` diff --git a/docs/AMDGPU/gfx7_ssrc64_0.rst b/docs/AMDGPU/gfx7_ssrc64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..283e7ea63dac08f2a29708efe9272d33f79f1af4 --- /dev/null +++ b/docs/AMDGPU/gfx7_ssrc64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_ssrc64_0: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx7_ssrc64_1.rst b/docs/AMDGPU/gfx7_ssrc64_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..42dc8c5690611b60f8f020132f85c9a28b4ef2aa --- /dev/null +++ b/docs/AMDGPU/gfx7_ssrc64_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_ssrc64_1: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx7_ssrc64_2.rst b/docs/AMDGPU/gfx7_ssrc64_2.rst new file mode 100644 index 0000000000000000000000000000000000000000..344147fdd279d47730f9668ee5421121954ba75e --- /dev/null +++ b/docs/AMDGPU/gfx7_ssrc64_2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_ssrc64_2: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx7_ssrc64_3.rst b/docs/AMDGPU/gfx7_ssrc64_3.rst new file mode 100644 index 0000000000000000000000000000000000000000..173f550d4ec9c0aea67bc33a6fc00ce78ed26928 --- /dev/null +++ b/docs/AMDGPU/gfx7_ssrc64_3.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_ssrc64_3: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`vcc`, :ref:`trap`, :ref:`exec` diff --git a/docs/AMDGPU/gfx7_tgt.rst b/docs/AMDGPU/gfx7_tgt.rst new file mode 100644 index 0000000000000000000000000000000000000000..c407c0c56bbe6618c291b7c165f729469e2cd8b1 --- /dev/null +++ b/docs/AMDGPU/gfx7_tgt.rst @@ -0,0 +1,24 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_tgt: + +tgt +=========================== + +An export target: + + ============== =================================== + Syntax Description + ============== =================================== + pos{0..3} Copy vertex position 0..3. + param{0..31} Copy vertex parameter 0..31. + mrt{0..7} Copy pixel color to the MRTs 0..7. + mrtz Copy pixel depth (Z) data. + null Copy nothing. + ============== =================================== + diff --git a/docs/AMDGPU/gfx7_type_dev.rst b/docs/AMDGPU/gfx7_type_dev.rst new file mode 100644 index 0000000000000000000000000000000000000000..6eab0e1b3964bdc30aca6ab2b9c2073bce739b83 --- /dev/null +++ b/docs/AMDGPU/gfx7_type_dev.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_type_dev: + +Type deviation +=========================== + +*Type* of this operand differs from *type* :ref:`implied by the opcode`. This tag specifies actual operand *type*. + diff --git a/docs/AMDGPU/gfx7_uimm16.rst b/docs/AMDGPU/gfx7_uimm16.rst new file mode 100644 index 0000000000000000000000000000000000000000..bd0d4c2fb14421dc679b1e28d2542c440e5f3c91 --- /dev/null +++ b/docs/AMDGPU/gfx7_uimm16.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_uimm16: + +imm16 +=========================== + +An :ref:`integer_number`. The value is truncated to 16 bits and then zero-extended to 32 bits. + diff --git a/docs/AMDGPU/gfx7_vcc_64.rst b/docs/AMDGPU/gfx7_vcc_64.rst new file mode 100644 index 0000000000000000000000000000000000000000..b1285e0e8b8d5e08cb7af465f4a0a37e0de3ccb0 --- /dev/null +++ b/docs/AMDGPU/gfx7_vcc_64.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vcc_64: + +vcc +=========================== + +Vector condition code. + +*Size:* 2 dwords. + +*Operands:* :ref:`vcc` diff --git a/docs/AMDGPU/gfx7_vdata128_0.rst b/docs/AMDGPU/gfx7_vdata128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..5ed2b823bf7479c3500f69b62df58c825b11877f --- /dev/null +++ b/docs/AMDGPU/gfx7_vdata128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vdata128_0: + +vdata +=========================== + +Instruction input. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_vdata32_0.rst b/docs/AMDGPU/gfx7_vdata32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..1615abb4f6c3e50a02b0f8b1dffed8e86d7beb80 --- /dev/null +++ b/docs/AMDGPU/gfx7_vdata32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vdata32_0: + +vdata +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_vdata64_0.rst b/docs/AMDGPU/gfx7_vdata64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..fceea9fd4b185b3fee3ed84bdbbcfa9001f07f2a --- /dev/null +++ b/docs/AMDGPU/gfx7_vdata64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vdata64_0: + +vdata +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_vdata96_0.rst b/docs/AMDGPU/gfx7_vdata96_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..b9fe5996e53565de5ad8fb85a8b54454c441d1c6 --- /dev/null +++ b/docs/AMDGPU/gfx7_vdata96_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vdata96_0: + +vdata +=========================== + +Instruction input. + +*Size:* 3 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_vdst128_0.rst b/docs/AMDGPU/gfx7_vdst128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..c18652e88419be12651718543f973ddc3a284af9 --- /dev/null +++ b/docs/AMDGPU/gfx7_vdst128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vdst128_0: + +vdst +=========================== + +Instruction output. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_vdst32_0.rst b/docs/AMDGPU/gfx7_vdst32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..e93620355e6d77c865921d29006d1f300a6ae30e --- /dev/null +++ b/docs/AMDGPU/gfx7_vdst32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vdst32_0: + +vdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_vdst64_0.rst b/docs/AMDGPU/gfx7_vdst64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..4cacaa0d63f10897f32ab4d620b15b363c0ae969 --- /dev/null +++ b/docs/AMDGPU/gfx7_vdst64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vdst64_0: + +vdst +=========================== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_vdst96_0.rst b/docs/AMDGPU/gfx7_vdst96_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..3c5bf880ce99143146fea552d3e54dc7028e436a --- /dev/null +++ b/docs/AMDGPU/gfx7_vdst96_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vdst96_0: + +vdst +=========================== + +Instruction output. + +*Size:* 3 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_vsrc128_0.rst b/docs/AMDGPU/gfx7_vsrc128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..975237916bb6c6c8cde84d2b1060851f3185c48d --- /dev/null +++ b/docs/AMDGPU/gfx7_vsrc128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vsrc128_0: + +vsrc +=========================== + +Instruction input. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_vsrc32_0.rst b/docs/AMDGPU/gfx7_vsrc32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..93b12b00a1c74f1fbe198d87a668bf628a011354 --- /dev/null +++ b/docs/AMDGPU/gfx7_vsrc32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vsrc32_0: + +vsrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_vsrc64_0.rst b/docs/AMDGPU/gfx7_vsrc64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..d8c9d45ddb5fa43f73031e3a6b334c95e661a96a --- /dev/null +++ b/docs/AMDGPU/gfx7_vsrc64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_vsrc64_0: + +vsrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx7_waitcnt.rst b/docs/AMDGPU/gfx7_waitcnt.rst new file mode 100644 index 0000000000000000000000000000000000000000..3f5e07d164934ea3e92177d6c81d921e32c147d4 --- /dev/null +++ b/docs/AMDGPU/gfx7_waitcnt.rst @@ -0,0 +1,55 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid7_waitcnt: + +waitcnt +=========================== + +Counts of outstanding instructions to wait for. + +The bits of this operand have the following meaning: + + ============ ====================================================== + Bits Description + ============ ====================================================== + 3:0 VM_CNT: vector memory operations count. + 6:4 EXP_CNT: export count. + 12:8 LGKM_CNT: LDS, GDS, Constant and Message count. + ============ ====================================================== + +This operand may be specified as a positive 16-bit :ref:`integer_number` +or as a combination of the following symbolic helpers: + + ====================== ====================================================================== + Syntax Description + ====================== ====================================================================== + vmcnt(<*N*>) VM_CNT value. *N* must not exceed the largest VM_CNT value. + expcnt(<*N*>) EXP_CNT value. *N* must not exceed the largest EXP_CNT value. + lgkmcnt(<*N*>) LGKM_CNT value. *N* must not exceed the largest LGKM_CNT value. + vmcnt_sat(<*N*>) VM_CNT value computed as min(*N*, the largest VM_CNT value). + expcnt_sat(<*N*>) EXP_CNT value computed as min(*N*, the largest EXP_CNT value). + lgkmcnt_sat(<*N*>) LGKM_CNT value computed as min(*N*, the largest LGKM_CNT value). + ====================== ====================================================================== + +These helpers may be specified in any order. Ampersands and commas may be used as optional separators. + +*N* is either an +:ref:`integer number` or an +:ref:`absolute expression`. + +Examples: + +.. parsed-literal:: + + s_waitcnt 0 + s_waitcnt vmcnt(1) + s_waitcnt expcnt(2) lgkmcnt(3) + s_waitcnt vmcnt(1) expcnt(2) lgkmcnt(3) + s_waitcnt vmcnt(1), expcnt(2), lgkmcnt(3) + s_waitcnt vmcnt(1) & lgkmcnt_sat(100) & expcnt(2) + diff --git a/docs/AMDGPU/gfx8_addr_buf.rst b/docs/AMDGPU/gfx8_addr_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..74aa275c19973bcc19576524b00400897a2ad40e --- /dev/null +++ b/docs/AMDGPU/gfx8_addr_buf.rst @@ -0,0 +1,22 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_addr_buf: + +vaddr +=========================== + +This is an optional operand which may specify offset and/or index. + +*Size:* 0, 1 or 2 dwords. Size is controlled by modifiers :ref:`offen` and :ref:`idxen`: + +* If only :ref:`idxen` is specified, this operand supplies an index. Size is 1 dword. +* If only :ref:`offen` is specified, this operand supplies an offset. Size is 1 dword. +* If both modifiers are specified, index is in the first register and offset is in the second. Size is 2 dwords. +* If none of these modifiers are specified, this operand must be set to :ref:`off`. + +*Operands:* :ref:`v`, :ref:`off` diff --git a/docs/AMDGPU/gfx8_addr_ds.rst b/docs/AMDGPU/gfx8_addr_ds.rst new file mode 100644 index 0000000000000000000000000000000000000000..7115ff0cf04337c4284e64cf73b024b47ce1d0e2 --- /dev/null +++ b/docs/AMDGPU/gfx8_addr_ds.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_addr_ds: + +vaddr +=========================== + +An offset from the start of GDS/LDS memory. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_addr_flat.rst b/docs/AMDGPU/gfx8_addr_flat.rst new file mode 100644 index 0000000000000000000000000000000000000000..53dfcc3e895094150ac1a0f4664d33ee1c0e35be --- /dev/null +++ b/docs/AMDGPU/gfx8_addr_flat.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_addr_flat: + +vaddr +=========================== + +A 64-bit flat address. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_addr_mimg.rst b/docs/AMDGPU/gfx8_addr_mimg.rst new file mode 100644 index 0000000000000000000000000000000000000000..f1052badacdac84c13926d2e1a92ba2918b663a2 --- /dev/null +++ b/docs/AMDGPU/gfx8_addr_mimg.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_addr_mimg: + +vaddr +=========================== + +Image address which includes from one to four dimensional coordinates and other data used to locate a position in the image. + +*Size:* 1, 2, 3, 4, 8 or 16 dwords. Actual size depends on opcode and specific image being handled. + + Note 1. Image format and dimensions are encoded in the image resource constant but not in the instruction. + + Note 2. Actually image address size may vary from 1 to 13 dwords, but assembler currently supports a limited range of register sequences. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_attr.rst b/docs/AMDGPU/gfx8_attr.rst new file mode 100644 index 0000000000000000000000000000000000000000..12fa2cde842271f6cfddf088a9943a622120c89f --- /dev/null +++ b/docs/AMDGPU/gfx8_attr.rst @@ -0,0 +1,30 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_attr: + +attr +=========================== + +Interpolation attribute and channel: + + ============== =================================== + Syntax Description + ============== =================================== + attr{0..32}.x Attribute 0..32 with *x* channel. + attr{0..32}.y Attribute 0..32 with *y* channel. + attr{0..32}.z Attribute 0..32 with *z* channel. + attr{0..32}.w Attribute 0..32 with *w* channel. + ============== =================================== + +Examples: + +.. parsed-literal:: + + v_interp_p1_f32 v1, v0, attr0.x + v_interp_p1_f32 v1, v0, attr32.w + diff --git a/docs/AMDGPU/gfx8_base_smem_addr.rst b/docs/AMDGPU/gfx8_base_smem_addr.rst new file mode 100644 index 0000000000000000000000000000000000000000..81ef25586d6e1f0a82ac741a3dc3f7d03a3286e8 --- /dev/null +++ b/docs/AMDGPU/gfx8_base_smem_addr.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_base_smem_addr: + +sbase +=========================== + +A 64-bit base address for scalar memory operations. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx8_base_smem_buf.rst b/docs/AMDGPU/gfx8_base_smem_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..fb243d0f262f8ef8b98844e1bbaa8a246a168214 --- /dev/null +++ b/docs/AMDGPU/gfx8_base_smem_buf.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_base_smem_buf: + +sbase +=========================== + +A 128-bit buffer resource constant for scalar memory operations which provides a base address, a size and a stride. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx8_bimm16.rst b/docs/AMDGPU/gfx8_bimm16.rst new file mode 100644 index 0000000000000000000000000000000000000000..ed50e5582329ac480b5ea62849250a4812d4a351 --- /dev/null +++ b/docs/AMDGPU/gfx8_bimm16.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_bimm16: + +imm16 +=========================== + +An :ref:`integer_number`. The value is truncated to 16 bits. + diff --git a/docs/AMDGPU/gfx8_bimm32.rst b/docs/AMDGPU/gfx8_bimm32.rst new file mode 100644 index 0000000000000000000000000000000000000000..d03c27b1732b8267c7931c67eca75f8e60f1d3a1 --- /dev/null +++ b/docs/AMDGPU/gfx8_bimm32.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_bimm32: + +imm32 +=========================== + +An :ref:`integer_number`. The value is truncated to 32 bits. + diff --git a/docs/AMDGPU/gfx8_data_buf_atomic128.rst b/docs/AMDGPU/gfx8_data_buf_atomic128.rst new file mode 100644 index 0000000000000000000000000000000000000000..40b6d3a236e86c09503048f4c418adfb524e9ef3 --- /dev/null +++ b/docs/AMDGPU/gfx8_data_buf_atomic128.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_data_buf_atomic128: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 4 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_data_buf_atomic32.rst b/docs/AMDGPU/gfx8_data_buf_atomic32.rst new file mode 100644 index 0000000000000000000000000000000000000000..51121820d740423e762ade9731c14130cc421106 --- /dev/null +++ b/docs/AMDGPU/gfx8_data_buf_atomic32.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_data_buf_atomic32: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 1 dword by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_data_buf_atomic64.rst b/docs/AMDGPU/gfx8_data_buf_atomic64.rst new file mode 100644 index 0000000000000000000000000000000000000000..107998e37d9317c0d0ea6f4b9bfba6eba49aa4b1 --- /dev/null +++ b/docs/AMDGPU/gfx8_data_buf_atomic64.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_data_buf_atomic64: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 2 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_data_buf_d16_128.rst b/docs/AMDGPU/gfx8_data_buf_d16_128.rst new file mode 100644 index 0000000000000000000000000000000000000000..3c98a360252e34b71715cdc1fbd77ea667eea404 --- /dev/null +++ b/docs/AMDGPU/gfx8_data_buf_d16_128.rst @@ -0,0 +1,20 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_data_buf_d16_128: + +vdata +=========================== + +16-bit data to store by a buffer instruction. + +*Size:* depends on GFX8 GPU revision: + +* 4 dwords for GFX8.0. This H/W supports no packing. +* 2 dwords for GFX8.1+. This H/W supports data packing. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_data_buf_d16_32.rst b/docs/AMDGPU/gfx8_data_buf_d16_32.rst new file mode 100644 index 0000000000000000000000000000000000000000..64328756f84c10335aee0edcb23aa722a6b65205 --- /dev/null +++ b/docs/AMDGPU/gfx8_data_buf_d16_32.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_data_buf_d16_32: + +vdata +=========================== + +16-bit data to store by a buffer instruction. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_data_buf_d16_64.rst b/docs/AMDGPU/gfx8_data_buf_d16_64.rst new file mode 100644 index 0000000000000000000000000000000000000000..932ce6920fca17a18b848407e8316b7982cf1dd8 --- /dev/null +++ b/docs/AMDGPU/gfx8_data_buf_d16_64.rst @@ -0,0 +1,20 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_data_buf_d16_64: + +vdata +=========================== + +16-bit data to store by a buffer instruction. + +*Size:* depends on GFX8 GPU revision: + +* 2 dwords for GFX8.0. This H/W supports no packing. +* 1 dword for GFX8.1+. This H/W supports data packing. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_data_buf_d16_96.rst b/docs/AMDGPU/gfx8_data_buf_d16_96.rst new file mode 100644 index 0000000000000000000000000000000000000000..b9e6915229fdc404d3871ff7e2ecb2217121dab9 --- /dev/null +++ b/docs/AMDGPU/gfx8_data_buf_d16_96.rst @@ -0,0 +1,20 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_data_buf_d16_96: + +vdata +=========================== + +16-bit data to store by a buffer instruction. + +*Size:* depends on GFX8 GPU revision: + +* 3 dwords for GFX8.0. This H/W supports no packing. +* 2 dwords for GFX8.1+. This H/W supports data packing. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_data_mimg_atomic_cmp.rst b/docs/AMDGPU/gfx8_data_mimg_atomic_cmp.rst new file mode 100644 index 0000000000000000000000000000000000000000..80222ead81a83a6f5f84e2011205db89d88f13aa --- /dev/null +++ b/docs/AMDGPU/gfx8_data_mimg_atomic_cmp.rst @@ -0,0 +1,27 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_data_mimg_atomic_cmp: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* depends on :ref:`dmask` and :ref:`tfe`: + +* :ref:`dmask` may specify 2 data elements for 32-bit-per-pixel surfaces or 4 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword. +* :ref:`tfe` adds 1 dword if specified. + + Note. The surface data format is indicated in the image resource constant but not in the instruction. + + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_data_mimg_atomic_reg.rst b/docs/AMDGPU/gfx8_data_mimg_atomic_reg.rst new file mode 100644 index 0000000000000000000000000000000000000000..8baf9269b887b303abb58040dc218dc51fd8535e --- /dev/null +++ b/docs/AMDGPU/gfx8_data_mimg_atomic_reg.rst @@ -0,0 +1,26 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_data_mimg_atomic_reg: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* depends on :ref:`dmask` and :ref:`tfe`: + +* :ref:`dmask` may specify 1 data element for 32-bit-per-pixel surfaces or 2 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword. +* :ref:`tfe` adds 1 dword if specified. + + Note. The surface data format is indicated in the image resource constant but not in the instruction. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_data_mimg_store.rst b/docs/AMDGPU/gfx8_data_mimg_store.rst new file mode 100644 index 0000000000000000000000000000000000000000..65a1a49f67e5d7123bcd4f599f74cf0534c01574 --- /dev/null +++ b/docs/AMDGPU/gfx8_data_mimg_store.rst @@ -0,0 +1,18 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_data_mimg_store: + +vdata +=========================== + +Image data to store by an *image_store* instruction. + +*Size:* depends on :ref:`dmask` which may specify from 1 to 4 data elements. Each data element occupies 1 dword. + + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_data_mimg_store_d16.rst b/docs/AMDGPU/gfx8_data_mimg_store_d16.rst new file mode 100644 index 0000000000000000000000000000000000000000..7524c884e7650c977fc0ca4cc59014a7d1186bad --- /dev/null +++ b/docs/AMDGPU/gfx8_data_mimg_store_d16.rst @@ -0,0 +1,24 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_data_mimg_store_d16: + +vdata +=========================== + +Image data to store by an *image_store* instruction. + +*Size:* depends on :ref:`dmask` and :ref:`d16`: + +* :ref:`dmask` may specify from 1 to 4 data elements. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16`. +* :ref:`d16` has different meaning for GFX8.0 and GFX8.1: + + * For GFX8.0 this modifier does not affect size of data elements in registers. Data in registers are stored in low 16 bits, high 16 bits are unused. There is no packing. + * Starting from GFX8.1 this modifier specifies that data elements in registers are packed; each value occupies 16 bits. + + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_buf_128.rst b/docs/AMDGPU/gfx8_dst_buf_128.rst new file mode 100644 index 0000000000000000000000000000000000000000..d076c7025f63d23ac256f22888975446e23ee0b7 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_buf_128.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_buf_128: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +*Size:* 4 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_buf_64.rst b/docs/AMDGPU/gfx8_dst_buf_64.rst new file mode 100644 index 0000000000000000000000000000000000000000..f5da65bf57377de599b8e41eaf9317eb9cbe4f81 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_buf_64.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_buf_64: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +*Size:* 2 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_buf_96.rst b/docs/AMDGPU/gfx8_dst_buf_96.rst new file mode 100644 index 0000000000000000000000000000000000000000..0012c1a88d30d625f21ecfa0bfed9eeb63d3e1f3 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_buf_96.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_buf_96: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +*Size:* 3 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_buf_d16_128.rst b/docs/AMDGPU/gfx8_dst_buf_d16_128.rst new file mode 100644 index 0000000000000000000000000000000000000000..0f5318d8d8ce863d5c3a961a9e9c31f1f3079525 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_buf_d16_128.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_buf_d16_128: + +vdst +=========================== + +Instruction output: data read from a memory buffer and converted to a 16-bit format. + +*Size:* depends on GFX8 GPU revision and :ref:`tfe`: + +* 4 dwords for GFX8.0. This H/W supports no packing. +* 2 dwords for GFX8.1+. This H/W supports data packing. +* :ref:`tfe` adds one dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_buf_d16_32.rst b/docs/AMDGPU/gfx8_dst_buf_d16_32.rst new file mode 100644 index 0000000000000000000000000000000000000000..6288c2def70aafaa07e1a019381c549cd1c49cb1 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_buf_d16_32.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_buf_d16_32: + +vdst +=========================== + +Instruction output: data read from a memory buffer and converted to a 16-bit format. + +*Size:* 1 dword by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_buf_d16_64.rst b/docs/AMDGPU/gfx8_dst_buf_d16_64.rst new file mode 100644 index 0000000000000000000000000000000000000000..b46310fd5241ac3d0d520da8cbcb61214a24f8c8 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_buf_d16_64.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_buf_d16_64: + +vdst +=========================== + +Instruction output: data read from a memory buffer and converted to a 16-bit format. + +*Size:* depends on GFX8 GPU revision and :ref:`tfe`: + +* 2 dwords for GFX8.0. This H/W supports no packing. +* 1 dword for GFX8.1+. This H/W supports data packing. +* :ref:`tfe` adds one dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_buf_d16_96.rst b/docs/AMDGPU/gfx8_dst_buf_d16_96.rst new file mode 100644 index 0000000000000000000000000000000000000000..15e7e8901970a43e3aedb114bf8db58be06876fc --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_buf_d16_96.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_buf_d16_96: + +vdst +=========================== + +Instruction output: data read from a memory buffer and converted to a 16-bit format. + +*Size:* depends on GFX8 GPU revision and :ref:`tfe`: + +* 3 dwords for GFX8.0. This H/W supports no packing. +* 2 dwords for GFX8.1+. This H/W supports data packing. +* :ref:`tfe` adds one dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_buf_lds.rst b/docs/AMDGPU/gfx8_dst_buf_lds.rst new file mode 100644 index 0000000000000000000000000000000000000000..b1cb1458dfac3a79ed62370cba80a8c9e47e6e64 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_buf_lds.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_buf_lds: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +If :ref:`lds` is specified, this operand is ignored by H/W and data are stored directly into LDS. + +*Size:* 1 dword by default. :ref:`tfe` adds 1 dword if specified. + + Note that :ref:`tfe` and :ref:`lds` cannot be used together. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_flat_atomic32.rst b/docs/AMDGPU/gfx8_dst_flat_atomic32.rst new file mode 100644 index 0000000000000000000000000000000000000000..a8ae4646ae98b350e64c791d4319fbc31b5580d7 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_flat_atomic32.rst @@ -0,0 +1,19 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_flat_atomic32: + +vdst +=========================== + +Data returned by a 32-bit atomic flat instruction. + +This is an optional operand. It must be used if and only if :ref:`glc` is specified. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_flat_atomic64.rst b/docs/AMDGPU/gfx8_dst_flat_atomic64.rst new file mode 100644 index 0000000000000000000000000000000000000000..5b46e88f5610003073fef917c6880c40a52b50e6 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_flat_atomic64.rst @@ -0,0 +1,19 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_flat_atomic64: + +vdst +=========================== + +Data returned by a 64-bit atomic flat instruction. + +This is an optional operand. It must be used if and only if :ref:`glc` is specified. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_mimg_gather4.rst b/docs/AMDGPU/gfx8_dst_mimg_gather4.rst new file mode 100644 index 0000000000000000000000000000000000000000..6fc01926ed176ae8a3f4c8ad81dcdbc48a89d1a8 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_mimg_gather4.rst @@ -0,0 +1,26 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_mimg_gather4: + +vdst +=========================== + +Image data to load by an *image_gather4* instruction. + +*Size:* 4 data elements by default. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16`. + +:ref:`d16` and :ref:`tfe` affect operand size as follows: + +* :ref:`d16` has different meaning for GFX8.0 and GFX8.1: + + * For GFX8.0 this modifier does not affect size of data elements in registers. Data in registers are stored in low 16 bits, high 16 bits are unused. There is no packing. + * Starting from GFX8.1 this modifier specifies that data elements in registers are packed; each value occupies 16 bits. + +* :ref:`tfe` adds one dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_mimg_regular.rst b/docs/AMDGPU/gfx8_dst_mimg_regular.rst new file mode 100644 index 0000000000000000000000000000000000000000..be1037a5ce5ae19c0e2ef3f3e7f692291c0913c7 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_mimg_regular.rst @@ -0,0 +1,20 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_mimg_regular: + +vdst +=========================== + +Image data to load by an image instruction. + +*Size:* depends on :ref:`dmask` and :ref:`tfe`: + +* :ref:`dmask` may specify from 1 to 4 data elements. Each data element occupies 1 dword. +* :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_dst_mimg_regular_d16.rst b/docs/AMDGPU/gfx8_dst_mimg_regular_d16.rst new file mode 100644 index 0000000000000000000000000000000000000000..4eb7037386f10abb64912e05a4418d8e74f8f877 --- /dev/null +++ b/docs/AMDGPU/gfx8_dst_mimg_regular_d16.rst @@ -0,0 +1,26 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_dst_mimg_regular_d16: + +vdst +=========================== + +Image data to load by an image instruction. + +*Size:* depends on :ref:`dmask`, :ref:`tfe` and :ref:`d16`: + +* :ref:`dmask` may specify from 1 to 4 data elements. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16`. +* :ref:`d16` has different meaning for GFX8.0 and GFX8.1: + + * For GFX8.0 this modifier does not affect size of data elements in registers. Data in registers are stored in low 16 bits, high 16 bits are unused. There is no packing. + * Starting from GFX8.1 this modifier specifies that data elements in registers are packed; each value occupies 16 bits. + +* :ref:`tfe` adds 1 dword if specified. + + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_fimm16.rst b/docs/AMDGPU/gfx8_fimm16.rst new file mode 100644 index 0000000000000000000000000000000000000000..5e387f5cb77464b33e77f38052ee8c6722fdb2a4 --- /dev/null +++ b/docs/AMDGPU/gfx8_fimm16.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_fimm16: + +imm32 +=========================== + +An :ref:`integer_number` or a :ref:`floating-point_number`. The number is converted to *f16* as described :ref:`here`. + diff --git a/docs/AMDGPU/gfx8_fimm32.rst b/docs/AMDGPU/gfx8_fimm32.rst new file mode 100644 index 0000000000000000000000000000000000000000..e29e7704b8aa5657aee16a42a45ea0651e3b5a88 --- /dev/null +++ b/docs/AMDGPU/gfx8_fimm32.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_fimm32: + +imm32 +=========================== + +An :ref:`integer_number` or a :ref:`floating-point_number`. The value is converted to *f32* as described :ref:`here`. + diff --git a/docs/AMDGPU/gfx8_hwreg.rst b/docs/AMDGPU/gfx8_hwreg.rst new file mode 100644 index 0000000000000000000000000000000000000000..ffa1ea5afde3dd946300976a44e6575fe281f45c --- /dev/null +++ b/docs/AMDGPU/gfx8_hwreg.rst @@ -0,0 +1,60 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_hwreg: + +hwreg +=========================== + +Bits of a hardware register being accessed. + +The bits of this operand have the following meaning: + + ============ =================================== + Bits Description + ============ =================================== + 5:0 Register *id*. + 10:6 First bit *offset* (0..31). + 15:11 *Size* in bits (1..32). + ============ =================================== + +This operand may be specified as a positive 16-bit :ref:`integer_number` or using the syntax described below. + + ==================================== ============================================================================ + Syntax Description + ==================================== ============================================================================ + hwreg({0..63}) All bits of a register indicated by its *id*. + hwreg(<*name*>) All bits of a register indicated by its *name*. + hwreg({0..63}, {0..31}, {1..32}) Register bits indicated by register *id*, first bit *offset* and *size*. + hwreg(<*name*>, {0..31}, {1..32}) Register bits indicated by register *name*, first bit *offset* and *size*. + ==================================== ============================================================================ + +Register *id*, *offset* and *size* must be specified as positive :ref:`integer numbers`. + +Defined register *names* include: + + =================== ========================================== + Name Description + =================== ========================================== + HW_REG_MODE Shader writeable mode bits. + HW_REG_STATUS Shader read-only status. + HW_REG_TRAPSTS Trap status. + HW_REG_HW_ID Id of wave, simd, compute unit, etc. + HW_REG_GPR_ALLOC Per-wave SGPR and VGPR allocation. + HW_REG_LDS_ALLOC Per-wave LDS allocation. + HW_REG_IB_STS Counters of outstanding instructions. + =================== ========================================== + +Examples: + +.. parsed-literal:: + + s_getreg_b32 s2, 0x6 + s_getreg_b32 s2, hwreg(15) + s_getreg_b32 s2, hwreg(51, 1, 31) + s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) + diff --git a/docs/AMDGPU/gfx8_imm4.rst b/docs/AMDGPU/gfx8_imm4.rst new file mode 100644 index 0000000000000000000000000000000000000000..a03de76e86efbaa3daf0cd7d0a2fdda68e6d403a --- /dev/null +++ b/docs/AMDGPU/gfx8_imm4.rst @@ -0,0 +1,25 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_imm4: + +imm4 +=========================== + +A positive :ref:`integer_number`. The value is truncated to 4 bits. + +This operand is a mask which controls indexing mode for operands of subsequent instructions. Value 1 enables indexing and value 0 disables it. + + ============ ======================================== + Bit Meaning + ============ ======================================== + 0 Enables or disables *src0* indexing. + 1 Enables or disables *src1* indexing. + 2 Enables or disables *src2* indexing. + 3 Enables or disables *dst* indexing. + ============ ======================================== + diff --git a/docs/AMDGPU/gfx8_label.rst b/docs/AMDGPU/gfx8_label.rst new file mode 100644 index 0000000000000000000000000000000000000000..99e384ee392e86961045828cbeacf30550c138ad --- /dev/null +++ b/docs/AMDGPU/gfx8_label.rst @@ -0,0 +1,30 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_label: + +label +=========================== + +A branch target which is a 16-bit signed integer treated as a PC-relative dword offset. + +This operand may be specified as: + +* An :ref:`integer_number`. The number is truncated to 16 bits. +* An :ref:`absolute_expression` which must start with an :ref:`integer_number`. The value of the expression is truncated to 16 bits. +* A :ref:`symbol` (for example, a label). The value is handled as a 16-bit PC-relative dword offset to be resolved by a linker. + +Examples: + +.. parsed-literal:: + + offset = 30 + s_branch loop_end + s_branch 2 + offset + s_branch 32 + loop_end: + diff --git a/docs/AMDGPU/gfx8_mod_dpp_sdwa_abs_neg.rst b/docs/AMDGPU/gfx8_mod_dpp_sdwa_abs_neg.rst new file mode 100644 index 0000000000000000000000000000000000000000..be7b4b511b177754179154a2a63cf2ad0428e901 --- /dev/null +++ b/docs/AMDGPU/gfx8_mod_dpp_sdwa_abs_neg.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_mod_dpp_sdwa_abs_neg: + +m +=========================== + +This operand may be used with floating point operand modifiers :ref:`abs` and :ref:`neg`. + diff --git a/docs/AMDGPU/gfx8_mod_sdwa_sext.rst b/docs/AMDGPU/gfx8_mod_sdwa_sext.rst new file mode 100644 index 0000000000000000000000000000000000000000..b48c521f3b394bfcf54a264d2af877176f01d53f --- /dev/null +++ b/docs/AMDGPU/gfx8_mod_sdwa_sext.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_mod_sdwa_sext: + +m +=========================== + +This operand may be used with integer operand modifier :ref:`sext`. + diff --git a/docs/AMDGPU/gfx8_mod_vop3_abs_neg.rst b/docs/AMDGPU/gfx8_mod_vop3_abs_neg.rst new file mode 100644 index 0000000000000000000000000000000000000000..960e8b1f38f1309b111135d1628b3a4a5b1d9855 --- /dev/null +++ b/docs/AMDGPU/gfx8_mod_vop3_abs_neg.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_mod_vop3_abs_neg: + +m +=========================== + +This operand may be used with floating point operand modifiers :ref:`abs` and :ref:`neg`. + diff --git a/docs/AMDGPU/gfx8_msg.rst b/docs/AMDGPU/gfx8_msg.rst new file mode 100644 index 0000000000000000000000000000000000000000..313d8e68b4b80a88793f515199105fff3a7e3e6a --- /dev/null +++ b/docs/AMDGPU/gfx8_msg.rst @@ -0,0 +1,72 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_msg: + +msg +=========================== + +A 16-bit message code. The bits of this operand have the following meaning: + + ============ ====================================================== + Bits Description + ============ ====================================================== + 3:0 Message *type*. + 6:4 Optional *operation*. + 9:7 Optional *parameters*. + 15:10 Unused. + ============ ====================================================== + +This operand may be specified as a positive 16-bit :ref:`integer_number` or using the syntax described below: + + ======================================== ======================================================================== + Syntax Description + ======================================== ======================================================================== + sendmsg(<*type*>) A message identified by its *type*. + sendmsg(<*type*>, <*op*>) A message identified by its *type* and *operation*. + sendmsg(<*type*>, <*op*>, <*stream*>) A message identified by its *type* and *operation* with a stream *id*. + ======================================== ======================================================================== + +*Type* may be specified using message *name* or message *id*. + +*Op* may be specified using operation *name* or operation *id*. + +Stream *id* is an integer in the range 0..3. + +Message *id*, operation *id* and stream *id* must be specified as positive :ref:`integer numbers`. + +Each message type supports specific operations: + + ================= ========== ============================== ============ ========== + Message name Message Id Supported Operations Operation Id Stream Id + ================= ========== ============================== ============ ========== + MSG_INTERRUPT 1 \- \- \- + MSG_GS 2 GS_OP_CUT 1 Optional + \ GS_OP_EMIT 2 Optional + \ GS_OP_EMIT_CUT 3 Optional + MSG_GS_DONE 3 GS_OP_NOP 0 \- + \ GS_OP_CUT 1 Optional + \ GS_OP_EMIT 2 Optional + \ GS_OP_EMIT_CUT 3 Optional + MSG_SYSMSG 15 SYSMSG_OP_ECC_ERR_INTERRUPT 1 \- + \ SYSMSG_OP_REG_RD 2 \- + \ SYSMSG_OP_HOST_TRAP_ACK 3 \- + \ SYSMSG_OP_TTRACE_PC 4 \- + ================= ========== ============================== ============ ========== + +Examples: + +.. parsed-literal:: + + s_sendmsg 0x12 + s_sendmsg sendmsg(MSG_INTERRUPT) + s_sendmsg sendmsg(2, GS_OP_CUT) + s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT) + s_sendmsg sendmsg(MSG_GS, 2) + s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_EMIT_CUT, 1) + s_sendmsg sendmsg(MSG_SYSMSG, SYSMSG_OP_TTRACE_PC) + diff --git a/docs/AMDGPU/gfx8_offset_buf.rst b/docs/AMDGPU/gfx8_offset_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..42c452416b5be4d3ac93ff3a3fb98a6b8f59351c --- /dev/null +++ b/docs/AMDGPU/gfx8_offset_buf.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_offset_buf: + +soffset +=========================== + +An unsigned byte offset. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx8_offset_smem_load.rst b/docs/AMDGPU/gfx8_offset_smem_load.rst new file mode 100644 index 0000000000000000000000000000000000000000..5c30a87f569c07589edd5f51ea1b126d2e95746e --- /dev/null +++ b/docs/AMDGPU/gfx8_offset_smem_load.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_offset_smem_load: + +soffset +=========================== + +An unsigned byte offset added to the base address to get memory address. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`uimm20` diff --git a/docs/AMDGPU/gfx8_offset_smem_store.rst b/docs/AMDGPU/gfx8_offset_smem_store.rst new file mode 100644 index 0000000000000000000000000000000000000000..9ff90f98b35463a0ebf4caf3054e6004e8da7ca4 --- /dev/null +++ b/docs/AMDGPU/gfx8_offset_smem_store.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_offset_smem_store: + +soffset +=========================== + +An unsigned byte offset added to the base address to get memory address. + +*Size:* 1 dword. + +*Operands:* :ref:`m0`, :ref:`uimm20` diff --git a/docs/AMDGPU/gfx8_opt.rst b/docs/AMDGPU/gfx8_opt.rst new file mode 100644 index 0000000000000000000000000000000000000000..417d7fa1ff38bf9327ad16b68798de1d2aa96be0 --- /dev/null +++ b/docs/AMDGPU/gfx8_opt.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_opt: + +opt +=========================== + +This is an optional operand. It must be used if and only if :ref:`glc` is specified. + diff --git a/docs/AMDGPU/gfx8_param.rst b/docs/AMDGPU/gfx8_param.rst new file mode 100644 index 0000000000000000000000000000000000000000..0bd88549f0923552f66aa9f97f7a36f69c0cd9d3 --- /dev/null +++ b/docs/AMDGPU/gfx8_param.rst @@ -0,0 +1,22 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_param: + +param +=========================== + +Interpolation parameter to read: + + ============ =================================== + Syntax Description + ============ =================================== + p0 Parameter *P0*. + p10 Parameter *P10*. + p20 Parameter *P20*. + ============ =================================== + diff --git a/docs/AMDGPU/gfx8_perm_smem.rst b/docs/AMDGPU/gfx8_perm_smem.rst new file mode 100644 index 0000000000000000000000000000000000000000..0035ac821a7248c6ff5777a02ec9656a7408be78 --- /dev/null +++ b/docs/AMDGPU/gfx8_perm_smem.rst @@ -0,0 +1,24 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_perm_smem: + +imm3 +=========================== + +A bit mask which indicates request permissions. + +This operand must be specified as an :ref:`integer_number`. The value is truncated to 7 bits, but only 3 low bits are significant. + + ============ ============================== + Bit Number Description + ============ ============================== + 0 Request *read* permission. + 1 Request *write* permission. + 2 Request *execute* permission. + ============ ============================== + diff --git a/docs/AMDGPU/gfx8_ret.rst b/docs/AMDGPU/gfx8_ret.rst new file mode 100644 index 0000000000000000000000000000000000000000..91fdaf378a1d5ce47f609343454814929761887b --- /dev/null +++ b/docs/AMDGPU/gfx8_ret.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_ret: + +dst +=========================== + +This is an input operand. It may optionally serve as a destination if :ref:`glc` is specified. + diff --git a/docs/AMDGPU/gfx8_rsrc_buf.rst b/docs/AMDGPU/gfx8_rsrc_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..ecdb0a0d9dcf5063ea12b98528e739b5431d67d4 --- /dev/null +++ b/docs/AMDGPU/gfx8_rsrc_buf.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_rsrc_buf: + +srsrc +=========================== + +Buffer resource constant which defines the address and characteristics of the buffer in memory. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx8_rsrc_mimg.rst b/docs/AMDGPU/gfx8_rsrc_mimg.rst new file mode 100644 index 0000000000000000000000000000000000000000..3ca255928983f100f3aa57eaa5f23f970e9e5a0c --- /dev/null +++ b/docs/AMDGPU/gfx8_rsrc_mimg.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_rsrc_mimg: + +srsrc +=========================== + +Image resource constant which defines the location of the image buffer in memory, its dimensions, tiling, and data format. + +*Size:* 8 dwords by default, 4 dwords if :ref:`r128` is specified. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx8_samp_mimg.rst b/docs/AMDGPU/gfx8_samp_mimg.rst new file mode 100644 index 0000000000000000000000000000000000000000..c4b27125768d8488cd57494b7d8b6e31b9341042 --- /dev/null +++ b/docs/AMDGPU/gfx8_samp_mimg.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_samp_mimg: + +ssamp +=========================== + +Sampler constant used to specify filtering options applied to the image data after it is read. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx8_sdata128_0.rst b/docs/AMDGPU/gfx8_sdata128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..a52703c3545ad1d69d5cf90f2d87e820e1abafd0 --- /dev/null +++ b/docs/AMDGPU/gfx8_sdata128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_sdata128_0: + +sdata +=========================== + +Instruction input. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx8_sdata32_0.rst b/docs/AMDGPU/gfx8_sdata32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..9ccd7bd8c7136d7673cf227b8b60cc55e8d3faf9 --- /dev/null +++ b/docs/AMDGPU/gfx8_sdata32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_sdata32_0: + +sdata +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx8_sdata64_0.rst b/docs/AMDGPU/gfx8_sdata64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..8718449228c7df64fd4e4611123b5dcbafc845a3 --- /dev/null +++ b/docs/AMDGPU/gfx8_sdata64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_sdata64_0: + +sdata +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx8_sdst128_0.rst b/docs/AMDGPU/gfx8_sdst128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..277e3db02c410f9a8e3ac5d6561f3dc39d357257 --- /dev/null +++ b/docs/AMDGPU/gfx8_sdst128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_sdst128_0: + +sdst +=========================== + +Instruction output. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx8_sdst256_0.rst b/docs/AMDGPU/gfx8_sdst256_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..2e54b9b1462c8bfa3ec4148f76ee9b0f164ec6fe --- /dev/null +++ b/docs/AMDGPU/gfx8_sdst256_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_sdst256_0: + +sdst +=========================== + +Instruction output. + +*Size:* 8 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx8_sdst32_0.rst b/docs/AMDGPU/gfx8_sdst32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..44e6cdc8f7f5473727532d0af791205d841f5aae --- /dev/null +++ b/docs/AMDGPU/gfx8_sdst32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_sdst32_0: + +sdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx8_sdst32_1.rst b/docs/AMDGPU/gfx8_sdst32_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..7156225fa3c4556aad784b1ddbac69804092cf16 --- /dev/null +++ b/docs/AMDGPU/gfx8_sdst32_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_sdst32_1: + +sdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec` diff --git a/docs/AMDGPU/gfx8_sdst32_2.rst b/docs/AMDGPU/gfx8_sdst32_2.rst new file mode 100644 index 0000000000000000000000000000000000000000..af446d3e54ce7868a0c09c8f7ca15492a2024e51 --- /dev/null +++ b/docs/AMDGPU/gfx8_sdst32_2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_sdst32_2: + +sdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`trap` diff --git a/docs/AMDGPU/gfx8_sdst512_0.rst b/docs/AMDGPU/gfx8_sdst512_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..95b82a7d024988fdbf5f4c7137768612f7521ffc --- /dev/null +++ b/docs/AMDGPU/gfx8_sdst512_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_sdst512_0: + +sdst +=========================== + +Instruction output. + +*Size:* 16 dwords. + +*Operands:* :ref:`s` diff --git a/docs/AMDGPU/gfx8_sdst64_0.rst b/docs/AMDGPU/gfx8_sdst64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..9195778a5e82b3385dceaa54c9b3cc2c41a96e6a --- /dev/null +++ b/docs/AMDGPU/gfx8_sdst64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_sdst64_0: + +sdst +=========================== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx8_sdst64_1.rst b/docs/AMDGPU/gfx8_sdst64_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..165e0c0175f93a42287140372f08a5caf7cebec2 --- /dev/null +++ b/docs/AMDGPU/gfx8_sdst64_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_sdst64_1: + +sdst +=========================== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`exec` diff --git a/docs/AMDGPU/gfx8_simm16.rst b/docs/AMDGPU/gfx8_simm16.rst new file mode 100644 index 0000000000000000000000000000000000000000..730f239b6bea705047f01439d1ff3f08590574b9 --- /dev/null +++ b/docs/AMDGPU/gfx8_simm16.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_simm16: + +imm16 +=========================== + +An :ref:`integer_number`. The value is truncated to 16 bits and then sign-extended to 32 bits. + diff --git a/docs/AMDGPU/gfx8_src32_0.rst b/docs/AMDGPU/gfx8_src32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..a9c11fea8e3b59338efd0d05151261fd7516f13a --- /dev/null +++ b/docs/AMDGPU/gfx8_src32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_src32_0: + +src +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx8_src32_1.rst b/docs/AMDGPU/gfx8_src32_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..67dcdc8ed5e9bef0cbd6605cac1ee618d2fb1f4e --- /dev/null +++ b/docs/AMDGPU/gfx8_src32_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_src32_1: + +src +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx8_src64_0.rst b/docs/AMDGPU/gfx8_src64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..573fd68b2a1176744fccd054bab8956ca82b6695 --- /dev/null +++ b/docs/AMDGPU/gfx8_src64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_src64_0: + +src +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx8_src64_1.rst b/docs/AMDGPU/gfx8_src64_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..d2c78b754d7efb3e1223d86687244b888f4d55f7 --- /dev/null +++ b/docs/AMDGPU/gfx8_src64_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_src64_1: + +src +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx8_src_exp.rst b/docs/AMDGPU/gfx8_src_exp.rst new file mode 100644 index 0000000000000000000000000000000000000000..10449b4e36e845daec37ce6dd8366ac49797f9e7 --- /dev/null +++ b/docs/AMDGPU/gfx8_src_exp.rst @@ -0,0 +1,28 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_src_exp: + +vsrc +=========================== + +Data to copy to export buffers. This is an optional operand. Must be specified as :ref:`off` if not used. + +:ref:`compr` modifier indicates use of compressed (16-bit) data. This limits number of source operands from 4 to 2: + +* src0 and src1 must specify the first register (or :ref:`off`). +* src2 and src3 must specify the second register (or :ref:`off`). + +An example: + +.. parsed-literal:: + + exp mrtz v3, v3, off, off compr + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`off` diff --git a/docs/AMDGPU/gfx8_ssrc32_0.rst b/docs/AMDGPU/gfx8_ssrc32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..82d18b1585375d2861a7d27bb8ba010a3bdb317e --- /dev/null +++ b/docs/AMDGPU/gfx8_ssrc32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_ssrc32_0: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx8_ssrc32_1.rst b/docs/AMDGPU/gfx8_ssrc32_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..203d9c5397fc80bb9439448d2959eb8681a7def7 --- /dev/null +++ b/docs/AMDGPU/gfx8_ssrc32_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_ssrc32_1: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx8_ssrc32_2.rst b/docs/AMDGPU/gfx8_ssrc32_2.rst new file mode 100644 index 0000000000000000000000000000000000000000..9b893e962a0ac8434f239d82b26cf63aa63cbefb --- /dev/null +++ b/docs/AMDGPU/gfx8_ssrc32_2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_ssrc32_2: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec` diff --git a/docs/AMDGPU/gfx8_ssrc32_3.rst b/docs/AMDGPU/gfx8_ssrc32_3.rst new file mode 100644 index 0000000000000000000000000000000000000000..131765fee506c27063983973f9c781b23c95f44d --- /dev/null +++ b/docs/AMDGPU/gfx8_ssrc32_3.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_ssrc32_3: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`iconst` diff --git a/docs/AMDGPU/gfx8_ssrc32_4.rst b/docs/AMDGPU/gfx8_ssrc32_4.rst new file mode 100644 index 0000000000000000000000000000000000000000..02d90a4d1da43a84bad65670a32b654e1cdd304c --- /dev/null +++ b/docs/AMDGPU/gfx8_ssrc32_4.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_ssrc32_4: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`m0`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx8_ssrc64_0.rst b/docs/AMDGPU/gfx8_ssrc64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..b8389dcdbae14b822a4065ae89aa0c65f4d9e4c5 --- /dev/null +++ b/docs/AMDGPU/gfx8_ssrc64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_ssrc64_0: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx8_ssrc64_1.rst b/docs/AMDGPU/gfx8_ssrc64_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..c4fddf4ace2b77bd65385f030e7d1f3347c76cc0 --- /dev/null +++ b/docs/AMDGPU/gfx8_ssrc64_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_ssrc64_1: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap` diff --git a/docs/AMDGPU/gfx8_ssrc64_2.rst b/docs/AMDGPU/gfx8_ssrc64_2.rst new file mode 100644 index 0000000000000000000000000000000000000000..209dffc8da03b614e9895fbd63b00a16186afea8 --- /dev/null +++ b/docs/AMDGPU/gfx8_ssrc64_2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_ssrc64_2: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx8_ssrc64_3.rst b/docs/AMDGPU/gfx8_ssrc64_3.rst new file mode 100644 index 0000000000000000000000000000000000000000..9ab5436572d10e33d2d71d3250b572540f24d530 --- /dev/null +++ b/docs/AMDGPU/gfx8_ssrc64_3.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_ssrc64_3: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`trap`, :ref:`exec` diff --git a/docs/AMDGPU/gfx8_tgt.rst b/docs/AMDGPU/gfx8_tgt.rst new file mode 100644 index 0000000000000000000000000000000000000000..1be54a73269b15d72ddb7209675b6cb957a5fa41 --- /dev/null +++ b/docs/AMDGPU/gfx8_tgt.rst @@ -0,0 +1,24 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_tgt: + +tgt +=========================== + +An export target: + + ============== =================================== + Syntax Description + ============== =================================== + pos{0..3} Copy vertex position 0..3. + param{0..31} Copy vertex parameter 0..31. + mrt{0..7} Copy pixel color to the MRTs 0..7. + mrtz Copy pixel depth (Z) data. + null Copy nothing. + ============== =================================== + diff --git a/docs/AMDGPU/gfx8_type_dev.rst b/docs/AMDGPU/gfx8_type_dev.rst new file mode 100644 index 0000000000000000000000000000000000000000..2f5b36f9f9b40cedfb0418f8feb31252e20df44b --- /dev/null +++ b/docs/AMDGPU/gfx8_type_dev.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_type_dev: + +Type deviation +=========================== + +*Type* of this operand differs from *type* :ref:`implied by the opcode`. This tag specifies actual operand *type*. + diff --git a/docs/AMDGPU/gfx8_uimm16.rst b/docs/AMDGPU/gfx8_uimm16.rst new file mode 100644 index 0000000000000000000000000000000000000000..a20abcc1344867d9d93c722518b16a75d8801d4c --- /dev/null +++ b/docs/AMDGPU/gfx8_uimm16.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_uimm16: + +imm16 +=========================== + +An :ref:`integer_number`. The value is truncated to 16 bits and then zero-extended to 32 bits. + diff --git a/docs/AMDGPU/gfx8_vcc_64.rst b/docs/AMDGPU/gfx8_vcc_64.rst new file mode 100644 index 0000000000000000000000000000000000000000..e31df0e51c2ee9c0e385cbf78c196f61f34167c3 --- /dev/null +++ b/docs/AMDGPU/gfx8_vcc_64.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vcc_64: + +vcc +=========================== + +Vector condition code. + +*Size:* 2 dwords. + +*Operands:* :ref:`vcc` diff --git a/docs/AMDGPU/gfx8_vdata128_0.rst b/docs/AMDGPU/gfx8_vdata128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..bf7e3db732b1fa1c1666dc06ccdf9d32904dd4ac --- /dev/null +++ b/docs/AMDGPU/gfx8_vdata128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vdata128_0: + +vdata +=========================== + +Instruction input. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_vdata32_0.rst b/docs/AMDGPU/gfx8_vdata32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..b89d65b1c7931668eee592e7c5b41799e906f1f7 --- /dev/null +++ b/docs/AMDGPU/gfx8_vdata32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vdata32_0: + +vdata +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_vdata64_0.rst b/docs/AMDGPU/gfx8_vdata64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..438054456060285468b48c9b4d2cff68dcdeba11 --- /dev/null +++ b/docs/AMDGPU/gfx8_vdata64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vdata64_0: + +vdata +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_vdata96_0.rst b/docs/AMDGPU/gfx8_vdata96_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..b8ad22d1f24aa6a7ac9ea129e4dd65760defcefa --- /dev/null +++ b/docs/AMDGPU/gfx8_vdata96_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vdata96_0: + +vdata +=========================== + +Instruction input. + +*Size:* 3 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_vdst128_0.rst b/docs/AMDGPU/gfx8_vdst128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..1eccc95230928e8e5fc9bc44924e052e41295770 --- /dev/null +++ b/docs/AMDGPU/gfx8_vdst128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vdst128_0: + +vdst +=========================== + +Instruction output. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_vdst32_0.rst b/docs/AMDGPU/gfx8_vdst32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..781fcb6ec35b703c45fc964ed76018f41038afa7 --- /dev/null +++ b/docs/AMDGPU/gfx8_vdst32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vdst32_0: + +vdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_vdst64_0.rst b/docs/AMDGPU/gfx8_vdst64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..af2dfe9bf38c883e4f14390f916749f8f01402e9 --- /dev/null +++ b/docs/AMDGPU/gfx8_vdst64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vdst64_0: + +vdst +=========================== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_vdst96_0.rst b/docs/AMDGPU/gfx8_vdst96_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..4895b657900294a360fa09ba01c74246da0c8347 --- /dev/null +++ b/docs/AMDGPU/gfx8_vdst96_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vdst96_0: + +vdst +=========================== + +Instruction output. + +*Size:* 3 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_vsrc128_0.rst b/docs/AMDGPU/gfx8_vsrc128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..25b1794d85f317a6cc7726caa9936e074c635163 --- /dev/null +++ b/docs/AMDGPU/gfx8_vsrc128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vsrc128_0: + +vsrc +=========================== + +Instruction input. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_vsrc32_0.rst b/docs/AMDGPU/gfx8_vsrc32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..524f36a0a8932a995a8d8a9d41fd4f43e6c05afa --- /dev/null +++ b/docs/AMDGPU/gfx8_vsrc32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vsrc32_0: + +vsrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_vsrc64_0.rst b/docs/AMDGPU/gfx8_vsrc64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..7c2c39ff753083255f95dfb05764f0e4b0823275 --- /dev/null +++ b/docs/AMDGPU/gfx8_vsrc64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_vsrc64_0: + +vsrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx8_waitcnt.rst b/docs/AMDGPU/gfx8_waitcnt.rst new file mode 100644 index 0000000000000000000000000000000000000000..4bad594171164d6defd7ded0f4f6ded3a6fa4c5c --- /dev/null +++ b/docs/AMDGPU/gfx8_waitcnt.rst @@ -0,0 +1,55 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid8_waitcnt: + +waitcnt +=========================== + +Counts of outstanding instructions to wait for. + +The bits of this operand have the following meaning: + + ============ ====================================================== + Bits Description + ============ ====================================================== + 3:0 VM_CNT: vector memory operations count. + 6:4 EXP_CNT: export count. + 11:8 LGKM_CNT: LDS, GDS, Constant and Message count. + ============ ====================================================== + +This operand may be specified as a positive 16-bit :ref:`integer_number` +or as a combination of the following symbolic helpers: + + ====================== ====================================================================== + Syntax Description + ====================== ====================================================================== + vmcnt(<*N*>) VM_CNT value. *N* must not exceed the largest VM_CNT value. + expcnt(<*N*>) EXP_CNT value. *N* must not exceed the largest EXP_CNT value. + lgkmcnt(<*N*>) LGKM_CNT value. *N* must not exceed the largest LGKM_CNT value. + vmcnt_sat(<*N*>) VM_CNT value computed as min(*N*, the largest VM_CNT value). + expcnt_sat(<*N*>) EXP_CNT value computed as min(*N*, the largest EXP_CNT value). + lgkmcnt_sat(<*N*>) LGKM_CNT value computed as min(*N*, the largest LGKM_CNT value). + ====================== ====================================================================== + +These helpers may be specified in any order. Ampersands and commas may be used as optional separators. + +*N* is either an +:ref:`integer number` or an +:ref:`absolute expression`. + +Examples: + +.. parsed-literal:: + + s_waitcnt 0 + s_waitcnt vmcnt(1) + s_waitcnt expcnt(2) lgkmcnt(3) + s_waitcnt vmcnt(1) expcnt(2) lgkmcnt(3) + s_waitcnt vmcnt(1), expcnt(2), lgkmcnt(3) + s_waitcnt vmcnt(1) & lgkmcnt_sat(100) & expcnt(2) + diff --git a/docs/AMDGPU/gfx9_addr_buf.rst b/docs/AMDGPU/gfx9_addr_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..c253640fa5fb63a6b26c9feab6cb822a14f8d86e --- /dev/null +++ b/docs/AMDGPU/gfx9_addr_buf.rst @@ -0,0 +1,22 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_addr_buf: + +vaddr +=========================== + +This is an optional operand which may specify offset and/or index. + +*Size:* 0, 1 or 2 dwords. Size is controlled by modifiers :ref:`offen` and :ref:`idxen`: + +* If only :ref:`idxen` is specified, this operand supplies an index. Size is 1 dword. +* If only :ref:`offen` is specified, this operand supplies an offset. Size is 1 dword. +* If both modifiers are specified, index is in the first register and offset is in the second. Size is 2 dwords. +* If none of these modifiers are specified, this operand must be set to :ref:`off`. + +*Operands:* :ref:`v`, :ref:`off` diff --git a/docs/AMDGPU/gfx9_addr_ds.rst b/docs/AMDGPU/gfx9_addr_ds.rst new file mode 100644 index 0000000000000000000000000000000000000000..11742462b9466940364f9d52efbd4bb104bef6f5 --- /dev/null +++ b/docs/AMDGPU/gfx9_addr_ds.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_addr_ds: + +vaddr +=========================== + +An offset from the start of GDS/LDS memory. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_addr_flat.rst b/docs/AMDGPU/gfx9_addr_flat.rst new file mode 100644 index 0000000000000000000000000000000000000000..c748d07090eccccc40e3d904c8322aaae4349b79 --- /dev/null +++ b/docs/AMDGPU/gfx9_addr_flat.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_addr_flat: + +vaddr +=========================== + +A 64-bit flat address. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_addr_mimg.rst b/docs/AMDGPU/gfx9_addr_mimg.rst new file mode 100644 index 0000000000000000000000000000000000000000..eb6ca882b6afe2d51000572022703328049a8b57 --- /dev/null +++ b/docs/AMDGPU/gfx9_addr_mimg.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_addr_mimg: + +vaddr +=========================== + +Image address which includes from one to four dimensional coordinates and other data used to locate a position in the image. + +*Size:* 1, 2, 3, 4, 8 or 16 dwords. Actual size depends on opcode, specific image being handled and :ref:`a16`. + + Note 1. Image format and dimensions are encoded in the image resource constant but not in the instruction. + + Note 2. Actually image address size may vary from 1 to 13 dwords, but assembler currently supports a limited range of register sequences. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_attr.rst b/docs/AMDGPU/gfx9_attr.rst new file mode 100644 index 0000000000000000000000000000000000000000..faffcc7ed18a32baf185d16d52292c252ff72c86 --- /dev/null +++ b/docs/AMDGPU/gfx9_attr.rst @@ -0,0 +1,30 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_attr: + +attr +=========================== + +Interpolation attribute and channel: + + ============== =================================== + Syntax Description + ============== =================================== + attr{0..32}.x Attribute 0..32 with *x* channel. + attr{0..32}.y Attribute 0..32 with *y* channel. + attr{0..32}.z Attribute 0..32 with *z* channel. + attr{0..32}.w Attribute 0..32 with *w* channel. + ============== =================================== + +Examples: + +.. parsed-literal:: + + v_interp_p1_f32 v1, v0, attr0.x + v_interp_p1_f32 v1, v0, attr32.w + diff --git a/docs/AMDGPU/gfx9_base_smem_addr.rst b/docs/AMDGPU/gfx9_base_smem_addr.rst new file mode 100644 index 0000000000000000000000000000000000000000..63c2cbd5fbd275713650270a1062b846424b358e --- /dev/null +++ b/docs/AMDGPU/gfx9_base_smem_addr.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_base_smem_addr: + +sbase +=========================== + +A 64-bit base address for scalar memory operations. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_base_smem_buf.rst b/docs/AMDGPU/gfx9_base_smem_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..191ecbaeabe486fe94c8ae563f80e849a61701ba --- /dev/null +++ b/docs/AMDGPU/gfx9_base_smem_buf.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_base_smem_buf: + +sbase +=========================== + +A 128-bit buffer resource constant for scalar memory operations which provides a base address, a size and a stride. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_base_smem_scratch.rst b/docs/AMDGPU/gfx9_base_smem_scratch.rst new file mode 100644 index 0000000000000000000000000000000000000000..83fd760d75af2049a4aef42ab436dfccd0b8400f --- /dev/null +++ b/docs/AMDGPU/gfx9_base_smem_scratch.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_base_smem_scratch: + +sbase +=========================== + +This operand is ignored by H/W and :ref:`flat_scratch` is supplied instead. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_bimm16.rst b/docs/AMDGPU/gfx9_bimm16.rst new file mode 100644 index 0000000000000000000000000000000000000000..2c9dc5c52156183b62a7ef92bef61b26460109b0 --- /dev/null +++ b/docs/AMDGPU/gfx9_bimm16.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_bimm16: + +imm16 +=========================== + +An :ref:`integer_number`. The value is truncated to 16 bits. + diff --git a/docs/AMDGPU/gfx9_bimm32.rst b/docs/AMDGPU/gfx9_bimm32.rst new file mode 100644 index 0000000000000000000000000000000000000000..e9b89674a2ef8c66c36e9211a5041a4261122327 --- /dev/null +++ b/docs/AMDGPU/gfx9_bimm32.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_bimm32: + +imm32 +=========================== + +An :ref:`integer_number`. The value is truncated to 32 bits. + diff --git a/docs/AMDGPU/gfx9_data_buf_atomic128.rst b/docs/AMDGPU/gfx9_data_buf_atomic128.rst new file mode 100644 index 0000000000000000000000000000000000000000..11c7c73479d20b74d35714331cafb5d379ec89db --- /dev/null +++ b/docs/AMDGPU/gfx9_data_buf_atomic128.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_data_buf_atomic128: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 4 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_data_buf_atomic32.rst b/docs/AMDGPU/gfx9_data_buf_atomic32.rst new file mode 100644 index 0000000000000000000000000000000000000000..7b7d88b7a2d0f8227b6b0c60137c188f6b3e2a8c --- /dev/null +++ b/docs/AMDGPU/gfx9_data_buf_atomic32.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_data_buf_atomic32: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 1 dword by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_data_buf_atomic64.rst b/docs/AMDGPU/gfx9_data_buf_atomic64.rst new file mode 100644 index 0000000000000000000000000000000000000000..71b2b4be2caf405b9906c4f4c21666555365b400 --- /dev/null +++ b/docs/AMDGPU/gfx9_data_buf_atomic64.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_data_buf_atomic64: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 2 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_data_mimg_atomic_cmp.rst b/docs/AMDGPU/gfx9_data_mimg_atomic_cmp.rst new file mode 100644 index 0000000000000000000000000000000000000000..08fe297b4f39f9f8b6f4a343db98bc62515fc5af --- /dev/null +++ b/docs/AMDGPU/gfx9_data_mimg_atomic_cmp.rst @@ -0,0 +1,27 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_data_mimg_atomic_cmp: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* depends on :ref:`dmask` and :ref:`tfe`: + +* :ref:`dmask` may specify 2 data elements for 32-bit-per-pixel surfaces or 4 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword. +* :ref:`tfe` adds 1 dword if specified. + + Note. The surface data format is indicated in the image resource constant but not in the instruction. + + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_data_mimg_atomic_reg.rst b/docs/AMDGPU/gfx9_data_mimg_atomic_reg.rst new file mode 100644 index 0000000000000000000000000000000000000000..2037dfd5356b420992444713759ec9bf4b8f8f36 --- /dev/null +++ b/docs/AMDGPU/gfx9_data_mimg_atomic_reg.rst @@ -0,0 +1,26 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_data_mimg_atomic_reg: + +vdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* depends on :ref:`dmask` and :ref:`tfe`: + +* :ref:`dmask` may specify 1 data element for 32-bit-per-pixel surfaces or 2 data elements for 64-bit-per-pixel surfaces. Each data element occupies 1 dword. +* :ref:`tfe` adds 1 dword if specified. + + Note. The surface data format is indicated in the image resource constant but not in the instruction. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_data_mimg_store.rst b/docs/AMDGPU/gfx9_data_mimg_store.rst new file mode 100644 index 0000000000000000000000000000000000000000..5e2b8b6a103c31ff48f44cac38c2efe014851572 --- /dev/null +++ b/docs/AMDGPU/gfx9_data_mimg_store.rst @@ -0,0 +1,18 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_data_mimg_store: + +vdata +=========================== + +Image data to store by an *image_store* instruction. + +*Size:* depends on :ref:`dmask` which may specify from 1 to 4 data elements. Each data element occupies 1 dword. + + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_data_mimg_store_d16.rst b/docs/AMDGPU/gfx9_data_mimg_store_d16.rst new file mode 100644 index 0000000000000000000000000000000000000000..5c521f80a49a05cabc79ac6d6c2b69f16bc1d23c --- /dev/null +++ b/docs/AMDGPU/gfx9_data_mimg_store_d16.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_data_mimg_store_d16: + +vdata +=========================== + +Image data to store by an *image_store* instruction. + +*Size:* depends on :ref:`dmask` and :ref:`d16`: + +* :ref:`dmask` may specify from 1 to 4 data elements. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16`. +* :ref:`d16` specifies that data in registers are packed; each value occupies 16 bits. + + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_data_smem_atomic128.rst b/docs/AMDGPU/gfx9_data_smem_atomic128.rst new file mode 100644 index 0000000000000000000000000000000000000000..677361894818f7983d3d00504e3bc8b20b0751f4 --- /dev/null +++ b/docs/AMDGPU/gfx9_data_smem_atomic128.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_data_smem_atomic128: + +sdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_data_smem_atomic32.rst b/docs/AMDGPU/gfx9_data_smem_atomic32.rst new file mode 100644 index 0000000000000000000000000000000000000000..9ad25f3403de2fee20152e97cc54d397b18abb38 --- /dev/null +++ b/docs/AMDGPU/gfx9_data_smem_atomic32.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_data_smem_atomic32: + +sdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_data_smem_atomic64.rst b/docs/AMDGPU/gfx9_data_smem_atomic64.rst new file mode 100644 index 0000000000000000000000000000000000000000..6f67bffbd5486bc8a3ff0d7c5753f1b238f53404 --- /dev/null +++ b/docs/AMDGPU/gfx9_data_smem_atomic64.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_data_smem_atomic64: + +sdata +=========================== + +Input data for an atomic instruction. + +Optionally may serve as an output data: + +* If :ref:`glc` is specified, gets the memory value before the operation. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_dst_buf_128.rst b/docs/AMDGPU/gfx9_dst_buf_128.rst new file mode 100644 index 0000000000000000000000000000000000000000..691be0f69fe2bcafe189987185cebb7aa91872c5 --- /dev/null +++ b/docs/AMDGPU/gfx9_dst_buf_128.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_dst_buf_128: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +*Size:* 4 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_dst_buf_32.rst b/docs/AMDGPU/gfx9_dst_buf_32.rst new file mode 100644 index 0000000000000000000000000000000000000000..5ee1aa2c0c6ec55a21f6cd4ca6d746e4155686bb --- /dev/null +++ b/docs/AMDGPU/gfx9_dst_buf_32.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_dst_buf_32: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +*Size:* 1 dword by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_dst_buf_64.rst b/docs/AMDGPU/gfx9_dst_buf_64.rst new file mode 100644 index 0000000000000000000000000000000000000000..6e27264ffe35285888222084b42f06f34c54224a --- /dev/null +++ b/docs/AMDGPU/gfx9_dst_buf_64.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_dst_buf_64: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +*Size:* 2 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_dst_buf_96.rst b/docs/AMDGPU/gfx9_dst_buf_96.rst new file mode 100644 index 0000000000000000000000000000000000000000..8011edc0cb1b8dc768efa8e42bddaf66d8335ef3 --- /dev/null +++ b/docs/AMDGPU/gfx9_dst_buf_96.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_dst_buf_96: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +*Size:* 3 dwords by default. :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_dst_buf_lds.rst b/docs/AMDGPU/gfx9_dst_buf_lds.rst new file mode 100644 index 0000000000000000000000000000000000000000..0445619d640821305336cf9407dc96ffb1755598 --- /dev/null +++ b/docs/AMDGPU/gfx9_dst_buf_lds.rst @@ -0,0 +1,21 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_dst_buf_lds: + +vdst +=========================== + +Instruction output: data read from a memory buffer. + +If :ref:`lds` is specified, this operand is ignored by H/W and data are stored directly into LDS. + +*Size:* 1 dword by default. :ref:`tfe` adds 1 dword if specified. + + Note that :ref:`tfe` and :ref:`lds` cannot be used together. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_dst_flat_atomic32.rst b/docs/AMDGPU/gfx9_dst_flat_atomic32.rst new file mode 100644 index 0000000000000000000000000000000000000000..94a7fdac9703d30c56408f77ffe61238c2adbaae --- /dev/null +++ b/docs/AMDGPU/gfx9_dst_flat_atomic32.rst @@ -0,0 +1,19 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_dst_flat_atomic32: + +vdst +=========================== + +Data returned by a 32-bit atomic flat instruction. + +This is an optional operand. It must be used if and only if :ref:`glc` is specified. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_dst_flat_atomic64.rst b/docs/AMDGPU/gfx9_dst_flat_atomic64.rst new file mode 100644 index 0000000000000000000000000000000000000000..7f684a7165e4b39957bc894ae091dd8d5e01321d --- /dev/null +++ b/docs/AMDGPU/gfx9_dst_flat_atomic64.rst @@ -0,0 +1,19 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_dst_flat_atomic64: + +vdst +=========================== + +Data returned by a 64-bit atomic flat instruction. + +This is an optional operand. It must be used if and only if :ref:`glc` is specified. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_dst_mimg_gather4.rst b/docs/AMDGPU/gfx9_dst_mimg_gather4.rst new file mode 100644 index 0000000000000000000000000000000000000000..3bb6f30811f15c5703b0e41ea4c3913c3f8b72da --- /dev/null +++ b/docs/AMDGPU/gfx9_dst_mimg_gather4.rst @@ -0,0 +1,22 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_dst_mimg_gather4: + +vdst +=========================== + +Image data to load by an *image_gather4* instruction. + +*Size:* 4 data elements by default. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16`. + +:ref:`d16` and :ref:`tfe` affect operand size as follows: + +* :ref:`d16` specifies that data elements in registers are packed; each value occupies 16 bits. +* :ref:`tfe` adds one dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_dst_mimg_regular.rst b/docs/AMDGPU/gfx9_dst_mimg_regular.rst new file mode 100644 index 0000000000000000000000000000000000000000..1a7b848c1ee9c58232c1c82283a26c65ad805654 --- /dev/null +++ b/docs/AMDGPU/gfx9_dst_mimg_regular.rst @@ -0,0 +1,20 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_dst_mimg_regular: + +vdst +=========================== + +Image data to load by an image instruction. + +*Size:* depends on :ref:`dmask` and :ref:`tfe`: + +* :ref:`dmask` may specify from 1 to 4 data elements. Each data element occupies 1 dword. +* :ref:`tfe` adds 1 dword if specified. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_dst_mimg_regular_d16.rst b/docs/AMDGPU/gfx9_dst_mimg_regular_d16.rst new file mode 100644 index 0000000000000000000000000000000000000000..a155639bf3e61c52cecc22b39b5379cc66347659 --- /dev/null +++ b/docs/AMDGPU/gfx9_dst_mimg_regular_d16.rst @@ -0,0 +1,22 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_dst_mimg_regular_d16: + +vdst +=========================== + +Image data to load by an image instruction. + +*Size:* depends on :ref:`dmask`, :ref:`tfe` and :ref:`d16`: + +* :ref:`dmask` may specify from 1 to 4 data elements. Each data element occupies either 32 bits or 16 bits depending on :ref:`d16`. +* :ref:`d16` specifies that data elements in registers are packed; each value occupies 16 bits. +* :ref:`tfe` adds 1 dword if specified. + + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_fimm16.rst b/docs/AMDGPU/gfx9_fimm16.rst new file mode 100644 index 0000000000000000000000000000000000000000..a438b452eecfa55e57056cf8be6010105db238e7 --- /dev/null +++ b/docs/AMDGPU/gfx9_fimm16.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_fimm16: + +imm32 +=========================== + +An :ref:`integer_number` or a :ref:`floating-point_number`. The number is converted to *f16* as described :ref:`here`. + diff --git a/docs/AMDGPU/gfx9_fimm32.rst b/docs/AMDGPU/gfx9_fimm32.rst new file mode 100644 index 0000000000000000000000000000000000000000..11103e70529e3b373d16679677317608d4db4846 --- /dev/null +++ b/docs/AMDGPU/gfx9_fimm32.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_fimm32: + +imm32 +=========================== + +An :ref:`integer_number` or a :ref:`floating-point_number`. The value is converted to *f32* as described :ref:`here`. + diff --git a/docs/AMDGPU/gfx9_hwreg.rst b/docs/AMDGPU/gfx9_hwreg.rst new file mode 100644 index 0000000000000000000000000000000000000000..7ebb38b42fe01ffa3bb41a738c87a5bf0a1feb0e --- /dev/null +++ b/docs/AMDGPU/gfx9_hwreg.rst @@ -0,0 +1,61 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_hwreg: + +hwreg +=========================== + +Bits of a hardware register being accessed. + +The bits of this operand have the following meaning: + + ============ =================================== + Bits Description + ============ =================================== + 5:0 Register *id*. + 10:6 First bit *offset* (0..31). + 15:11 *Size* in bits (1..32). + ============ =================================== + +This operand may be specified as a positive 16-bit :ref:`integer_number` or using the syntax described below. + + ==================================== ============================================================================ + Syntax Description + ==================================== ============================================================================ + hwreg({0..63}) All bits of a register indicated by its *id*. + hwreg(<*name*>) All bits of a register indicated by its *name*. + hwreg({0..63}, {0..31}, {1..32}) Register bits indicated by register *id*, first bit *offset* and *size*. + hwreg(<*name*>, {0..31}, {1..32}) Register bits indicated by register *name*, first bit *offset* and *size*. + ==================================== ============================================================================ + +Register *id*, *offset* and *size* must be specified as positive :ref:`integer numbers`. + +Defined register *names* include: + + =================== ========================================== + Name Description + =================== ========================================== + HW_REG_MODE Shader writeable mode bits. + HW_REG_STATUS Shader read-only status. + HW_REG_TRAPSTS Trap status. + HW_REG_HW_ID Id of wave, simd, compute unit, etc. + HW_REG_GPR_ALLOC Per-wave SGPR and VGPR allocation. + HW_REG_LDS_ALLOC Per-wave LDS allocation. + HW_REG_IB_STS Counters of outstanding instructions. + HW_REG_SH_MEM_BASES Memory aperture. + =================== ========================================== + +Examples: + +.. parsed-literal:: + + s_getreg_b32 s2, 0x6 + s_getreg_b32 s2, hwreg(15) + s_getreg_b32 s2, hwreg(51, 1, 31) + s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1) + diff --git a/docs/AMDGPU/gfx9_imm4.rst b/docs/AMDGPU/gfx9_imm4.rst new file mode 100644 index 0000000000000000000000000000000000000000..b1c97fb0b29d8c688afc5d58dd54b35df1cc7e88 --- /dev/null +++ b/docs/AMDGPU/gfx9_imm4.rst @@ -0,0 +1,25 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_imm4: + +imm4 +=========================== + +A positive :ref:`integer_number`. The value is truncated to 4 bits. + +This operand is a mask which controls indexing mode for operands of subsequent instructions. Value 1 enables indexing and value 0 disables it. + + ============ ======================================== + Bit Meaning + ============ ======================================== + 0 Enables or disables *src0* indexing. + 1 Enables or disables *src1* indexing. + 2 Enables or disables *src2* indexing. + 3 Enables or disables *dst* indexing. + ============ ======================================== + diff --git a/docs/AMDGPU/gfx9_label.rst b/docs/AMDGPU/gfx9_label.rst new file mode 100644 index 0000000000000000000000000000000000000000..32771722f71de630e511e937daaed1b32211967c --- /dev/null +++ b/docs/AMDGPU/gfx9_label.rst @@ -0,0 +1,30 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_label: + +label +=========================== + +A branch target which is a 16-bit signed integer treated as a PC-relative dword offset. + +This operand may be specified as: + +* An :ref:`integer_number`. The number is truncated to 16 bits. +* An :ref:`absolute_expression` which must start with an :ref:`integer_number`. The value of the expression is truncated to 16 bits. +* A :ref:`symbol` (for example, a label). The value is handled as a 16-bit PC-relative dword offset to be resolved by a linker. + +Examples: + +.. parsed-literal:: + + offset = 30 + s_branch loop_end + s_branch 2 + offset + s_branch 32 + loop_end: + diff --git a/docs/AMDGPU/gfx9_mad_type_dev.rst b/docs/AMDGPU/gfx9_mad_type_dev.rst new file mode 100644 index 0000000000000000000000000000000000000000..53c78e441128883e447d0b2ef2573c59a2d5861e --- /dev/null +++ b/docs/AMDGPU/gfx9_mad_type_dev.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_mad_type_dev: + +fx +=========================== + +This is an *f32* or *f16* operand depending on instruction modifiers: + +* Operand size is controlled by :ref:`mad_mix_op_sel_hi`. +* Location of 16-bit operand is controlled by :ref:`mad_mix_op_sel`. + diff --git a/docs/AMDGPU/gfx9_mod_dpp_sdwa_abs_neg.rst b/docs/AMDGPU/gfx9_mod_dpp_sdwa_abs_neg.rst new file mode 100644 index 0000000000000000000000000000000000000000..ccbad8a5c87c5ac647085fc43838546f3a8b9f93 --- /dev/null +++ b/docs/AMDGPU/gfx9_mod_dpp_sdwa_abs_neg.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_mod_dpp_sdwa_abs_neg: + +m +=========================== + +This operand may be used with floating point operand modifiers :ref:`abs` and :ref:`neg`. + diff --git a/docs/AMDGPU/gfx9_mod_sdwa_sext.rst b/docs/AMDGPU/gfx9_mod_sdwa_sext.rst new file mode 100644 index 0000000000000000000000000000000000000000..e832097f340e674d740bb0ecf75492bc2c9684bd --- /dev/null +++ b/docs/AMDGPU/gfx9_mod_sdwa_sext.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_mod_sdwa_sext: + +m +=========================== + +This operand may be used with integer operand modifier :ref:`sext`. + diff --git a/docs/AMDGPU/gfx9_mod_vop3_abs_neg.rst b/docs/AMDGPU/gfx9_mod_vop3_abs_neg.rst new file mode 100644 index 0000000000000000000000000000000000000000..2dac4b1bd7d3684f5cecfd7d534fe9c4f765dd0a --- /dev/null +++ b/docs/AMDGPU/gfx9_mod_vop3_abs_neg.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_mod_vop3_abs_neg: + +m +=========================== + +This operand may be used with floating point operand modifiers :ref:`abs` and :ref:`neg`. + diff --git a/docs/AMDGPU/gfx9_msg.rst b/docs/AMDGPU/gfx9_msg.rst new file mode 100644 index 0000000000000000000000000000000000000000..f18cff46ecb24a8ac1c3c030dbed2ea7643bddfd --- /dev/null +++ b/docs/AMDGPU/gfx9_msg.rst @@ -0,0 +1,72 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_msg: + +msg +=========================== + +A 16-bit message code. The bits of this operand have the following meaning: + + ============ ====================================================== + Bits Description + ============ ====================================================== + 3:0 Message *type*. + 6:4 Optional *operation*. + 9:7 Optional *parameters*. + 15:10 Unused. + ============ ====================================================== + +This operand may be specified as a positive 16-bit :ref:`integer_number` or using the syntax described below: + + ======================================== ======================================================================== + Syntax Description + ======================================== ======================================================================== + sendmsg(<*type*>) A message identified by its *type*. + sendmsg(<*type*>, <*op*>) A message identified by its *type* and *operation*. + sendmsg(<*type*>, <*op*>, <*stream*>) A message identified by its *type* and *operation* with a stream *id*. + ======================================== ======================================================================== + +*Type* may be specified using message *name* or message *id*. + +*Op* may be specified using operation *name* or operation *id*. + +Stream *id* is an integer in the range 0..3. + +Message *id*, operation *id* and stream *id* must be specified as positive :ref:`integer numbers`. + +Each message type supports specific operations: + + ================= ========== ============================== ============ ========== + Message name Message Id Supported Operations Operation Id Stream Id + ================= ========== ============================== ============ ========== + MSG_INTERRUPT 1 \- \- \- + MSG_GS 2 GS_OP_CUT 1 Optional + \ GS_OP_EMIT 2 Optional + \ GS_OP_EMIT_CUT 3 Optional + MSG_GS_DONE 3 GS_OP_NOP 0 \- + \ GS_OP_CUT 1 Optional + \ GS_OP_EMIT 2 Optional + \ GS_OP_EMIT_CUT 3 Optional + MSG_SYSMSG 15 SYSMSG_OP_ECC_ERR_INTERRUPT 1 \- + \ SYSMSG_OP_REG_RD 2 \- + \ SYSMSG_OP_HOST_TRAP_ACK 3 \- + \ SYSMSG_OP_TTRACE_PC 4 \- + ================= ========== ============================== ============ ========== + +Examples: + +.. parsed-literal:: + + s_sendmsg 0x12 + s_sendmsg sendmsg(MSG_INTERRUPT) + s_sendmsg sendmsg(2, GS_OP_CUT) + s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT) + s_sendmsg sendmsg(MSG_GS, 2) + s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_EMIT_CUT, 1) + s_sendmsg sendmsg(MSG_SYSMSG, SYSMSG_OP_TTRACE_PC) + diff --git a/docs/AMDGPU/gfx9_offset_buf.rst b/docs/AMDGPU/gfx9_offset_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..ec6cc330072e7e69dfad6d1dde03e5d0d1011189 --- /dev/null +++ b/docs/AMDGPU/gfx9_offset_buf.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_offset_buf: + +soffset +=========================== + +An unsigned byte offset. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`m0`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx9_offset_smem_buf.rst b/docs/AMDGPU/gfx9_offset_smem_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..fdc4b09ee8163a52e26b3735a86623bc83018c84 --- /dev/null +++ b/docs/AMDGPU/gfx9_offset_smem_buf.rst @@ -0,0 +1,19 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_offset_smem_buf: + +soffset +=========================== + +An unsigned byte offset added to the base address to get memory address. + +.. WARNING:: Assembler currently supports 20-bit offsets only. Use :ref:`uimm20` instead of :ref:`uimm21`. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`m0`, :ref:`uimm21` diff --git a/docs/AMDGPU/gfx9_offset_smem_plain.rst b/docs/AMDGPU/gfx9_offset_smem_plain.rst new file mode 100644 index 0000000000000000000000000000000000000000..a58df5593730e5d4ba5143343196e7870dbf9f3c --- /dev/null +++ b/docs/AMDGPU/gfx9_offset_smem_plain.rst @@ -0,0 +1,22 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_offset_smem_plain: + +soffset +=========================== + +An offset added to the base address to get memory address. + +* If offset is specified as a register, it supplies an unsigned byte offset. +* If offset is specified as a 21-bit immediate, it supplies a signed byte offset. + +.. WARNING:: Assembler currently supports 20-bit unsigned offsets only. Use :ref:`uimm20` instead of :ref:`simm21`. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`m0`, :ref:`simm21` diff --git a/docs/AMDGPU/gfx9_opt.rst b/docs/AMDGPU/gfx9_opt.rst new file mode 100644 index 0000000000000000000000000000000000000000..50d73f9fcde9faa15e3d8d9b6c79250800a8b4ac --- /dev/null +++ b/docs/AMDGPU/gfx9_opt.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_opt: + +opt +=========================== + +This is an optional operand. It must be used if and only if :ref:`glc` is specified. + diff --git a/docs/AMDGPU/gfx9_param.rst b/docs/AMDGPU/gfx9_param.rst new file mode 100644 index 0000000000000000000000000000000000000000..2831a65820f81ce27361736dac7eb65c0c26a85d --- /dev/null +++ b/docs/AMDGPU/gfx9_param.rst @@ -0,0 +1,22 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_param: + +param +=========================== + +Interpolation parameter to read: + + ============ =================================== + Syntax Description + ============ =================================== + p0 Parameter *P0*. + p10 Parameter *P10*. + p20 Parameter *P20*. + ============ =================================== + diff --git a/docs/AMDGPU/gfx9_perm_smem.rst b/docs/AMDGPU/gfx9_perm_smem.rst new file mode 100644 index 0000000000000000000000000000000000000000..370fb0d67b33fed20979dc4de1efdbee4f9a94e9 --- /dev/null +++ b/docs/AMDGPU/gfx9_perm_smem.rst @@ -0,0 +1,24 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_perm_smem: + +imm3 +=========================== + +A bit mask which indicates request permissions. + +This operand must be specified as an :ref:`integer_number`. The value is truncated to 7 bits, but only 3 low bits are significant. + + ============ ============================== + Bit Number Description + ============ ============================== + 0 Request *read* permission. + 1 Request *write* permission. + 2 Request *execute* permission. + ============ ============================== + diff --git a/docs/AMDGPU/gfx9_ret.rst b/docs/AMDGPU/gfx9_ret.rst new file mode 100644 index 0000000000000000000000000000000000000000..7015be6298d70749724676218d529a1195f9829e --- /dev/null +++ b/docs/AMDGPU/gfx9_ret.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_ret: + +dst +=========================== + +This is an input operand. It may optionally serve as a destination if :ref:`glc` is specified. + diff --git a/docs/AMDGPU/gfx9_rsrc_buf.rst b/docs/AMDGPU/gfx9_rsrc_buf.rst new file mode 100644 index 0000000000000000000000000000000000000000..3dc17753d61222dcde09e29f75e894f06f87e1d6 --- /dev/null +++ b/docs/AMDGPU/gfx9_rsrc_buf.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_rsrc_buf: + +srsrc +=========================== + +Buffer resource constant which defines the address and characteristics of the buffer in memory. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_rsrc_mimg.rst b/docs/AMDGPU/gfx9_rsrc_mimg.rst new file mode 100644 index 0000000000000000000000000000000000000000..29c90a14b8a7a109588ddf8fb4e31e00b36d59b4 --- /dev/null +++ b/docs/AMDGPU/gfx9_rsrc_mimg.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_rsrc_mimg: + +srsrc +=========================== + +Image resource constant which defines the location of the image buffer in memory, its dimensions, tiling, and data format. + +*Size:* 8 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_saddr_flat_global.rst b/docs/AMDGPU/gfx9_saddr_flat_global.rst new file mode 100644 index 0000000000000000000000000000000000000000..7396df0bc726b923e2bbebb0f5ccb6e9a431099a --- /dev/null +++ b/docs/AMDGPU/gfx9_saddr_flat_global.rst @@ -0,0 +1,19 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_saddr_flat_global: + +saddr +=========================== + +An optional 64-bit flat global address. Must be specified as :ref:`off` if not used. + +See :ref:`vaddr` for description of available addressing modes. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`exec`, :ref:`off` diff --git a/docs/AMDGPU/gfx9_saddr_flat_scratch.rst b/docs/AMDGPU/gfx9_saddr_flat_scratch.rst new file mode 100644 index 0000000000000000000000000000000000000000..5bdbf394af1aeff480512fb6e6008d7575495fcb --- /dev/null +++ b/docs/AMDGPU/gfx9_saddr_flat_scratch.rst @@ -0,0 +1,19 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_saddr_flat_scratch: + +saddr +=========================== + +An optional 32-bit flat scratch offset. Must be specified as :ref:`off` if not used. + +Either this operand or :ref:`vaddr` must be set to :ref:`off`. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`exec`, :ref:`off` diff --git a/docs/AMDGPU/gfx9_samp_mimg.rst b/docs/AMDGPU/gfx9_samp_mimg.rst new file mode 100644 index 0000000000000000000000000000000000000000..f901142909bb97b35660dee7637018f28bf2f6a0 --- /dev/null +++ b/docs/AMDGPU/gfx9_samp_mimg.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_samp_mimg: + +ssamp +=========================== + +Sampler constant used to specify filtering options applied to the image data after it is read. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_sdata128_0.rst b/docs/AMDGPU/gfx9_sdata128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..d3609d497f6671e9aad4167f7d3f471def155e2c --- /dev/null +++ b/docs/AMDGPU/gfx9_sdata128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_sdata128_0: + +sdata +=========================== + +Instruction input. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_sdata32_0.rst b/docs/AMDGPU/gfx9_sdata32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..4de36350dc7a94637f53594b5260aba5c551cd58 --- /dev/null +++ b/docs/AMDGPU/gfx9_sdata32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_sdata32_0: + +sdata +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_sdata64_0.rst b/docs/AMDGPU/gfx9_sdata64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..7cde210d7b4f667a2e39c23360602fd6ca898d84 --- /dev/null +++ b/docs/AMDGPU/gfx9_sdata64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_sdata64_0: + +sdata +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_sdst128_0.rst b/docs/AMDGPU/gfx9_sdst128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..974df548f465b9cda48f72d19e00a26c32c6c411 --- /dev/null +++ b/docs/AMDGPU/gfx9_sdst128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_sdst128_0: + +sdst +=========================== + +Instruction output. + +*Size:* 4 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_sdst256_0.rst b/docs/AMDGPU/gfx9_sdst256_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..b492921d47a5246afe460996509a2530a983bf5e --- /dev/null +++ b/docs/AMDGPU/gfx9_sdst256_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_sdst256_0: + +sdst +=========================== + +Instruction output. + +*Size:* 8 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_sdst32_0.rst b/docs/AMDGPU/gfx9_sdst32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..911c843e7f47abf4905d7a2a62da5c36016e1620 --- /dev/null +++ b/docs/AMDGPU/gfx9_sdst32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_sdst32_0: + +sdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_sdst32_1.rst b/docs/AMDGPU/gfx9_sdst32_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..8f63024f18721a67599e263419176f53d5fc99f6 --- /dev/null +++ b/docs/AMDGPU/gfx9_sdst32_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_sdst32_1: + +sdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`m0`, :ref:`exec` diff --git a/docs/AMDGPU/gfx9_sdst32_2.rst b/docs/AMDGPU/gfx9_sdst32_2.rst new file mode 100644 index 0000000000000000000000000000000000000000..04f3cda2d08a63d559f7965f41d694b13562dc69 --- /dev/null +++ b/docs/AMDGPU/gfx9_sdst32_2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_sdst32_2: + +sdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_sdst512_0.rst b/docs/AMDGPU/gfx9_sdst512_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..7f7dab60aa42a4153d568dc5dd1670804c38b648 --- /dev/null +++ b/docs/AMDGPU/gfx9_sdst512_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_sdst512_0: + +sdst +=========================== + +Instruction output. + +*Size:* 16 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_sdst64_0.rst b/docs/AMDGPU/gfx9_sdst64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..dc5f4c72fb98258e353339723c456938068593ef --- /dev/null +++ b/docs/AMDGPU/gfx9_sdst64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_sdst64_0: + +sdst +=========================== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_sdst64_1.rst b/docs/AMDGPU/gfx9_sdst64_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..208d68b41715f6293a55716e5b8e956d246eae76 --- /dev/null +++ b/docs/AMDGPU/gfx9_sdst64_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_sdst64_1: + +sdst +=========================== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`exec` diff --git a/docs/AMDGPU/gfx9_simm16.rst b/docs/AMDGPU/gfx9_simm16.rst new file mode 100644 index 0000000000000000000000000000000000000000..47b200a72070c2377db3bc579d0144bf6596e367 --- /dev/null +++ b/docs/AMDGPU/gfx9_simm16.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_simm16: + +imm16 +=========================== + +An :ref:`integer_number`. The value is truncated to 16 bits and then sign-extended to 32 bits. + diff --git a/docs/AMDGPU/gfx9_src32_0.rst b/docs/AMDGPU/gfx9_src32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..288ccbb37d095ab6cae3231742585ff576a41bc4 --- /dev/null +++ b/docs/AMDGPU/gfx9_src32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_src32_0: + +src +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`m0`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx9_src32_1.rst b/docs/AMDGPU/gfx9_src32_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..a06764da8b221dd7fab7566cd5246b6a3a5c3074 --- /dev/null +++ b/docs/AMDGPU/gfx9_src32_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_src32_1: + +src +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`m0`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx9_src64_0.rst b/docs/AMDGPU/gfx9_src64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..f8ef842ef4f16edd256c675a8c0fba15870c8192 --- /dev/null +++ b/docs/AMDGPU/gfx9_src64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_src64_0: + +src +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx9_src64_1.rst b/docs/AMDGPU/gfx9_src64_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..fe7b7fda1f5e689bbc63e6cc02b9e3bc6ea6e993 --- /dev/null +++ b/docs/AMDGPU/gfx9_src64_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_src64_1: + +src +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx9_src_exp.rst b/docs/AMDGPU/gfx9_src_exp.rst new file mode 100644 index 0000000000000000000000000000000000000000..91a5d53b92ba5a3246af9a755863a58f973ee9e2 --- /dev/null +++ b/docs/AMDGPU/gfx9_src_exp.rst @@ -0,0 +1,28 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_src_exp: + +vsrc +=========================== + +Data to copy to export buffers. This is an optional operand. Must be specified as :ref:`off` if not used. + +:ref:`compr` modifier indicates use of compressed (16-bit) data. This limits number of source operands from 4 to 2: + +* src0 and src1 must specify the first register (or :ref:`off`). +* src2 and src3 must specify the second register (or :ref:`off`). + +An example: + +.. parsed-literal:: + + exp mrtz v3, v3, off, off compr + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`off` diff --git a/docs/AMDGPU/gfx9_ssrc32_0.rst b/docs/AMDGPU/gfx9_ssrc32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..25d6b3744fd64cf5c5bbfffcebbb24eaeed72fc2 --- /dev/null +++ b/docs/AMDGPU/gfx9_ssrc32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_ssrc32_0: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`m0`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx9_ssrc32_1.rst b/docs/AMDGPU/gfx9_ssrc32_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..caea7640661489c80b8b1eb29584d1279f16e895 --- /dev/null +++ b/docs/AMDGPU/gfx9_ssrc32_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_ssrc32_1: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_ssrc32_2.rst b/docs/AMDGPU/gfx9_ssrc32_2.rst new file mode 100644 index 0000000000000000000000000000000000000000..034f20e30ee915d658e645f599d558ccc3a331d5 --- /dev/null +++ b/docs/AMDGPU/gfx9_ssrc32_2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_ssrc32_2: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`m0`, :ref:`exec` diff --git a/docs/AMDGPU/gfx9_ssrc32_3.rst b/docs/AMDGPU/gfx9_ssrc32_3.rst new file mode 100644 index 0000000000000000000000000000000000000000..1b08cad751d822fc99a83c774737bd0e3c44efe5 --- /dev/null +++ b/docs/AMDGPU/gfx9_ssrc32_3.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_ssrc32_3: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`m0`, :ref:`iconst` diff --git a/docs/AMDGPU/gfx9_ssrc32_4.rst b/docs/AMDGPU/gfx9_ssrc32_4.rst new file mode 100644 index 0000000000000000000000000000000000000000..7e5542737bd88cf3ba664e0302a124c395d7b528 --- /dev/null +++ b/docs/AMDGPU/gfx9_ssrc32_4.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_ssrc32_4: + +ssrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`m0`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx9_ssrc64_0.rst b/docs/AMDGPU/gfx9_ssrc64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..b2f86e11cb0d4938d82a2f1583562e7bd2fdbcf2 --- /dev/null +++ b/docs/AMDGPU/gfx9_ssrc64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_ssrc64_0: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`exec`, :ref:`constant`, :ref:`literal` diff --git a/docs/AMDGPU/gfx9_ssrc64_1.rst b/docs/AMDGPU/gfx9_ssrc64_1.rst new file mode 100644 index 0000000000000000000000000000000000000000..02be6c537b72f76894918f4eef2f9456b5c80cc4 --- /dev/null +++ b/docs/AMDGPU/gfx9_ssrc64_1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_ssrc64_1: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp` diff --git a/docs/AMDGPU/gfx9_ssrc64_2.rst b/docs/AMDGPU/gfx9_ssrc64_2.rst new file mode 100644 index 0000000000000000000000000000000000000000..72d2c46d3a9b0dfff88cf5609b0134711e806cb6 --- /dev/null +++ b/docs/AMDGPU/gfx9_ssrc64_2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_ssrc64_2: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`exec`, :ref:`constant` diff --git a/docs/AMDGPU/gfx9_ssrc64_3.rst b/docs/AMDGPU/gfx9_ssrc64_3.rst new file mode 100644 index 0000000000000000000000000000000000000000..3414802295ad07ad39beed38748fc97a1dbc3c28 --- /dev/null +++ b/docs/AMDGPU/gfx9_ssrc64_3.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_ssrc64_3: + +ssrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`flat_scratch`, :ref:`xnack`, :ref:`vcc`, :ref:`ttmp`, :ref:`exec` diff --git a/docs/AMDGPU/gfx9_tgt.rst b/docs/AMDGPU/gfx9_tgt.rst new file mode 100644 index 0000000000000000000000000000000000000000..3ba8baebb3bc0fe2cb2af2cdc5ab2ac04df4e5fe --- /dev/null +++ b/docs/AMDGPU/gfx9_tgt.rst @@ -0,0 +1,24 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_tgt: + +tgt +=========================== + +An export target: + + ============== =================================== + Syntax Description + ============== =================================== + pos{0..3} Copy vertex position 0..3. + param{0..31} Copy vertex parameter 0..31. + mrt{0..7} Copy pixel color to the MRTs 0..7. + mrtz Copy pixel depth (Z) data. + null Copy nothing. + ============== =================================== + diff --git a/docs/AMDGPU/gfx9_type_dev.rst b/docs/AMDGPU/gfx9_type_dev.rst new file mode 100644 index 0000000000000000000000000000000000000000..ae2b0bf6c50c5ac7be0bdfdb4ecd83847e31b273 --- /dev/null +++ b/docs/AMDGPU/gfx9_type_dev.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_type_dev: + +Type deviation +=========================== + +*Type* of this operand differs from *type* :ref:`implied by the opcode`. This tag specifies actual operand *type*. + diff --git a/docs/AMDGPU/gfx9_uimm16.rst b/docs/AMDGPU/gfx9_uimm16.rst new file mode 100644 index 0000000000000000000000000000000000000000..4d1fe1de3f20bfbc8239694e7265c527cd1262b4 --- /dev/null +++ b/docs/AMDGPU/gfx9_uimm16.rst @@ -0,0 +1,14 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_uimm16: + +imm16 +=========================== + +An :ref:`integer_number`. The value is truncated to 16 bits and then zero-extended to 32 bits. + diff --git a/docs/AMDGPU/gfx9_vaddr_flat_global.rst b/docs/AMDGPU/gfx9_vaddr_flat_global.rst new file mode 100644 index 0000000000000000000000000000000000000000..38beb6c44321bcdc81c30e05635439326e91e191 --- /dev/null +++ b/docs/AMDGPU/gfx9_vaddr_flat_global.rst @@ -0,0 +1,22 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vaddr_flat_global: + +vaddr +=========================== + +A 64-bit flat global address or a 32-bit offset depending on addressing mode: + +* Address = :ref:`vaddr` + :ref:`flat_offset13`. :ref:`vaddr` is a 64-bit address. This mode is indicated by :ref:`saddr` set to :ref:`off`. +* Address = :ref:`saddr` + :ref:`vaddr` + :ref:`flat_offset13`. :ref:`vaddr` is a 32-bit offset. This mode is used when :ref:`saddr` is not :ref:`off`. + +.. WARNING:: Assembler currently expects a 64-bit *vaddr* regardless of addressing mode. This have to be fixed. + +*Size:* 1 or 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_vaddr_flat_scratch.rst b/docs/AMDGPU/gfx9_vaddr_flat_scratch.rst new file mode 100644 index 0000000000000000000000000000000000000000..a72cf0662f420be512464b528b164fe463ac18e2 --- /dev/null +++ b/docs/AMDGPU/gfx9_vaddr_flat_scratch.rst @@ -0,0 +1,19 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vaddr_flat_scratch: + +vaddr +=========================== + +An optional 32-bit flat scratch offset. Must be specified as :ref:`off` if not used. + +Either this operand or :ref:`saddr` must be set to :ref:`off`. + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`off` diff --git a/docs/AMDGPU/gfx9_vcc_64.rst b/docs/AMDGPU/gfx9_vcc_64.rst new file mode 100644 index 0000000000000000000000000000000000000000..788306b2a5e46874d3427b0806b7d6e9a5257b05 --- /dev/null +++ b/docs/AMDGPU/gfx9_vcc_64.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vcc_64: + +vcc +=========================== + +Vector condition code. + +*Size:* 2 dwords. + +*Operands:* :ref:`vcc` diff --git a/docs/AMDGPU/gfx9_vdata128_0.rst b/docs/AMDGPU/gfx9_vdata128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..3d2b4905fa3f61e010dec1033acb7ff7ebeb993d --- /dev/null +++ b/docs/AMDGPU/gfx9_vdata128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vdata128_0: + +vdata +=========================== + +Instruction input. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_vdata32_0.rst b/docs/AMDGPU/gfx9_vdata32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..e2857695acfb92e28bb4bc4ac10e86fbbd57bf4a --- /dev/null +++ b/docs/AMDGPU/gfx9_vdata32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vdata32_0: + +vdata +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_vdata64_0.rst b/docs/AMDGPU/gfx9_vdata64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..fcd1d0f827087cbb6c27a0c7384d36a03e546e4d --- /dev/null +++ b/docs/AMDGPU/gfx9_vdata64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vdata64_0: + +vdata +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_vdata96_0.rst b/docs/AMDGPU/gfx9_vdata96_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..8a8cb7647bce709709465f89581210c49cc7e6ba --- /dev/null +++ b/docs/AMDGPU/gfx9_vdata96_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vdata96_0: + +vdata +=========================== + +Instruction input. + +*Size:* 3 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_vdst128_0.rst b/docs/AMDGPU/gfx9_vdst128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..b8bbf89c2729910b7b1bc464efab43ee7450537d --- /dev/null +++ b/docs/AMDGPU/gfx9_vdst128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vdst128_0: + +vdst +=========================== + +Instruction output. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_vdst32_0.rst b/docs/AMDGPU/gfx9_vdst32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..ccee55f7c10615fcb10ba8c044d9d8f7b4ac8d65 --- /dev/null +++ b/docs/AMDGPU/gfx9_vdst32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vdst32_0: + +vdst +=========================== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_vdst64_0.rst b/docs/AMDGPU/gfx9_vdst64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..60bf384f2c11227044bd614799f7ebbb97eee626 --- /dev/null +++ b/docs/AMDGPU/gfx9_vdst64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vdst64_0: + +vdst +=========================== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_vdst96_0.rst b/docs/AMDGPU/gfx9_vdst96_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..834d32a4387c0f1759804abfd5f1ac4a4d3882a4 --- /dev/null +++ b/docs/AMDGPU/gfx9_vdst96_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vdst96_0: + +vdst +=========================== + +Instruction output. + +*Size:* 3 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_vsrc128_0.rst b/docs/AMDGPU/gfx9_vsrc128_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..3e8c6dedf5d5d49008f04c71a313928f31db5cd1 --- /dev/null +++ b/docs/AMDGPU/gfx9_vsrc128_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vsrc128_0: + +vsrc +=========================== + +Instruction input. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_vsrc32_0.rst b/docs/AMDGPU/gfx9_vsrc32_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..a056f3a1dffb726a0f1f1d56779e8fc3ac5c11a7 --- /dev/null +++ b/docs/AMDGPU/gfx9_vsrc32_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vsrc32_0: + +vsrc +=========================== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_vsrc64_0.rst b/docs/AMDGPU/gfx9_vsrc64_0.rst new file mode 100644 index 0000000000000000000000000000000000000000..b91b3405c1db44776e99faa1742ff5fe05cbbf49 --- /dev/null +++ b/docs/AMDGPU/gfx9_vsrc64_0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_vsrc64_0: + +vsrc +=========================== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/docs/AMDGPU/gfx9_waitcnt.rst b/docs/AMDGPU/gfx9_waitcnt.rst new file mode 100644 index 0000000000000000000000000000000000000000..015a51ae8c329485b6fcfff2b3747f0a7666ab45 --- /dev/null +++ b/docs/AMDGPU/gfx9_waitcnt.rst @@ -0,0 +1,56 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid9_waitcnt: + +waitcnt +=========================== + +Counts of outstanding instructions to wait for. + +The bits of this operand have the following meaning: + + ============ ====================================================== + Bits Description + ============ ====================================================== + 3:0 VM_CNT: vector memory operations count, lower bits. + 6:4 EXP_CNT: export count. + 11:8 LGKM_CNT: LDS, GDS, Constant and Message count. + 15:14 VM_CNT: vector memory operations count, upper bits. + ============ ====================================================== + +This operand may be specified as a positive 16-bit :ref:`integer_number` +or as a combination of the following symbolic helpers: + + ====================== ====================================================================== + Syntax Description + ====================== ====================================================================== + vmcnt(<*N*>) VM_CNT value. *N* must not exceed the largest VM_CNT value. + expcnt(<*N*>) EXP_CNT value. *N* must not exceed the largest EXP_CNT value. + lgkmcnt(<*N*>) LGKM_CNT value. *N* must not exceed the largest LGKM_CNT value. + vmcnt_sat(<*N*>) VM_CNT value computed as min(*N*, the largest VM_CNT value). + expcnt_sat(<*N*>) EXP_CNT value computed as min(*N*, the largest EXP_CNT value). + lgkmcnt_sat(<*N*>) LGKM_CNT value computed as min(*N*, the largest LGKM_CNT value). + ====================== ====================================================================== + +These helpers may be specified in any order. Ampersands and commas may be used as optional separators. + +*N* is either an +:ref:`integer number` or an +:ref:`absolute expression`. + +Examples: + +.. parsed-literal:: + + s_waitcnt 0 + s_waitcnt vmcnt(1) + s_waitcnt expcnt(2) lgkmcnt(3) + s_waitcnt vmcnt(1) expcnt(2) lgkmcnt(3) + s_waitcnt vmcnt(1), expcnt(2), lgkmcnt(3) + s_waitcnt vmcnt(1) & lgkmcnt_sat(100) & expcnt(2) + diff --git a/docs/AMDGPUAsmGFX7.rst b/docs/AMDGPUAsmGFX7.rst deleted file mode 100644 index 8973c5035ad64005c0d63afa87b17b221386727f..0000000000000000000000000000000000000000 --- a/docs/AMDGPUAsmGFX7.rst +++ /dev/null @@ -1,1255 +0,0 @@ -.. - ************************************************** - * * - * Automatically generated file, do not edit! * - * * - ************************************************** - -=========================== -Syntax of GFX7 Instructions -=========================== - -.. contents:: - :local: - - -DS -=========================== - -.. parsed-literal:: - - ds_add_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_add_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_add_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_and_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - ds_append dst :ref:`ds_offset16` :ref:`gds` - ds_cmpst_b32 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_b64 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_f32 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_f64 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_f32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_f64 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_condxchg32_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_consume dst :ref:`ds_offset16` :ref:`gds` - ds_dec_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_dec_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_dec_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_dec_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_dec_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_dec_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_gws_barrier src0 :ref:`ds_offset16` :ref:`gds` - ds_gws_init src0 :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_br src0 :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_p src0 :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_release_all src0 :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_v src0 :ref:`ds_offset16` :ref:`gds` - ds_inc_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_inc_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_inc_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_inc_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_inc_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_inc_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_f32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_f64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_i32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_i64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_f32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_f64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_i32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_i64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_f32 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_f64 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_i32 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_i64 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_f32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_f64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_i32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_i64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_f32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_f64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_i32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_i64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_f32 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_f64 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_i32 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_i64 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_mskor_b32 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_mskor_b64 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_mskor_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_mskor_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_nop src0 - ds_or_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_or_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - ds_ordered_count dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read2_b32 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read2_b64 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read2st64_b32 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read2st64_b64 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read_b128 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_b32 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_b64 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_b96 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_i16 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_i8 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_u16 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_u8 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_rsub_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_rsub_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_rsub_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_rsub_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_rsub_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_rsub_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_sub_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_sub_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_swizzle_b32 dst, src0 :ref:`sw_offset16` :ref:`gds` - ds_wrap_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_write2_b32 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write2_b64 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write2st64_b32 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write2st64_b64 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write_b128 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b16 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b8 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b96 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_write_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - ds_wrxchg2_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg2_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg2st64_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg2st64_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_wrxchg_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_xor_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - -EXP -=========================== - -.. parsed-literal:: - - exp dst, src0, src1, src2, src3 :ref:`done` :ref:`compr` :ref:`vm` - -FLAT -=========================== - -.. parsed-literal:: - - flat_atomic_add dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_add_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_and dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_and_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_cmpswap dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_cmpswap_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_dec dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_dec_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_fcmpswap dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_fcmpswap_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_fmax dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_fmax_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_fmin dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_fmin_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_inc dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_inc_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_or dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_or_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_smax dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_smax_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_smin dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_smin_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_sub dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_sub_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_swap dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_swap_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_umax dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_umax_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_umin dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_umin_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_xor dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_xor_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_load_dword dst, src0 :ref:`glc` :ref:`slc` - flat_load_dwordx2 dst, src0 :ref:`glc` :ref:`slc` - flat_load_dwordx3 dst, src0 :ref:`glc` :ref:`slc` - flat_load_dwordx4 dst, src0 :ref:`glc` :ref:`slc` - flat_load_sbyte dst, src0 :ref:`glc` :ref:`slc` - flat_load_sshort dst, src0 :ref:`glc` :ref:`slc` - flat_load_ubyte dst, src0 :ref:`glc` :ref:`slc` - flat_load_ushort dst, src0 :ref:`glc` :ref:`slc` - flat_store_byte src0, src1 :ref:`glc` :ref:`slc` - flat_store_dword src0, src1 :ref:`glc` :ref:`slc` - flat_store_dwordx2 src0, src1 :ref:`glc` :ref:`slc` - flat_store_dwordx3 src0, src1 :ref:`glc` :ref:`slc` - flat_store_dwordx4 src0, src1 :ref:`glc` :ref:`slc` - flat_store_short src0, src1 :ref:`glc` :ref:`slc` - -MIMG -=========================== - -.. parsed-literal:: - - image_atomic_add src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_and src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_cmpswap src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_dec src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_inc src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_or src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_smax src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_smin src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_sub src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_swap src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_umax src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_umin src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_xor src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4 dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_b dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_b_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_b_cl_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_b_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c_b dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c_b_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c_b_cl_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c_b_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c_cl_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c_l dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c_l_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c_lz_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_c_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_cl_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_l dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_l_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_lz_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_get_lod dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_get_resinfo dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load_mip dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load_mip_pck dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load_mip_pck_sgn dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load_pck dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load_pck_sgn dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_b dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_b_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_c dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_c_b dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_c_b_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_c_cd dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_c_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_c_d dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_c_l dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_c_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_l dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_store src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_store_mip src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_store_mip_pck src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_store_pck src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - -MUBUF -=========================== - -.. parsed-literal:: - - buffer_atomic_add src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_add_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_and src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_and_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_cmpswap src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_cmpswap_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_dec src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_dec_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_inc src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_inc_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_or src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_or_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smax src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smax_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smin src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smin_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_sub src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_sub_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_swap src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_swap_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umax src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umax_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umin src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umin_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_xor src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_xor_x2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_dword dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_dwordx2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_dwordx3 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_dwordx4 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_x dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_format_xy dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_xyz dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_xyzw dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_sbyte dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_sshort dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_ubyte dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_ushort dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_store_byte src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dword src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dwordx2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dwordx3 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dwordx4 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_x src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_xy src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_xyz src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_xyzw src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_short src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`addr64` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_wbinvl1 - buffer_wbinvl1_vol - -SMRD -=========================== - -.. parsed-literal:: - - s_buffer_load_dword dst, src0, src1 - s_buffer_load_dwordx16 dst, src0, src1 - s_buffer_load_dwordx2 dst, src0, src1 - s_buffer_load_dwordx4 dst, src0, src1 - s_buffer_load_dwordx8 dst, src0, src1 - s_dcache_inv - s_dcache_inv_vol - s_load_dword dst, src0, src1 - s_load_dwordx16 dst, src0, src1 - s_load_dwordx2 dst, src0, src1 - s_load_dwordx4 dst, src0, src1 - s_load_dwordx8 dst, src0, src1 - s_memtime dst - -SOP1 -=========================== - -.. parsed-literal:: - - s_abs_i32 dst, src0 - s_and_saveexec_b64 dst, src0 - s_andn2_saveexec_b64 dst, src0 - s_bcnt0_i32_b32 dst, src0 - s_bcnt0_i32_b64 dst, src0 - s_bcnt1_i32_b32 dst, src0 - s_bcnt1_i32_b64 dst, src0 - s_bitset0_b32 dst, src0 - s_bitset0_b64 dst, src0 - s_bitset1_b32 dst, src0 - s_bitset1_b64 dst, src0 - s_brev_b32 dst, src0 - s_brev_b64 dst, src0 - s_cbranch_join src0 - s_cmov_b32 dst, src0 - s_cmov_b64 dst, src0 - s_ff0_i32_b32 dst, src0 - s_ff0_i32_b64 dst, src0 - s_ff1_i32_b32 dst, src0 - s_ff1_i32_b64 dst, src0 - s_flbit_i32 dst, src0 - s_flbit_i32_b32 dst, src0 - s_flbit_i32_b64 dst, src0 - s_flbit_i32_i64 dst, src0 - s_getpc_b64 dst - s_mov_b32 dst, src0 - s_mov_b64 dst, src0 - s_mov_fed_b32 dst, src0 - s_movreld_b32 dst, src0 - s_movreld_b64 dst, src0 - s_movrels_b32 dst, src0 - s_movrels_b64 dst, src0 - s_nand_saveexec_b64 dst, src0 - s_nor_saveexec_b64 dst, src0 - s_not_b32 dst, src0 - s_not_b64 dst, src0 - s_or_saveexec_b64 dst, src0 - s_orn2_saveexec_b64 dst, src0 - s_quadmask_b32 dst, src0 - s_quadmask_b64 dst, src0 - s_rfe_b64 src0 - s_setpc_b64 src0 - s_sext_i32_i16 dst, src0 - s_sext_i32_i8 dst, src0 - s_swappc_b64 dst, src0 - s_wqm_b32 dst, src0 - s_wqm_b64 dst, src0 - s_xnor_saveexec_b64 dst, src0 - s_xor_saveexec_b64 dst, src0 - -SOP2 -=========================== - -.. parsed-literal:: - - s_absdiff_i32 dst, src0, src1 - s_add_i32 dst, src0, src1 - s_add_u32 dst, src0, src1 - s_addc_u32 dst, src0, src1 - s_and_b32 dst, src0, src1 - s_and_b64 dst, src0, src1 - s_andn2_b32 dst, src0, src1 - s_andn2_b64 dst, src0, src1 - s_ashr_i32 dst, src0, src1 - s_ashr_i64 dst, src0, src1 - s_bfe_i32 dst, src0, src1 - s_bfe_i64 dst, src0, src1 - s_bfe_u32 dst, src0, src1 - s_bfe_u64 dst, src0, src1 - s_bfm_b32 dst, src0, src1 - s_bfm_b64 dst, src0, src1 - s_cbranch_g_fork src0, src1 - s_cselect_b32 dst, src0, src1 - s_cselect_b64 dst, src0, src1 - s_lshl_b32 dst, src0, src1 - s_lshl_b64 dst, src0, src1 - s_lshr_b32 dst, src0, src1 - s_lshr_b64 dst, src0, src1 - s_max_i32 dst, src0, src1 - s_max_u32 dst, src0, src1 - s_min_i32 dst, src0, src1 - s_min_u32 dst, src0, src1 - s_mul_i32 dst, src0, src1 - s_nand_b32 dst, src0, src1 - s_nand_b64 dst, src0, src1 - s_nor_b32 dst, src0, src1 - s_nor_b64 dst, src0, src1 - s_or_b32 dst, src0, src1 - s_or_b64 dst, src0, src1 - s_orn2_b32 dst, src0, src1 - s_orn2_b64 dst, src0, src1 - s_sub_i32 dst, src0, src1 - s_sub_u32 dst, src0, src1 - s_subb_u32 dst, src0, src1 - s_xnor_b32 dst, src0, src1 - s_xnor_b64 dst, src0, src1 - s_xor_b32 dst, src0, src1 - s_xor_b64 dst, src0, src1 - -SOPC -=========================== - -.. parsed-literal:: - - s_bitcmp0_b32 src0, src1 - s_bitcmp0_b64 src0, src1 - s_bitcmp1_b32 src0, src1 - s_bitcmp1_b64 src0, src1 - s_cmp_eq_i32 src0, src1 - s_cmp_eq_u32 src0, src1 - s_cmp_ge_i32 src0, src1 - s_cmp_ge_u32 src0, src1 - s_cmp_gt_i32 src0, src1 - s_cmp_gt_u32 src0, src1 - s_cmp_le_i32 src0, src1 - s_cmp_le_u32 src0, src1 - s_cmp_lg_i32 src0, src1 - s_cmp_lg_u32 src0, src1 - s_cmp_lt_i32 src0, src1 - s_cmp_lt_u32 src0, src1 - s_setvskip src0, src1 - -SOPK -=========================== - -.. parsed-literal:: - - s_addk_i32 dst, src0 - s_cbranch_i_fork src0, src1 - s_cmovk_i32 dst, src0 - s_cmpk_eq_i32 src0, src1 - s_cmpk_eq_u32 src0, src1 - s_cmpk_ge_i32 src0, src1 - s_cmpk_ge_u32 src0, src1 - s_cmpk_gt_i32 src0, src1 - s_cmpk_gt_u32 src0, src1 - s_cmpk_le_i32 src0, src1 - s_cmpk_le_u32 src0, src1 - s_cmpk_lg_i32 src0, src1 - s_cmpk_lg_u32 src0, src1 - s_cmpk_lt_i32 src0, src1 - s_cmpk_lt_u32 src0, src1 - s_getreg_b32 dst, src0 - s_movk_i32 dst, src0 - s_mulk_i32 dst, src0 - s_setreg_b32 dst, src0 - s_setreg_imm32_b32 dst, src0 - -SOPP -=========================== - -.. parsed-literal:: - - s_barrier - s_branch src0 - s_cbranch_cdbgsys src0 - s_cbranch_cdbgsys_and_user src0 - s_cbranch_cdbgsys_or_user src0 - s_cbranch_cdbguser src0 - s_cbranch_execnz src0 - s_cbranch_execz src0 - s_cbranch_scc0 src0 - s_cbranch_scc1 src0 - s_cbranch_vccnz src0 - s_cbranch_vccz src0 - s_decperflevel src0 - s_endpgm - s_icache_inv - s_incperflevel src0 - s_nop src0 - s_sendmsg src0 - s_sendmsghalt src0 - s_sethalt src0 - s_setkill src0 - s_setprio src0 - s_sleep src0 - s_trap src0 - s_ttracedata - s_waitcnt src0 - -VINTRP -=========================== - -.. parsed-literal:: - - v_interp_mov_f32 dst, src0, src1 - v_interp_p1_f32 dst, src0, src1 - v_interp_p2_f32 dst, src0, src1 - -VOP1 -=========================== - -.. parsed-literal:: - - v_bfrev_b32 dst, src0 - v_ceil_f32 dst, src0 - v_ceil_f64 dst, src0 - v_clrexcp - v_cos_f32 dst, src0 - v_cvt_f16_f32 dst, src0 - v_cvt_f32_f16 dst, src0 - v_cvt_f32_f64 dst, src0 - v_cvt_f32_i32 dst, src0 - v_cvt_f32_u32 dst, src0 - v_cvt_f32_ubyte0 dst, src0 - v_cvt_f32_ubyte1 dst, src0 - v_cvt_f32_ubyte2 dst, src0 - v_cvt_f32_ubyte3 dst, src0 - v_cvt_f64_f32 dst, src0 - v_cvt_f64_i32 dst, src0 - v_cvt_f64_u32 dst, src0 - v_cvt_flr_i32_f32 dst, src0 - v_cvt_i32_f32 dst, src0 - v_cvt_i32_f64 dst, src0 - v_cvt_off_f32_i4 dst, src0 - v_cvt_rpi_i32_f32 dst, src0 - v_cvt_u32_f32 dst, src0 - v_cvt_u32_f64 dst, src0 - v_exp_f32 dst, src0 - v_exp_legacy_f32 dst, src0 - v_ffbh_i32 dst, src0 - v_ffbh_u32 dst, src0 - v_ffbl_b32 dst, src0 - v_floor_f32 dst, src0 - v_floor_f64 dst, src0 - v_fract_f32 dst, src0 - v_fract_f64 dst, src0 - v_frexp_exp_i32_f32 dst, src0 - v_frexp_exp_i32_f64 dst, src0 - v_frexp_mant_f32 dst, src0 - v_frexp_mant_f64 dst, src0 - v_log_clamp_f32 dst, src0 - v_log_f32 dst, src0 - v_log_legacy_f32 dst, src0 - v_mov_b32 dst, src0 - v_mov_fed_b32 dst, src0 - v_movreld_b32 dst, src0 - v_movrels_b32 dst, src0 - v_movrelsd_b32 dst, src0 - v_nop - v_not_b32 dst, src0 - v_rcp_clamp_f32 dst, src0 - v_rcp_clamp_f64 dst, src0 - v_rcp_f32 dst, src0 - v_rcp_f64 dst, src0 - v_rcp_iflag_f32 dst, src0 - v_rcp_legacy_f32 dst, src0 - v_readfirstlane_b32 dst, src0 - v_rndne_f32 dst, src0 - v_rndne_f64 dst, src0 - v_rsq_clamp_f32 dst, src0 - v_rsq_clamp_f64 dst, src0 - v_rsq_f32 dst, src0 - v_rsq_f64 dst, src0 - v_rsq_legacy_f32 dst, src0 - v_sin_f32 dst, src0 - v_sqrt_f32 dst, src0 - v_sqrt_f64 dst, src0 - v_trunc_f32 dst, src0 - v_trunc_f64 dst, src0 - -VOP2 -=========================== - -.. parsed-literal:: - - v_add_f32 dst, src0, src1 - v_add_i32 dst0, dst1, src0, src1 - v_addc_u32 dst0, dst1, src0, src1, src2 - v_and_b32 dst, src0, src1 - v_ashr_i32 dst, src0, src1 - v_ashrrev_i32 dst, src0, src1 - v_bcnt_u32_b32 dst, src0, src1 - v_bfm_b32 dst, src0, src1 - v_cndmask_b32 dst, src0, src1, src2 - v_cvt_pk_i16_i32 dst, src0, src1 - v_cvt_pk_u16_u32 dst, src0, src1 - v_cvt_pkaccum_u8_f32 dst, src0, src1 - v_cvt_pknorm_i16_f32 dst, src0, src1 - v_cvt_pknorm_u16_f32 dst, src0, src1 - v_cvt_pkrtz_f16_f32 dst, src0, src1 - v_ldexp_f32 dst, src0, src1 - v_lshl_b32 dst, src0, src1 - v_lshlrev_b32 dst, src0, src1 - v_lshr_b32 dst, src0, src1 - v_lshrrev_b32 dst, src0, src1 - v_mac_f32 dst, src0, src1 - v_mac_legacy_f32 dst, src0, src1 - v_madak_f32 dst, src0, src1, src2 - v_madmk_f32 dst, src0, src1, src2 - v_max_f32 dst, src0, src1 - v_max_i32 dst, src0, src1 - v_max_legacy_f32 dst, src0, src1 - v_max_u32 dst, src0, src1 - v_mbcnt_hi_u32_b32 dst, src0, src1 - v_mbcnt_lo_u32_b32 dst, src0, src1 - v_min_f32 dst, src0, src1 - v_min_i32 dst, src0, src1 - v_min_legacy_f32 dst, src0, src1 - v_min_u32 dst, src0, src1 - v_mul_f32 dst, src0, src1 - v_mul_hi_i32_i24 dst, src0, src1 - v_mul_hi_u32_u24 dst, src0, src1 - v_mul_i32_i24 dst, src0, src1 - v_mul_legacy_f32 dst, src0, src1 - v_mul_u32_u24 dst, src0, src1 - v_or_b32 dst, src0, src1 - v_readlane_b32 dst, src0, src1 - v_sub_f32 dst, src0, src1 - v_sub_i32 dst0, dst1, src0, src1 - v_subb_u32 dst0, dst1, src0, src1, src2 - v_subbrev_u32 dst0, dst1, src0, src1, src2 - v_subrev_f32 dst, src0, src1 - v_subrev_i32 dst0, dst1, src0, src1 - v_writelane_b32 dst, src0, src1 - v_xor_b32 dst, src0, src1 - -VOP3 -=========================== - -.. parsed-literal:: - - v_add_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_add_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_add_i32_e64 dst0, dst1, src0, src1 :ref:`omod` - v_addc_u32_e64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_alignbit_b32 dst, src0, src1, src2 :ref:`omod` - v_alignbyte_b32 dst, src0, src1, src2 :ref:`omod` - v_and_b32_e64 dst, src0, src1 :ref:`omod` - v_ashr_i32_e64 dst, src0, src1 :ref:`omod` - v_ashr_i64 dst, src0, src1 :ref:`omod` - v_ashrrev_i32_e64 dst, src0, src1 :ref:`omod` - v_bcnt_u32_b32_e64 dst, src0, src1 :ref:`omod` - v_bfe_i32 dst, src0, src1, src2 :ref:`omod` - v_bfe_u32 dst, src0, src1, src2 :ref:`omod` - v_bfi_b32 dst, src0, src1, src2 :ref:`omod` - v_bfm_b32_e64 dst, src0, src1 :ref:`omod` - v_bfrev_b32_e64 dst, src0 :ref:`omod` - v_ceil_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_ceil_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_clrexcp_e64 :ref:`omod` - v_cmp_class_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_class_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_lg_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_lg_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_neq_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_neq_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_nge_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_nge_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ngt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ngt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_nle_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_nle_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_nlg_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_nlg_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_nlt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_nlt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_o_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_o_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_tru_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_tru_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_u_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_u_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_eq_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_eq_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_f_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_f_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_ge_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_ge_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_gt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_gt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_le_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_le_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_lg_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_lg_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_lt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_lt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_neq_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_neq_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_nge_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_nge_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_ngt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_ngt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_nle_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_nle_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_nlg_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_nlg_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_nlt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_nlt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_o_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_o_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_tru_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_tru_f64_e64 dst, src0, src1 :ref:`omod` - v_cmps_u_f32_e64 dst, src0, src1 :ref:`omod` - v_cmps_u_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_eq_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_eq_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_f_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_f_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_ge_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_ge_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_gt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_gt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_le_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_le_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_lg_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_lg_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_lt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_lt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_neq_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_neq_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_nge_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_nge_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_ngt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_ngt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_nle_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_nle_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_nlg_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_nlg_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_nlt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_nlt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_o_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_o_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_tru_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_tru_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_u_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpsx_u_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_class_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_class_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lg_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lg_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_neq_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_neq_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_nge_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_nge_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ngt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ngt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_nle_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_nle_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_nlg_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_nlg_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_nlt_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_nlt_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_o_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_o_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_tru_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_tru_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_u_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_u_f64_e64 dst, src0, src1 :ref:`omod` - v_cndmask_b32_e64 dst, src0, src1, src2 :ref:`omod` - v_cos_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cubeid_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cubema_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cubesc_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cubetc_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cvt_f16_f32_e64 dst, src0 :ref:`omod` - v_cvt_f32_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_i32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_u32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_ubyte0_e64 dst, src0 :ref:`omod` - v_cvt_f32_ubyte1_e64 dst, src0 :ref:`omod` - v_cvt_f32_ubyte2_e64 dst, src0 :ref:`omod` - v_cvt_f32_ubyte3_e64 dst, src0 :ref:`omod` - v_cvt_f64_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f64_i32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f64_u32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_flr_i32_f32_e64 dst, src0 :ref:`omod` - v_cvt_i32_f32_e64 dst, src0 :ref:`omod` - v_cvt_i32_f64_e64 dst, src0 :ref:`omod` - v_cvt_off_f32_i4_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_pk_i16_i32_e64 dst, src0, src1 :ref:`omod` - v_cvt_pk_u16_u32_e64 dst, src0, src1 :ref:`omod` - v_cvt_pk_u8_f32 dst, src0, src1, src2 :ref:`omod` - v_cvt_pkaccum_u8_f32_e64 dst, src0, src1 :ref:`omod` - v_cvt_pknorm_i16_f32_e64 dst, src0, src1 :ref:`omod` - v_cvt_pknorm_u16_f32_e64 dst, src0, src1 :ref:`omod` - v_cvt_pkrtz_f16_f32_e64 dst, src0, src1 :ref:`omod` - v_cvt_rpi_i32_f32_e64 dst, src0 :ref:`omod` - v_cvt_u32_f32_e64 dst, src0 :ref:`omod` - v_cvt_u32_f64_e64 dst, src0 :ref:`omod` - v_div_fixup_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_fixup_f64 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_fmas_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_fmas_f64 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_scale_f32 dst0, dst1, src0, src1, src2 :ref:`omod` - v_div_scale_f64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_exp_f32_e64 dst, src0 :ref:`omod` - v_exp_legacy_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_ffbh_i32_e64 dst, src0 :ref:`omod` - v_ffbh_u32_e64 dst, src0 :ref:`omod` - v_ffbl_b32_e64 dst, src0 :ref:`omod` - v_floor_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_floor_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_fma_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_fma_f64 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_fract_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_fract_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_frexp_exp_i32_f32_e64 dst, src0 :ref:`omod` - v_frexp_exp_i32_f64_e64 dst, src0 :ref:`omod` - v_frexp_mant_f32_e64 dst, src0 :ref:`omod` - v_frexp_mant_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_ldexp_f32_e64 dst, src0, src1 :ref:`omod` - v_ldexp_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_lerp_u8 dst, src0, src1, src2 :ref:`omod` - v_log_clamp_f32_e64 dst, src0 :ref:`omod` - v_log_f32_e64 dst, src0 :ref:`omod` - v_log_legacy_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_lshl_b32_e64 dst, src0, src1 :ref:`omod` - v_lshl_b64 dst, src0, src1 :ref:`omod` - v_lshlrev_b32_e64 dst, src0, src1 :ref:`omod` - v_lshr_b32_e64 dst, src0, src1 :ref:`omod` - v_lshr_b64 dst, src0, src1 :ref:`omod` - v_lshrrev_b32_e64 dst, src0, src1 :ref:`omod` - v_mac_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mac_legacy_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mad_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_i32_i24 dst, src0, src1, src2 :ref:`omod` - v_mad_i64_i32 dst0, dst1, src0, src1, src2 :ref:`omod` - v_mad_legacy_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_u32_u24 dst, src0, src1, src2 :ref:`omod` - v_mad_u64_u32 dst0, dst1, src0, src1, src2 :ref:`omod` - v_max3_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_max3_i32 dst, src0, src1, src2 :ref:`omod` - v_max3_u32 dst, src0, src1, src2 :ref:`omod` - v_max_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_max_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_max_i32_e64 dst, src0, src1 :ref:`omod` - v_max_legacy_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_max_u32_e64 dst, src0, src1 :ref:`omod` - v_mbcnt_hi_u32_b32_e64 dst, src0, src1 :ref:`omod` - v_mbcnt_lo_u32_b32_e64 dst, src0, src1 :ref:`omod` - v_med3_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_med3_i32 dst, src0, src1, src2 :ref:`omod` - v_med3_u32 dst, src0, src1, src2 :ref:`omod` - v_min3_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_min3_i32 dst, src0, src1, src2 :ref:`omod` - v_min3_u32 dst, src0, src1, src2 :ref:`omod` - v_min_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_min_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_min_i32_e64 dst, src0, src1 :ref:`omod` - v_min_legacy_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_min_u32_e64 dst, src0, src1 :ref:`omod` - v_mov_b32_e64 dst, src0 :ref:`omod` - v_mov_fed_b32_e64 dst, src0 :ref:`omod` - v_movreld_b32_e64 dst, src0 :ref:`omod` - v_movrels_b32_e64 dst, src0 :ref:`omod` - v_movrelsd_b32_e64 dst, src0 :ref:`omod` - v_mqsad_pk_u16_u8 dst, src0, src1, src2 :ref:`omod` - v_mqsad_u32_u8 dst, src0, src1, src2 :ref:`omod` - v_msad_u8 dst, src0, src1, src2 :ref:`omod` - v_mul_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mul_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mul_hi_i32 dst, src0, src1 :ref:`omod` - v_mul_hi_i32_i24_e64 dst, src0, src1 :ref:`omod` - v_mul_hi_u32 dst, src0, src1 :ref:`omod` - v_mul_hi_u32_u24_e64 dst, src0, src1 :ref:`omod` - v_mul_i32_i24_e64 dst, src0, src1 :ref:`omod` - v_mul_legacy_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mul_lo_i32 dst, src0, src1 :ref:`omod` - v_mul_lo_u32 dst, src0, src1 :ref:`omod` - v_mul_u32_u24_e64 dst, src0, src1 :ref:`omod` - v_mullit_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_nop_e64 :ref:`omod` - v_not_b32_e64 dst, src0 :ref:`omod` - v_or_b32_e64 dst, src0, src1 :ref:`omod` - v_qsad_pk_u16_u8 dst, src0, src1, src2 :ref:`omod` - v_rcp_clamp_f32_e64 dst, src0 :ref:`omod` - v_rcp_clamp_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rcp_f32_e64 dst, src0 :ref:`omod` - v_rcp_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rcp_iflag_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rcp_legacy_f32_e64 dst, src0 :ref:`omod` - v_rndne_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rndne_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rsq_clamp_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rsq_clamp_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rsq_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rsq_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rsq_legacy_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sad_hi_u8 dst, src0, src1, src2 :ref:`omod` - v_sad_u16 dst, src0, src1, src2 :ref:`omod` - v_sad_u32 dst, src0, src1, src2 :ref:`omod` - v_sad_u8 dst, src0, src1, src2 :ref:`omod` - v_sin_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sqrt_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sqrt_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sub_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_sub_i32_e64 dst0, dst1, src0, src1 :ref:`omod` - v_subb_u32_e64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_subbrev_u32_e64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_subrev_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_subrev_i32_e64 dst0, dst1, src0, src1 :ref:`omod` - v_trig_preop_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_trunc_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_trunc_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_xor_b32_e64 dst, src0, src1 :ref:`omod` - -VOPC -=========================== - -.. parsed-literal:: - - v_cmp_class_f32 dst, src0, src1 - v_cmp_class_f64 dst, src0, src1 - v_cmp_eq_f32 dst, src0, src1 - v_cmp_eq_f64 dst, src0, src1 - v_cmp_eq_i32 dst, src0, src1 - v_cmp_eq_i64 dst, src0, src1 - v_cmp_eq_u32 dst, src0, src1 - v_cmp_eq_u64 dst, src0, src1 - v_cmp_f_f32 dst, src0, src1 - v_cmp_f_f64 dst, src0, src1 - v_cmp_f_i32 dst, src0, src1 - v_cmp_f_i64 dst, src0, src1 - v_cmp_f_u32 dst, src0, src1 - v_cmp_f_u64 dst, src0, src1 - v_cmp_ge_f32 dst, src0, src1 - v_cmp_ge_f64 dst, src0, src1 - v_cmp_ge_i32 dst, src0, src1 - v_cmp_ge_i64 dst, src0, src1 - v_cmp_ge_u32 dst, src0, src1 - v_cmp_ge_u64 dst, src0, src1 - v_cmp_gt_f32 dst, src0, src1 - v_cmp_gt_f64 dst, src0, src1 - v_cmp_gt_i32 dst, src0, src1 - v_cmp_gt_i64 dst, src0, src1 - v_cmp_gt_u32 dst, src0, src1 - v_cmp_gt_u64 dst, src0, src1 - v_cmp_le_f32 dst, src0, src1 - v_cmp_le_f64 dst, src0, src1 - v_cmp_le_i32 dst, src0, src1 - v_cmp_le_i64 dst, src0, src1 - v_cmp_le_u32 dst, src0, src1 - v_cmp_le_u64 dst, src0, src1 - v_cmp_lg_f32 dst, src0, src1 - v_cmp_lg_f64 dst, src0, src1 - v_cmp_lt_f32 dst, src0, src1 - v_cmp_lt_f64 dst, src0, src1 - v_cmp_lt_i32 dst, src0, src1 - v_cmp_lt_i64 dst, src0, src1 - v_cmp_lt_u32 dst, src0, src1 - v_cmp_lt_u64 dst, src0, src1 - v_cmp_ne_i32 dst, src0, src1 - v_cmp_ne_i64 dst, src0, src1 - v_cmp_ne_u32 dst, src0, src1 - v_cmp_ne_u64 dst, src0, src1 - v_cmp_neq_f32 dst, src0, src1 - v_cmp_neq_f64 dst, src0, src1 - v_cmp_nge_f32 dst, src0, src1 - v_cmp_nge_f64 dst, src0, src1 - v_cmp_ngt_f32 dst, src0, src1 - v_cmp_ngt_f64 dst, src0, src1 - v_cmp_nle_f32 dst, src0, src1 - v_cmp_nle_f64 dst, src0, src1 - v_cmp_nlg_f32 dst, src0, src1 - v_cmp_nlg_f64 dst, src0, src1 - v_cmp_nlt_f32 dst, src0, src1 - v_cmp_nlt_f64 dst, src0, src1 - v_cmp_o_f32 dst, src0, src1 - v_cmp_o_f64 dst, src0, src1 - v_cmp_t_i32 dst, src0, src1 - v_cmp_t_i64 dst, src0, src1 - v_cmp_t_u32 dst, src0, src1 - v_cmp_t_u64 dst, src0, src1 - v_cmp_tru_f32 dst, src0, src1 - v_cmp_tru_f64 dst, src0, src1 - v_cmp_u_f32 dst, src0, src1 - v_cmp_u_f64 dst, src0, src1 - v_cmps_eq_f32 dst, src0, src1 - v_cmps_eq_f64 dst, src0, src1 - v_cmps_f_f32 dst, src0, src1 - v_cmps_f_f64 dst, src0, src1 - v_cmps_ge_f32 dst, src0, src1 - v_cmps_ge_f64 dst, src0, src1 - v_cmps_gt_f32 dst, src0, src1 - v_cmps_gt_f64 dst, src0, src1 - v_cmps_le_f32 dst, src0, src1 - v_cmps_le_f64 dst, src0, src1 - v_cmps_lg_f32 dst, src0, src1 - v_cmps_lg_f64 dst, src0, src1 - v_cmps_lt_f32 dst, src0, src1 - v_cmps_lt_f64 dst, src0, src1 - v_cmps_neq_f32 dst, src0, src1 - v_cmps_neq_f64 dst, src0, src1 - v_cmps_nge_f32 dst, src0, src1 - v_cmps_nge_f64 dst, src0, src1 - v_cmps_ngt_f32 dst, src0, src1 - v_cmps_ngt_f64 dst, src0, src1 - v_cmps_nle_f32 dst, src0, src1 - v_cmps_nle_f64 dst, src0, src1 - v_cmps_nlg_f32 dst, src0, src1 - v_cmps_nlg_f64 dst, src0, src1 - v_cmps_nlt_f32 dst, src0, src1 - v_cmps_nlt_f64 dst, src0, src1 - v_cmps_o_f32 dst, src0, src1 - v_cmps_o_f64 dst, src0, src1 - v_cmps_tru_f32 dst, src0, src1 - v_cmps_tru_f64 dst, src0, src1 - v_cmps_u_f32 dst, src0, src1 - v_cmps_u_f64 dst, src0, src1 - v_cmpsx_eq_f32 dst, src0, src1 - v_cmpsx_eq_f64 dst, src0, src1 - v_cmpsx_f_f32 dst, src0, src1 - v_cmpsx_f_f64 dst, src0, src1 - v_cmpsx_ge_f32 dst, src0, src1 - v_cmpsx_ge_f64 dst, src0, src1 - v_cmpsx_gt_f32 dst, src0, src1 - v_cmpsx_gt_f64 dst, src0, src1 - v_cmpsx_le_f32 dst, src0, src1 - v_cmpsx_le_f64 dst, src0, src1 - v_cmpsx_lg_f32 dst, src0, src1 - v_cmpsx_lg_f64 dst, src0, src1 - v_cmpsx_lt_f32 dst, src0, src1 - v_cmpsx_lt_f64 dst, src0, src1 - v_cmpsx_neq_f32 dst, src0, src1 - v_cmpsx_neq_f64 dst, src0, src1 - v_cmpsx_nge_f32 dst, src0, src1 - v_cmpsx_nge_f64 dst, src0, src1 - v_cmpsx_ngt_f32 dst, src0, src1 - v_cmpsx_ngt_f64 dst, src0, src1 - v_cmpsx_nle_f32 dst, src0, src1 - v_cmpsx_nle_f64 dst, src0, src1 - v_cmpsx_nlg_f32 dst, src0, src1 - v_cmpsx_nlg_f64 dst, src0, src1 - v_cmpsx_nlt_f32 dst, src0, src1 - v_cmpsx_nlt_f64 dst, src0, src1 - v_cmpsx_o_f32 dst, src0, src1 - v_cmpsx_o_f64 dst, src0, src1 - v_cmpsx_tru_f32 dst, src0, src1 - v_cmpsx_tru_f64 dst, src0, src1 - v_cmpsx_u_f32 dst, src0, src1 - v_cmpsx_u_f64 dst, src0, src1 - v_cmpx_class_f32 dst, src0, src1 - v_cmpx_class_f64 dst, src0, src1 - v_cmpx_eq_f32 dst, src0, src1 - v_cmpx_eq_f64 dst, src0, src1 - v_cmpx_eq_i32 dst, src0, src1 - v_cmpx_eq_i64 dst, src0, src1 - v_cmpx_eq_u32 dst, src0, src1 - v_cmpx_eq_u64 dst, src0, src1 - v_cmpx_f_f32 dst, src0, src1 - v_cmpx_f_f64 dst, src0, src1 - v_cmpx_f_i32 dst, src0, src1 - v_cmpx_f_i64 dst, src0, src1 - v_cmpx_f_u32 dst, src0, src1 - v_cmpx_f_u64 dst, src0, src1 - v_cmpx_ge_f32 dst, src0, src1 - v_cmpx_ge_f64 dst, src0, src1 - v_cmpx_ge_i32 dst, src0, src1 - v_cmpx_ge_i64 dst, src0, src1 - v_cmpx_ge_u32 dst, src0, src1 - v_cmpx_ge_u64 dst, src0, src1 - v_cmpx_gt_f32 dst, src0, src1 - v_cmpx_gt_f64 dst, src0, src1 - v_cmpx_gt_i32 dst, src0, src1 - v_cmpx_gt_i64 dst, src0, src1 - v_cmpx_gt_u32 dst, src0, src1 - v_cmpx_gt_u64 dst, src0, src1 - v_cmpx_le_f32 dst, src0, src1 - v_cmpx_le_f64 dst, src0, src1 - v_cmpx_le_i32 dst, src0, src1 - v_cmpx_le_i64 dst, src0, src1 - v_cmpx_le_u32 dst, src0, src1 - v_cmpx_le_u64 dst, src0, src1 - v_cmpx_lg_f32 dst, src0, src1 - v_cmpx_lg_f64 dst, src0, src1 - v_cmpx_lt_f32 dst, src0, src1 - v_cmpx_lt_f64 dst, src0, src1 - v_cmpx_lt_i32 dst, src0, src1 - v_cmpx_lt_i64 dst, src0, src1 - v_cmpx_lt_u32 dst, src0, src1 - v_cmpx_lt_u64 dst, src0, src1 - v_cmpx_ne_i32 dst, src0, src1 - v_cmpx_ne_i64 dst, src0, src1 - v_cmpx_ne_u32 dst, src0, src1 - v_cmpx_ne_u64 dst, src0, src1 - v_cmpx_neq_f32 dst, src0, src1 - v_cmpx_neq_f64 dst, src0, src1 - v_cmpx_nge_f32 dst, src0, src1 - v_cmpx_nge_f64 dst, src0, src1 - v_cmpx_ngt_f32 dst, src0, src1 - v_cmpx_ngt_f64 dst, src0, src1 - v_cmpx_nle_f32 dst, src0, src1 - v_cmpx_nle_f64 dst, src0, src1 - v_cmpx_nlg_f32 dst, src0, src1 - v_cmpx_nlg_f64 dst, src0, src1 - v_cmpx_nlt_f32 dst, src0, src1 - v_cmpx_nlt_f64 dst, src0, src1 - v_cmpx_o_f32 dst, src0, src1 - v_cmpx_o_f64 dst, src0, src1 - v_cmpx_t_i32 dst, src0, src1 - v_cmpx_t_i64 dst, src0, src1 - v_cmpx_t_u32 dst, src0, src1 - v_cmpx_t_u64 dst, src0, src1 - v_cmpx_tru_f32 dst, src0, src1 - v_cmpx_tru_f64 dst, src0, src1 - v_cmpx_u_f32 dst, src0, src1 - v_cmpx_u_f64 dst, src0, src1 diff --git a/docs/AMDGPUAsmGFX8.rst b/docs/AMDGPUAsmGFX8.rst deleted file mode 100644 index 44c48432fda11e6d79474f6fa3e9676df0d8995a..0000000000000000000000000000000000000000 --- a/docs/AMDGPUAsmGFX8.rst +++ /dev/null @@ -1,1672 +0,0 @@ -.. - ************************************************** - * * - * Automatically generated file, do not edit! * - * * - ************************************************** - -=========================== -Syntax of GFX8 Instructions -=========================== - -.. contents:: - :local: - - -DS -=========================== - -.. parsed-literal:: - - ds_add_f32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_rtn_f32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_src2_f32 src0 :ref:`ds_offset16` :ref:`gds` - ds_add_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_add_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_add_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_and_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - ds_append dst :ref:`ds_offset16` :ref:`gds` - ds_bpermute_b32 dst, src0, src1 :ref:`ds_offset16` - ds_cmpst_b32 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_b64 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_f32 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_f64 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_f32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_f64 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_condxchg32_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_consume dst :ref:`ds_offset16` :ref:`gds` - ds_dec_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_dec_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_dec_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_dec_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_dec_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_dec_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_gws_barrier src0 :ref:`ds_offset16` :ref:`gds` - ds_gws_init src0 :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_br src0 :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_p :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_release_all :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_v :ref:`ds_offset16` :ref:`gds` - ds_inc_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_inc_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_inc_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_inc_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_inc_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_inc_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_f32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_f64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_i32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_i64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_f32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_f64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_i32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_i64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_f32 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_f64 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_i32 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_i64 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_f32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_f64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_i32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_i64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_f32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_f64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_i32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_i64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_f32 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_f64 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_i32 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_i64 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_mskor_b32 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_mskor_b64 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_mskor_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_mskor_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_nop - ds_or_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_or_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - ds_ordered_count dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_permute_b32 dst, src0, src1 :ref:`ds_offset16` - ds_read2_b32 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read2_b64 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read2st64_b32 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read2st64_b64 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read_b128 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_b32 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_b64 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_b96 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_i16 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_i8 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_u16 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_u8 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_rsub_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_rsub_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_rsub_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_rsub_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_rsub_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_rsub_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_sub_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_sub_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_swizzle_b32 dst, src0 :ref:`sw_offset16` :ref:`gds` - ds_wrap_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_write2_b32 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write2_b64 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write2st64_b32 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write2st64_b64 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write_b128 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b16 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b8 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b96 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_write_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - ds_wrxchg2_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg2_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg2st64_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg2st64_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_wrxchg_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_xor_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - -EXP -=========================== - -.. parsed-literal:: - - exp dst, src0, src1, src2, src3 :ref:`done` :ref:`compr` :ref:`vm` - -FLAT -=========================== - -.. parsed-literal:: - - flat_atomic_add dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_add_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_and dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_and_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_cmpswap dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_cmpswap_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_dec dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_dec_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_inc dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_inc_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_or dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_or_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_smax dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_smax_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_smin dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_smin_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_sub dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_sub_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_swap dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_swap_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_umax dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_umax_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_umin dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_umin_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_xor dst, src0, src1 :ref:`glc` :ref:`slc` - flat_atomic_xor_x2 dst, src0, src1 :ref:`glc` :ref:`slc` - flat_load_dword dst, src0 :ref:`glc` :ref:`slc` - flat_load_dwordx2 dst, src0 :ref:`glc` :ref:`slc` - flat_load_dwordx3 dst, src0 :ref:`glc` :ref:`slc` - flat_load_dwordx4 dst, src0 :ref:`glc` :ref:`slc` - flat_load_sbyte dst, src0 :ref:`glc` :ref:`slc` - flat_load_sshort dst, src0 :ref:`glc` :ref:`slc` - flat_load_ubyte dst, src0 :ref:`glc` :ref:`slc` - flat_load_ushort dst, src0 :ref:`glc` :ref:`slc` - flat_store_byte src0, src1 :ref:`glc` :ref:`slc` - flat_store_dword src0, src1 :ref:`glc` :ref:`slc` - flat_store_dwordx2 src0, src1 :ref:`glc` :ref:`slc` - flat_store_dwordx3 src0, src1 :ref:`glc` :ref:`slc` - flat_store_dwordx4 src0, src1 :ref:`glc` :ref:`slc` - flat_store_short src0, src1 :ref:`glc` :ref:`slc` - -MIMG -=========================== - -.. parsed-literal:: - - image_atomic_add dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_and dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_cmpswap dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_dec dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_inc dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_or dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_smax dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_smin dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_sub dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_swap dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_umax dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_umin dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_xor dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4 dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_b dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_b_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_b_cl_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_b_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_b dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_b_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_b_cl_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_b_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_cl_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_l dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_l_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_lz_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_cl_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_l dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_l_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_lz_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_get_lod dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_get_resinfo dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_load_mip dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_load_mip_pck dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load_mip_pck_sgn dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load_pck dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load_pck_sgn dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_b dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_b_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_c dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_c_b dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_c_b_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_c_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_c_l dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_c_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_l dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_store src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_store_mip src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_store_mip_pck src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_store_pck src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - -MUBUF -=========================== - -.. parsed-literal:: - - buffer_atomic_add dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_add_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_and dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_and_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_cmpswap dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_cmpswap_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_dec dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_dec_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_inc dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_inc_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_or dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_or_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smax dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smax_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smin dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smin_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_sub dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_sub_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_swap dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_swap_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umax dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umax_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umin dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umin_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_xor dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_xor_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_dword dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_dwordx2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_dwordx3 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_dwordx4 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_d16_x dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_d16_xy dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_d16_xyz dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_d16_xyzw dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_x dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_format_xy dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_xyz dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_xyzw dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_sbyte dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_sshort dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_ubyte dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_ushort dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_store_byte src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dword src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dwordx2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dwordx3 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dwordx4 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_d16_x src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_d16_xy src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_d16_xyz src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_d16_xyzw src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_x src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_xy src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_xyz src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_xyzw src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_lds_dword src0, src1 :ref:`buf_offset12` :ref:`lds` :ref:`glc` :ref:`slc` - buffer_store_short src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_wbinvl1 - buffer_wbinvl1_vol - -SMEM -=========================== - -.. parsed-literal:: - - s_atc_probe src0, src1, src2 - s_atc_probe_buffer src0, src1, src2 - s_buffer_load_dword dst, src0, src1 :ref:`glc` - s_buffer_load_dwordx16 dst, src0, src1 :ref:`glc` - s_buffer_load_dwordx2 dst, src0, src1 :ref:`glc` - s_buffer_load_dwordx4 dst, src0, src1 :ref:`glc` - s_buffer_load_dwordx8 dst, src0, src1 :ref:`glc` - s_buffer_store_dword src0, src1, src2 :ref:`glc` - s_buffer_store_dwordx2 src0, src1, src2 :ref:`glc` - s_buffer_store_dwordx4 src0, src1, src2 :ref:`glc` - s_dcache_inv - s_dcache_inv_vol - s_dcache_wb - s_dcache_wb_vol - s_load_dword dst, src0, src1 :ref:`glc` - s_load_dwordx16 dst, src0, src1 :ref:`glc` - s_load_dwordx2 dst, src0, src1 :ref:`glc` - s_load_dwordx4 dst, src0, src1 :ref:`glc` - s_load_dwordx8 dst, src0, src1 :ref:`glc` - s_memrealtime dst - s_memtime dst - s_store_dword src0, src1, src2 :ref:`glc` - s_store_dwordx2 src0, src1, src2 :ref:`glc` - s_store_dwordx4 src0, src1, src2 :ref:`glc` - -SOP1 -=========================== - -.. parsed-literal:: - - s_abs_i32 dst, src0 - s_and_saveexec_b64 dst, src0 - s_andn2_saveexec_b64 dst, src0 - s_bcnt0_i32_b32 dst, src0 - s_bcnt0_i32_b64 dst, src0 - s_bcnt1_i32_b32 dst, src0 - s_bcnt1_i32_b64 dst, src0 - s_bitset0_b32 dst, src0 - s_bitset0_b64 dst, src0 - s_bitset1_b32 dst, src0 - s_bitset1_b64 dst, src0 - s_brev_b32 dst, src0 - s_brev_b64 dst, src0 - s_cbranch_join src0 - s_cmov_b32 dst, src0 - s_cmov_b64 dst, src0 - s_ff0_i32_b32 dst, src0 - s_ff0_i32_b64 dst, src0 - s_ff1_i32_b32 dst, src0 - s_ff1_i32_b64 dst, src0 - s_flbit_i32 dst, src0 - s_flbit_i32_b32 dst, src0 - s_flbit_i32_b64 dst, src0 - s_flbit_i32_i64 dst, src0 - s_getpc_b64 dst - s_mov_b32 dst, src0 - s_mov_b64 dst, src0 - s_mov_fed_b32 dst, src0 - s_movreld_b32 dst, src0 - s_movreld_b64 dst, src0 - s_movrels_b32 dst, src0 - s_movrels_b64 dst, src0 - s_nand_saveexec_b64 dst, src0 - s_nor_saveexec_b64 dst, src0 - s_not_b32 dst, src0 - s_not_b64 dst, src0 - s_or_saveexec_b64 dst, src0 - s_orn2_saveexec_b64 dst, src0 - s_quadmask_b32 dst, src0 - s_quadmask_b64 dst, src0 - s_rfe_b64 src0 - s_set_gpr_idx_idx src0 - s_setpc_b64 src0 - s_sext_i32_i16 dst, src0 - s_sext_i32_i8 dst, src0 - s_swappc_b64 dst, src0 - s_wqm_b32 dst, src0 - s_wqm_b64 dst, src0 - s_xnor_saveexec_b64 dst, src0 - s_xor_saveexec_b64 dst, src0 - -SOP2 -=========================== - -.. parsed-literal:: - - s_absdiff_i32 dst, src0, src1 - s_add_i32 dst, src0, src1 - s_add_u32 dst, src0, src1 - s_addc_u32 dst, src0, src1 - s_and_b32 dst, src0, src1 - s_and_b64 dst, src0, src1 - s_andn2_b32 dst, src0, src1 - s_andn2_b64 dst, src0, src1 - s_ashr_i32 dst, src0, src1 - s_ashr_i64 dst, src0, src1 - s_bfe_i32 dst, src0, src1 - s_bfe_i64 dst, src0, src1 - s_bfe_u32 dst, src0, src1 - s_bfe_u64 dst, src0, src1 - s_bfm_b32 dst, src0, src1 - s_bfm_b64 dst, src0, src1 - s_cbranch_g_fork src0, src1 - s_cselect_b32 dst, src0, src1 - s_cselect_b64 dst, src0, src1 - s_lshl_b32 dst, src0, src1 - s_lshl_b64 dst, src0, src1 - s_lshr_b32 dst, src0, src1 - s_lshr_b64 dst, src0, src1 - s_max_i32 dst, src0, src1 - s_max_u32 dst, src0, src1 - s_min_i32 dst, src0, src1 - s_min_u32 dst, src0, src1 - s_mul_i32 dst, src0, src1 - s_nand_b32 dst, src0, src1 - s_nand_b64 dst, src0, src1 - s_nor_b32 dst, src0, src1 - s_nor_b64 dst, src0, src1 - s_or_b32 dst, src0, src1 - s_or_b64 dst, src0, src1 - s_orn2_b32 dst, src0, src1 - s_orn2_b64 dst, src0, src1 - s_rfe_restore_b64 src0, src1 - s_sub_i32 dst, src0, src1 - s_sub_u32 dst, src0, src1 - s_subb_u32 dst, src0, src1 - s_xnor_b32 dst, src0, src1 - s_xnor_b64 dst, src0, src1 - s_xor_b32 dst, src0, src1 - s_xor_b64 dst, src0, src1 - -SOPC -=========================== - -.. parsed-literal:: - - s_bitcmp0_b32 src0, src1 - s_bitcmp0_b64 src0, src1 - s_bitcmp1_b32 src0, src1 - s_bitcmp1_b64 src0, src1 - s_cmp_eq_i32 src0, src1 - s_cmp_eq_u32 src0, src1 - s_cmp_eq_u64 src0, src1 - s_cmp_ge_i32 src0, src1 - s_cmp_ge_u32 src0, src1 - s_cmp_gt_i32 src0, src1 - s_cmp_gt_u32 src0, src1 - s_cmp_le_i32 src0, src1 - s_cmp_le_u32 src0, src1 - s_cmp_lg_i32 src0, src1 - s_cmp_lg_u32 src0, src1 - s_cmp_lg_u64 src0, src1 - s_cmp_lt_i32 src0, src1 - s_cmp_lt_u32 src0, src1 - s_set_gpr_idx_on src0, src1 - s_setvskip src0, src1 - -SOPK -=========================== - -.. parsed-literal:: - - s_addk_i32 dst, src0 - s_cbranch_i_fork src0, src1 - s_cmovk_i32 dst, src0 - s_cmpk_eq_i32 src0, src1 - s_cmpk_eq_u32 src0, src1 - s_cmpk_ge_i32 src0, src1 - s_cmpk_ge_u32 src0, src1 - s_cmpk_gt_i32 src0, src1 - s_cmpk_gt_u32 src0, src1 - s_cmpk_le_i32 src0, src1 - s_cmpk_le_u32 src0, src1 - s_cmpk_lg_i32 src0, src1 - s_cmpk_lg_u32 src0, src1 - s_cmpk_lt_i32 src0, src1 - s_cmpk_lt_u32 src0, src1 - s_getreg_b32 dst, src0 - s_movk_i32 dst, src0 - s_mulk_i32 dst, src0 - s_setreg_b32 dst, src0 - s_setreg_imm32_b32 dst, src0 - -SOPP -=========================== - -.. parsed-literal:: - - s_barrier - s_branch src0 - s_cbranch_cdbgsys src0 - s_cbranch_cdbgsys_and_user src0 - s_cbranch_cdbgsys_or_user src0 - s_cbranch_cdbguser src0 - s_cbranch_execnz src0 - s_cbranch_execz src0 - s_cbranch_scc0 src0 - s_cbranch_scc1 src0 - s_cbranch_vccnz src0 - s_cbranch_vccz src0 - s_decperflevel src0 - s_endpgm - s_endpgm_saved - s_icache_inv - s_incperflevel src0 - s_nop src0 - s_sendmsg src0 - s_sendmsghalt src0 - s_set_gpr_idx_mode src0 - s_set_gpr_idx_off - s_sethalt src0 - s_setkill src0 - s_setprio src0 - s_sleep src0 - s_trap src0 - s_ttracedata - s_waitcnt src0 - s_wakeup - -VINTRP -=========================== - -.. parsed-literal:: - - v_interp_mov_f32 dst, src0, src1 - v_interp_p1_f32 dst, src0, src1 - v_interp_p2_f32 dst, src0, src1 - -VOP1 -=========================== - -.. parsed-literal:: - - v_bfrev_b32 dst, src0 - v_bfrev_b32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_bfrev_b32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ceil_f16 dst, src0 - v_ceil_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ceil_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ceil_f32 dst, src0 - v_ceil_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ceil_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ceil_f64 dst, src0 - v_clrexcp - v_cos_f16 dst, src0 - v_cos_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cos_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cos_f32 dst, src0 - v_cos_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cos_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f16_f32 dst, src0 - v_cvt_f16_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f16_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f16_i16 dst, src0 - v_cvt_f16_i16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f16_i16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f16_u16 dst, src0 - v_cvt_f16_u16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f16_u16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_f16 dst, src0 - v_cvt_f32_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_f64 dst, src0 - v_cvt_f32_i32 dst, src0 - v_cvt_f32_i32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_i32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_u32 dst, src0 - v_cvt_f32_u32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_u32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_ubyte0 dst, src0 - v_cvt_f32_ubyte0_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_ubyte0_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_ubyte1 dst, src0 - v_cvt_f32_ubyte1_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_ubyte1_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_ubyte2 dst, src0 - v_cvt_f32_ubyte2_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_ubyte2_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_ubyte3 dst, src0 - v_cvt_f32_ubyte3_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_ubyte3_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f64_f32 dst, src0 - v_cvt_f64_i32 dst, src0 - v_cvt_f64_u32 dst, src0 - v_cvt_flr_i32_f32 dst, src0 - v_cvt_flr_i32_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_flr_i32_f32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_i16_f16 dst, src0 - v_cvt_i16_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_i16_f16_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_i32_f32 dst, src0 - v_cvt_i32_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_i32_f32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_i32_f64 dst, src0 - v_cvt_off_f32_i4 dst, src0 - v_cvt_off_f32_i4_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_off_f32_i4_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_rpi_i32_f32 dst, src0 - v_cvt_rpi_i32_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_rpi_i32_f32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_u16_f16 dst, src0 - v_cvt_u16_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_u16_f16_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_u32_f32 dst, src0 - v_cvt_u32_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_u32_f32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_u32_f64 dst, src0 - v_exp_f16 dst, src0 - v_exp_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_exp_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_exp_f32 dst, src0 - v_exp_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_exp_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_exp_legacy_f32 dst, src0 - v_exp_legacy_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_exp_legacy_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ffbh_i32 dst, src0 - v_ffbh_i32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ffbh_i32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ffbh_u32 dst, src0 - v_ffbh_u32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ffbh_u32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ffbl_b32 dst, src0 - v_ffbl_b32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ffbl_b32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_floor_f16 dst, src0 - v_floor_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_floor_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_floor_f32 dst, src0 - v_floor_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_floor_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_floor_f64 dst, src0 - v_fract_f16 dst, src0 - v_fract_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_fract_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_fract_f32 dst, src0 - v_fract_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_fract_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_fract_f64 dst, src0 - v_frexp_exp_i16_f16 dst, src0 - v_frexp_exp_i16_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_frexp_exp_i16_f16_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_frexp_exp_i32_f32 dst, src0 - v_frexp_exp_i32_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_frexp_exp_i32_f32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_frexp_exp_i32_f64 dst, src0 - v_frexp_mant_f16 dst, src0 - v_frexp_mant_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_frexp_mant_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_frexp_mant_f32 dst, src0 - v_frexp_mant_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_frexp_mant_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_frexp_mant_f64 dst, src0 - v_log_f16 dst, src0 - v_log_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_log_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_log_f32 dst, src0 - v_log_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_log_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_log_legacy_f32 dst, src0 - v_log_legacy_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_log_legacy_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_mov_b32 dst, src0 - v_mov_b32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mov_b32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_mov_fed_b32 dst, src0 - v_mov_fed_b32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mov_fed_b32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_movreld_b32 dst, src0 - v_movrels_b32 dst, src0 - v_movrelsd_b32 dst, src0 - v_nop - v_not_b32 dst, src0 - v_not_b32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_not_b32_sdwa dst, src0 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rcp_f16 dst, src0 - v_rcp_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rcp_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rcp_f32 dst, src0 - v_rcp_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rcp_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rcp_f64 dst, src0 - v_rcp_iflag_f32 dst, src0 - v_rcp_iflag_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rcp_iflag_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_readfirstlane_b32 dst, src0 - v_rndne_f16 dst, src0 - v_rndne_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rndne_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rndne_f32 dst, src0 - v_rndne_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rndne_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rndne_f64 dst, src0 - v_rsq_f16 dst, src0 - v_rsq_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rsq_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rsq_f32 dst, src0 - v_rsq_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rsq_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rsq_f64 dst, src0 - v_sin_f16 dst, src0 - v_sin_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sin_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_sin_f32 dst, src0 - v_sin_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sin_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_sqrt_f16 dst, src0 - v_sqrt_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sqrt_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_sqrt_f32 dst, src0 - v_sqrt_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sqrt_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_sqrt_f64 dst, src0 - v_trunc_f16 dst, src0 - v_trunc_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_trunc_f16_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_trunc_f32 dst, src0 - v_trunc_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_trunc_f32_sdwa dst, src0 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_trunc_f64 dst, src0 - -VOP2 -=========================== - -.. parsed-literal:: - - v_add_f16 dst, src0, src1 - v_add_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_add_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_add_f32 dst, src0, src1 - v_add_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_add_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_add_u16 dst, src0, src1 - v_add_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_add_u16_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_add_u32 dst0, dst1, src0, src1 - v_add_u32_dpp dst0, dst1, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_add_u32_sdwa dst0, dst1, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_addc_u32 dst0, dst1, src0, src1, src2 - v_addc_u32_dpp dst0, dst1, src0, src1, src2 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_addc_u32_sdwa dst0, dst1, src0, src1, src2 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_and_b32 dst, src0, src1 - v_and_b32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_and_b32_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_ashrrev_i16 dst, src0, src1 - v_ashrrev_i16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ashrrev_i16_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_ashrrev_i32 dst, src0, src1 - v_ashrrev_i32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ashrrev_i32_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_cndmask_b32 dst, src0, src1, src2 - v_cndmask_b32_dpp dst, src0, src1, src2 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cndmask_b32_sdwa dst, src0, src1, src2 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_ldexp_f16 dst, src0, src1 - v_ldexp_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ldexp_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_lshlrev_b16 dst, src0, src1 - v_lshlrev_b16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_lshlrev_b16_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_lshlrev_b32 dst, src0, src1 - v_lshlrev_b32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_lshlrev_b32_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_lshrrev_b16 dst, src0, src1 - v_lshrrev_b16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_lshrrev_b16_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_lshrrev_b32 dst, src0, src1 - v_lshrrev_b32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_lshrrev_b32_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mac_f16 dst, src0, src1 - v_mac_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mac_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mac_f32 dst, src0, src1 - v_mac_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mac_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_madak_f16 dst, src0, src1, src2 - v_madak_f32 dst, src0, src1, src2 - v_madmk_f16 dst, src0, src1, src2 - v_madmk_f32 dst, src0, src1, src2 - v_max_f16 dst, src0, src1 - v_max_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_max_f32 dst, src0, src1 - v_max_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_max_i16 dst, src0, src1 - v_max_i16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_i16_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_max_i32 dst, src0, src1 - v_max_i32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_i32_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_max_u16 dst, src0, src1 - v_max_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_u16_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_max_u32 dst, src0, src1 - v_max_u32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_u32_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_f16 dst, src0, src1 - v_min_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_f32 dst, src0, src1 - v_min_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_i16 dst, src0, src1 - v_min_i16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_i16_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_i32 dst, src0, src1 - v_min_i32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_i32_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_u16 dst, src0, src1 - v_min_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_u16_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_u32 dst, src0, src1 - v_min_u32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_u32_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_f16 dst, src0, src1 - v_mul_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_f32 dst, src0, src1 - v_mul_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_hi_i32_i24 dst, src0, src1 - v_mul_hi_i32_i24_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_hi_i32_i24_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_hi_u32_u24 dst, src0, src1 - v_mul_hi_u32_u24_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_hi_u32_u24_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_i32_i24 dst, src0, src1 - v_mul_i32_i24_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_i32_i24_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_legacy_f32 dst, src0, src1 - v_mul_legacy_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_legacy_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_lo_u16 dst, src0, src1 - v_mul_lo_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_lo_u16_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_u32_u24 dst, src0, src1 - v_mul_u32_u24_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_u32_u24_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_or_b32 dst, src0, src1 - v_or_b32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_or_b32_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_sub_f16 dst, src0, src1 - v_sub_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sub_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_sub_f32 dst, src0, src1 - v_sub_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sub_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_sub_u16 dst, src0, src1 - v_sub_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sub_u16_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_sub_u32 dst0, dst1, src0, src1 - v_sub_u32_dpp dst0, dst1, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sub_u32_sdwa dst0, dst1, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subb_u32 dst0, dst1, src0, src1, src2 - v_subb_u32_dpp dst0, dst1, src0, src1, src2 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subb_u32_sdwa dst0, dst1, src0, src1, src2 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subbrev_u32 dst0, dst1, src0, src1, src2 - v_subbrev_u32_dpp dst0, dst1, src0, src1, src2 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subbrev_u32_sdwa dst0, dst1, src0, src1, src2 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subrev_f16 dst, src0, src1 - v_subrev_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subrev_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subrev_f32 dst, src0, src1 - v_subrev_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subrev_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subrev_u16 dst, src0, src1 - v_subrev_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subrev_u16_sdwa dst, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subrev_u32 dst0, dst1, src0, src1 - v_subrev_u32_dpp dst0, dst1, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subrev_u32_sdwa dst0, dst1, src0, src1 :ref:`clamp` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_xor_b32 dst, src0, src1 - v_xor_b32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_xor_b32_sdwa dst, src0, src1 :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - -VOP3 -=========================== - -.. parsed-literal:: - - v_add_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_add_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_add_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_add_u16_e64 dst, src0, src1 :ref:`omod` - v_add_u32_e64 dst0, dst1, src0, src1 :ref:`omod` - v_addc_u32_e64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_alignbit_b32 dst, src0, src1, src2 :ref:`omod` - v_alignbyte_b32 dst, src0, src1, src2 :ref:`omod` - v_and_b32_e64 dst, src0, src1 :ref:`omod` - v_ashrrev_i16_e64 dst, src0, src1 :ref:`omod` - v_ashrrev_i32_e64 dst, src0, src1 :ref:`omod` - v_ashrrev_i64 dst, src0, src1 :ref:`omod` - v_bcnt_u32_b32 dst, src0, src1 :ref:`omod` - v_bfe_i32 dst, src0, src1, src2 :ref:`omod` - v_bfe_u32 dst, src0, src1, src2 :ref:`omod` - v_bfi_b32 dst, src0, src1, src2 :ref:`omod` - v_bfm_b32 dst, src0, src1 :ref:`omod` - v_bfrev_b32_e64 dst, src0 :ref:`omod` - v_ceil_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_ceil_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_ceil_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_clrexcp_e64 :ref:`omod` - v_cmp_class_f16_e64 dst, src0, src1 :ref:`omod` - v_cmp_class_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_class_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_eq_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_eq_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_eq_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_f_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_f_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_f_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ge_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ge_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ge_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_gt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_gt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_gt_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_le_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_le_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_le_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_lg_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lg_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lg_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lt_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_neq_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_neq_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_neq_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nge_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nge_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nge_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ngt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ngt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ngt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nle_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nle_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nle_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlg_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlg_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlg_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_o_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_o_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_o_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_t_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_tru_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_tru_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_tru_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_u_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_u_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_u_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_class_f16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_class_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_class_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_eq_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_eq_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_eq_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_f_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_f_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_f_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ge_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ge_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ge_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_gt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_gt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_gt_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_le_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_le_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_le_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lg_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lg_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lg_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lt_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_neq_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_neq_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_neq_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nge_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nge_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nge_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ngt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ngt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ngt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nle_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nle_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nle_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlg_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlg_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlg_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_o_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_o_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_o_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_t_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_tru_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_tru_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_tru_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_u_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_u_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_u_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cndmask_b32_e64 dst, src0, src1, src2 :ref:`omod` - v_cos_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cos_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cubeid_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cubema_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cubesc_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cubetc_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cvt_f16_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f16_i16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f16_u16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_i32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_u32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_ubyte0_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_ubyte1_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_ubyte2_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_ubyte3_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f64_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f64_i32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f64_u32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_flr_i32_f32_e64 dst, src0 :ref:`omod` - v_cvt_i16_f16_e64 dst, src0 :ref:`omod` - v_cvt_i32_f32_e64 dst, src0 :ref:`omod` - v_cvt_i32_f64_e64 dst, src0 :ref:`omod` - v_cvt_off_f32_i4_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_pk_i16_i32 dst, src0, src1 :ref:`omod` - v_cvt_pk_u16_u32 dst, src0, src1 :ref:`omod` - v_cvt_pk_u8_f32 dst, src0, src1, src2 :ref:`omod` - v_cvt_pkaccum_u8_f32 dst, src0, src1 :ref:`omod` - v_cvt_pknorm_i16_f32 dst, src0, src1 :ref:`omod` - v_cvt_pknorm_u16_f32 dst, src0, src1 :ref:`omod` - v_cvt_pkrtz_f16_f32 dst, src0, src1 :ref:`omod` - v_cvt_rpi_i32_f32_e64 dst, src0 :ref:`omod` - v_cvt_u16_f16_e64 dst, src0 :ref:`omod` - v_cvt_u32_f32_e64 dst, src0 :ref:`omod` - v_cvt_u32_f64_e64 dst, src0 :ref:`omod` - v_div_fixup_f16 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_fixup_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_fixup_f64 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_fmas_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_fmas_f64 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_scale_f32 dst0, dst1, src0, src1, src2 :ref:`omod` - v_div_scale_f64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_exp_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_exp_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_exp_legacy_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_ffbh_i32_e64 dst, src0 :ref:`omod` - v_ffbh_u32_e64 dst, src0 :ref:`omod` - v_ffbl_b32_e64 dst, src0 :ref:`omod` - v_floor_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_floor_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_floor_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_fma_f16 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_fma_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_fma_f64 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_fract_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_fract_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_fract_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_frexp_exp_i16_f16_e64 dst, src0 :ref:`omod` - v_frexp_exp_i32_f32_e64 dst, src0 :ref:`omod` - v_frexp_exp_i32_f64_e64 dst, src0 :ref:`omod` - v_frexp_mant_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_frexp_mant_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_frexp_mant_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_interp_mov_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_interp_p1_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_interp_p1ll_f16 dst, src0, src1 :ref:`high` :ref:`clamp` :ref:`omod` - v_interp_p1lv_f16 dst, src0, src1, src2 :ref:`high` :ref:`clamp` :ref:`omod` - v_interp_p2_f16 dst, src0, src1, src2 :ref:`high` :ref:`clamp` :ref:`omod` - v_interp_p2_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_ldexp_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_ldexp_f32 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_ldexp_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_lerp_u8 dst, src0, src1, src2 :ref:`omod` - v_log_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_log_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_log_legacy_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_lshlrev_b16_e64 dst, src0, src1 :ref:`omod` - v_lshlrev_b32_e64 dst, src0, src1 :ref:`omod` - v_lshlrev_b64 dst, src0, src1 :ref:`omod` - v_lshrrev_b16_e64 dst, src0, src1 :ref:`omod` - v_lshrrev_b32_e64 dst, src0, src1 :ref:`omod` - v_lshrrev_b64 dst, src0, src1 :ref:`omod` - v_mac_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mac_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mad_f16 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_i16 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_i32_i24 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_i64_i32 dst0, dst1, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_legacy_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_u16 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_u32_u24 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_u64_u32 dst0, dst1, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_max3_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_max3_i32 dst, src0, src1, src2 :ref:`omod` - v_max3_u32 dst, src0, src1, src2 :ref:`omod` - v_max_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_max_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_max_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_max_i16_e64 dst, src0, src1 :ref:`omod` - v_max_i32_e64 dst, src0, src1 :ref:`omod` - v_max_u16_e64 dst, src0, src1 :ref:`omod` - v_max_u32_e64 dst, src0, src1 :ref:`omod` - v_mbcnt_hi_u32_b32 dst, src0, src1 :ref:`omod` - v_mbcnt_lo_u32_b32 dst, src0, src1 :ref:`omod` - v_med3_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_med3_i32 dst, src0, src1, src2 :ref:`omod` - v_med3_u32 dst, src0, src1, src2 :ref:`omod` - v_min3_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_min3_i32 dst, src0, src1, src2 :ref:`omod` - v_min3_u32 dst, src0, src1, src2 :ref:`omod` - v_min_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_min_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_min_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_min_i16_e64 dst, src0, src1 :ref:`omod` - v_min_i32_e64 dst, src0, src1 :ref:`omod` - v_min_u16_e64 dst, src0, src1 :ref:`omod` - v_min_u32_e64 dst, src0, src1 :ref:`omod` - v_mov_b32_e64 dst, src0 :ref:`omod` - v_mov_fed_b32_e64 dst, src0 :ref:`omod` - v_movreld_b32_e64 dst, src0 :ref:`omod` - v_movrels_b32_e64 dst, src0 :ref:`omod` - v_movrelsd_b32_e64 dst, src0 :ref:`omod` - v_mqsad_pk_u16_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mqsad_u32_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_msad_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mul_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mul_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mul_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mul_hi_i32 dst, src0, src1 :ref:`omod` - v_mul_hi_i32_i24_e64 dst, src0, src1 :ref:`omod` - v_mul_hi_u32 dst, src0, src1 :ref:`omod` - v_mul_hi_u32_u24_e64 dst, src0, src1 :ref:`omod` - v_mul_i32_i24_e64 dst, src0, src1 :ref:`omod` - v_mul_legacy_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mul_lo_u16_e64 dst, src0, src1 :ref:`omod` - v_mul_lo_u32 dst, src0, src1 :ref:`omod` - v_mul_u32_u24_e64 dst, src0, src1 :ref:`omod` - v_nop_e64 :ref:`omod` - v_not_b32_e64 dst, src0 :ref:`omod` - v_or_b32_e64 dst, src0, src1 :ref:`omod` - v_perm_b32 dst, src0, src1, src2 :ref:`omod` - v_qsad_pk_u16_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_rcp_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rcp_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rcp_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rcp_iflag_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_readlane_b32 dst, src0, src1 :ref:`omod` - v_rndne_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rndne_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rndne_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rsq_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rsq_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rsq_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sad_hi_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_sad_u16 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_sad_u32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_sad_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_sin_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sin_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sqrt_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sqrt_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sqrt_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sub_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_sub_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_sub_u16_e64 dst, src0, src1 :ref:`omod` - v_sub_u32_e64 dst0, dst1, src0, src1 :ref:`omod` - v_subb_u32_e64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_subbrev_u32_e64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_subrev_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_subrev_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_subrev_u16_e64 dst, src0, src1 :ref:`omod` - v_subrev_u32_e64 dst0, dst1, src0, src1 :ref:`omod` - v_trig_preop_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_trunc_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_trunc_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_trunc_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_writelane_b32 dst, src0, src1 :ref:`omod` - v_xor_b32_e64 dst, src0, src1 :ref:`omod` - -VOPC -=========================== - -.. parsed-literal:: - - v_cmp_class_f16 dst, src0, src1 - v_cmp_class_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_class_f32 dst, src0, src1 - v_cmp_class_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_class_f64 dst, src0, src1 - v_cmp_eq_f16 dst, src0, src1 - v_cmp_eq_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_f32 dst, src0, src1 - v_cmp_eq_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_f64 dst, src0, src1 - v_cmp_eq_i16 dst, src0, src1 - v_cmp_eq_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_i32 dst, src0, src1 - v_cmp_eq_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_i64 dst, src0, src1 - v_cmp_eq_u16 dst, src0, src1 - v_cmp_eq_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_u32 dst, src0, src1 - v_cmp_eq_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_u64 dst, src0, src1 - v_cmp_f_f16 dst, src0, src1 - v_cmp_f_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_f32 dst, src0, src1 - v_cmp_f_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_f64 dst, src0, src1 - v_cmp_f_i16 dst, src0, src1 - v_cmp_f_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_i32 dst, src0, src1 - v_cmp_f_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_i64 dst, src0, src1 - v_cmp_f_u16 dst, src0, src1 - v_cmp_f_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_u32 dst, src0, src1 - v_cmp_f_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_u64 dst, src0, src1 - v_cmp_ge_f16 dst, src0, src1 - v_cmp_ge_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_f32 dst, src0, src1 - v_cmp_ge_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_f64 dst, src0, src1 - v_cmp_ge_i16 dst, src0, src1 - v_cmp_ge_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_i32 dst, src0, src1 - v_cmp_ge_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_i64 dst, src0, src1 - v_cmp_ge_u16 dst, src0, src1 - v_cmp_ge_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_u32 dst, src0, src1 - v_cmp_ge_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_u64 dst, src0, src1 - v_cmp_gt_f16 dst, src0, src1 - v_cmp_gt_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_f32 dst, src0, src1 - v_cmp_gt_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_f64 dst, src0, src1 - v_cmp_gt_i16 dst, src0, src1 - v_cmp_gt_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_i32 dst, src0, src1 - v_cmp_gt_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_i64 dst, src0, src1 - v_cmp_gt_u16 dst, src0, src1 - v_cmp_gt_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_u32 dst, src0, src1 - v_cmp_gt_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_u64 dst, src0, src1 - v_cmp_le_f16 dst, src0, src1 - v_cmp_le_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_f32 dst, src0, src1 - v_cmp_le_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_f64 dst, src0, src1 - v_cmp_le_i16 dst, src0, src1 - v_cmp_le_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_i32 dst, src0, src1 - v_cmp_le_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_i64 dst, src0, src1 - v_cmp_le_u16 dst, src0, src1 - v_cmp_le_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_u32 dst, src0, src1 - v_cmp_le_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_u64 dst, src0, src1 - v_cmp_lg_f16 dst, src0, src1 - v_cmp_lg_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lg_f32 dst, src0, src1 - v_cmp_lg_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lg_f64 dst, src0, src1 - v_cmp_lt_f16 dst, src0, src1 - v_cmp_lt_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_f32 dst, src0, src1 - v_cmp_lt_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_f64 dst, src0, src1 - v_cmp_lt_i16 dst, src0, src1 - v_cmp_lt_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_i32 dst, src0, src1 - v_cmp_lt_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_i64 dst, src0, src1 - v_cmp_lt_u16 dst, src0, src1 - v_cmp_lt_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_u32 dst, src0, src1 - v_cmp_lt_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_u64 dst, src0, src1 - v_cmp_ne_i16 dst, src0, src1 - v_cmp_ne_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ne_i32 dst, src0, src1 - v_cmp_ne_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ne_i64 dst, src0, src1 - v_cmp_ne_u16 dst, src0, src1 - v_cmp_ne_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ne_u32 dst, src0, src1 - v_cmp_ne_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ne_u64 dst, src0, src1 - v_cmp_neq_f16 dst, src0, src1 - v_cmp_neq_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_neq_f32 dst, src0, src1 - v_cmp_neq_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_neq_f64 dst, src0, src1 - v_cmp_nge_f16 dst, src0, src1 - v_cmp_nge_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nge_f32 dst, src0, src1 - v_cmp_nge_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nge_f64 dst, src0, src1 - v_cmp_ngt_f16 dst, src0, src1 - v_cmp_ngt_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ngt_f32 dst, src0, src1 - v_cmp_ngt_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ngt_f64 dst, src0, src1 - v_cmp_nle_f16 dst, src0, src1 - v_cmp_nle_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nle_f32 dst, src0, src1 - v_cmp_nle_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nle_f64 dst, src0, src1 - v_cmp_nlg_f16 dst, src0, src1 - v_cmp_nlg_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nlg_f32 dst, src0, src1 - v_cmp_nlg_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nlg_f64 dst, src0, src1 - v_cmp_nlt_f16 dst, src0, src1 - v_cmp_nlt_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nlt_f32 dst, src0, src1 - v_cmp_nlt_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nlt_f64 dst, src0, src1 - v_cmp_o_f16 dst, src0, src1 - v_cmp_o_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_o_f32 dst, src0, src1 - v_cmp_o_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_o_f64 dst, src0, src1 - v_cmp_t_i16 dst, src0, src1 - v_cmp_t_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_t_i32 dst, src0, src1 - v_cmp_t_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_t_i64 dst, src0, src1 - v_cmp_t_u16 dst, src0, src1 - v_cmp_t_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_t_u32 dst, src0, src1 - v_cmp_t_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_t_u64 dst, src0, src1 - v_cmp_tru_f16 dst, src0, src1 - v_cmp_tru_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_tru_f32 dst, src0, src1 - v_cmp_tru_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_tru_f64 dst, src0, src1 - v_cmp_u_f16 dst, src0, src1 - v_cmp_u_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_u_f32 dst, src0, src1 - v_cmp_u_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmp_u_f64 dst, src0, src1 - v_cmpx_class_f16 dst, src0, src1 - v_cmpx_class_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_class_f32 dst, src0, src1 - v_cmpx_class_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_class_f64 dst, src0, src1 - v_cmpx_eq_f16 dst, src0, src1 - v_cmpx_eq_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_f32 dst, src0, src1 - v_cmpx_eq_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_f64 dst, src0, src1 - v_cmpx_eq_i16 dst, src0, src1 - v_cmpx_eq_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_i32 dst, src0, src1 - v_cmpx_eq_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_i64 dst, src0, src1 - v_cmpx_eq_u16 dst, src0, src1 - v_cmpx_eq_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_u32 dst, src0, src1 - v_cmpx_eq_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_u64 dst, src0, src1 - v_cmpx_f_f16 dst, src0, src1 - v_cmpx_f_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_f32 dst, src0, src1 - v_cmpx_f_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_f64 dst, src0, src1 - v_cmpx_f_i16 dst, src0, src1 - v_cmpx_f_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_i32 dst, src0, src1 - v_cmpx_f_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_i64 dst, src0, src1 - v_cmpx_f_u16 dst, src0, src1 - v_cmpx_f_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_u32 dst, src0, src1 - v_cmpx_f_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_u64 dst, src0, src1 - v_cmpx_ge_f16 dst, src0, src1 - v_cmpx_ge_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_f32 dst, src0, src1 - v_cmpx_ge_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_f64 dst, src0, src1 - v_cmpx_ge_i16 dst, src0, src1 - v_cmpx_ge_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_i32 dst, src0, src1 - v_cmpx_ge_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_i64 dst, src0, src1 - v_cmpx_ge_u16 dst, src0, src1 - v_cmpx_ge_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_u32 dst, src0, src1 - v_cmpx_ge_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_u64 dst, src0, src1 - v_cmpx_gt_f16 dst, src0, src1 - v_cmpx_gt_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_f32 dst, src0, src1 - v_cmpx_gt_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_f64 dst, src0, src1 - v_cmpx_gt_i16 dst, src0, src1 - v_cmpx_gt_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_i32 dst, src0, src1 - v_cmpx_gt_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_i64 dst, src0, src1 - v_cmpx_gt_u16 dst, src0, src1 - v_cmpx_gt_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_u32 dst, src0, src1 - v_cmpx_gt_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_u64 dst, src0, src1 - v_cmpx_le_f16 dst, src0, src1 - v_cmpx_le_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_f32 dst, src0, src1 - v_cmpx_le_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_f64 dst, src0, src1 - v_cmpx_le_i16 dst, src0, src1 - v_cmpx_le_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_i32 dst, src0, src1 - v_cmpx_le_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_i64 dst, src0, src1 - v_cmpx_le_u16 dst, src0, src1 - v_cmpx_le_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_u32 dst, src0, src1 - v_cmpx_le_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_u64 dst, src0, src1 - v_cmpx_lg_f16 dst, src0, src1 - v_cmpx_lg_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lg_f32 dst, src0, src1 - v_cmpx_lg_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lg_f64 dst, src0, src1 - v_cmpx_lt_f16 dst, src0, src1 - v_cmpx_lt_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_f32 dst, src0, src1 - v_cmpx_lt_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_f64 dst, src0, src1 - v_cmpx_lt_i16 dst, src0, src1 - v_cmpx_lt_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_i32 dst, src0, src1 - v_cmpx_lt_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_i64 dst, src0, src1 - v_cmpx_lt_u16 dst, src0, src1 - v_cmpx_lt_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_u32 dst, src0, src1 - v_cmpx_lt_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_u64 dst, src0, src1 - v_cmpx_ne_i16 dst, src0, src1 - v_cmpx_ne_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ne_i32 dst, src0, src1 - v_cmpx_ne_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ne_i64 dst, src0, src1 - v_cmpx_ne_u16 dst, src0, src1 - v_cmpx_ne_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ne_u32 dst, src0, src1 - v_cmpx_ne_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ne_u64 dst, src0, src1 - v_cmpx_neq_f16 dst, src0, src1 - v_cmpx_neq_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_neq_f32 dst, src0, src1 - v_cmpx_neq_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_neq_f64 dst, src0, src1 - v_cmpx_nge_f16 dst, src0, src1 - v_cmpx_nge_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nge_f32 dst, src0, src1 - v_cmpx_nge_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nge_f64 dst, src0, src1 - v_cmpx_ngt_f16 dst, src0, src1 - v_cmpx_ngt_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ngt_f32 dst, src0, src1 - v_cmpx_ngt_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ngt_f64 dst, src0, src1 - v_cmpx_nle_f16 dst, src0, src1 - v_cmpx_nle_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nle_f32 dst, src0, src1 - v_cmpx_nle_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nle_f64 dst, src0, src1 - v_cmpx_nlg_f16 dst, src0, src1 - v_cmpx_nlg_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nlg_f32 dst, src0, src1 - v_cmpx_nlg_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nlg_f64 dst, src0, src1 - v_cmpx_nlt_f16 dst, src0, src1 - v_cmpx_nlt_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nlt_f32 dst, src0, src1 - v_cmpx_nlt_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nlt_f64 dst, src0, src1 - v_cmpx_o_f16 dst, src0, src1 - v_cmpx_o_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_o_f32 dst, src0, src1 - v_cmpx_o_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_o_f64 dst, src0, src1 - v_cmpx_t_i16 dst, src0, src1 - v_cmpx_t_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_t_i32 dst, src0, src1 - v_cmpx_t_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_t_i64 dst, src0, src1 - v_cmpx_t_u16 dst, src0, src1 - v_cmpx_t_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_t_u32 dst, src0, src1 - v_cmpx_t_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_t_u64 dst, src0, src1 - v_cmpx_tru_f16 dst, src0, src1 - v_cmpx_tru_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_tru_f32 dst, src0, src1 - v_cmpx_tru_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_tru_f64 dst, src0, src1 - v_cmpx_u_f16 dst, src0, src1 - v_cmpx_u_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_u_f32 dst, src0, src1 - v_cmpx_u_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_u_f64 dst, src0, src1 diff --git a/docs/AMDGPUAsmGFX9.rst b/docs/AMDGPUAsmGFX9.rst deleted file mode 100644 index 97c13f2476bd4dec4776e3979b2a2d4f4b3ccc14..0000000000000000000000000000000000000000 --- a/docs/AMDGPUAsmGFX9.rst +++ /dev/null @@ -1,1906 +0,0 @@ -.. - ************************************************** - * * - * Automatically generated file, do not edit! * - * * - ************************************************** - -=========================== -Syntax of GFX9 Instructions -=========================== - -.. contents:: - :local: - - -DS -=========================== - -.. parsed-literal:: - - ds_add_f32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_rtn_f32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_src2_f32 src0 :ref:`ds_offset16` :ref:`gds` - ds_add_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_add_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_add_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_add_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_and_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_and_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - ds_append dst :ref:`ds_offset16` :ref:`gds` - ds_bpermute_b32 dst, src0, src1 :ref:`ds_offset16` - ds_cmpst_b32 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_b64 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_f32 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_f64 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_f32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_cmpst_rtn_f64 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_condxchg32_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_consume dst :ref:`ds_offset16` :ref:`gds` - ds_dec_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_dec_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_dec_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_dec_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_dec_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_dec_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_gws_barrier src0 :ref:`ds_offset16` :ref:`gds` - ds_gws_init src0 :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_br src0 :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_p :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_release_all :ref:`ds_offset16` :ref:`gds` - ds_gws_sema_v :ref:`ds_offset16` :ref:`gds` - ds_inc_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_inc_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_inc_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_inc_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_inc_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_inc_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_f32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_f64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_i32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_i64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_f32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_f64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_i32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_i64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_f32 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_f64 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_i32 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_i64 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_max_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_max_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_f32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_f64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_i32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_i64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_f32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_f64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_i32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_i64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_f32 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_f64 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_i32 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_i64 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_min_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_min_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_mskor_b32 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_mskor_b64 src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_mskor_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_mskor_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_nop - ds_or_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_or_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_or_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - ds_ordered_count dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_permute_b32 dst, src0, src1 :ref:`ds_offset16` - ds_read2_b32 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read2_b64 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read2st64_b32 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read2st64_b64 dst, src0 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_read_b128 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_b32 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_b64 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_b96 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_i16 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_i8 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_i8_d16 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_i8_d16_hi dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_u16 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_u16_d16 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_u16_d16_hi dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_u8 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_u8_d16 dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_read_u8_d16_hi dst, src0 :ref:`ds_offset16` :ref:`gds` - ds_rsub_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_rsub_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_rsub_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_rsub_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_rsub_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_rsub_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_rtn_u32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_rtn_u64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_src2_u32 src0 :ref:`ds_offset16` :ref:`gds` - ds_sub_src2_u64 src0 :ref:`ds_offset16` :ref:`gds` - ds_sub_u32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_sub_u64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_swizzle_b32 dst, src0 :ref:`sw_offset16` :ref:`gds` - ds_wrap_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset16` :ref:`gds` - ds_write2_b32 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write2_b64 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write2st64_b32 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write2st64_b64 src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_write_b128 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b16 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b16_d16_hi src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b8 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b8_d16_hi src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_b96 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_write_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_write_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - ds_wrxchg2_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg2_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg2st64_rtn_b32 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg2st64_rtn_b64 dst, src0, src1, src2 :ref:`ds_offset8` :ref:`ds_offset8` :ref:`gds` - ds_wrxchg_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_wrxchg_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_b32 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_b64 src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_rtn_b32 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_rtn_b64 dst, src0, src1 :ref:`ds_offset16` :ref:`gds` - ds_xor_src2_b32 src0 :ref:`ds_offset16` :ref:`gds` - ds_xor_src2_b64 src0 :ref:`ds_offset16` :ref:`gds` - -EXP -=========================== - -.. parsed-literal:: - - exp dst, src0, src1, src2, src3 :ref:`done` :ref:`compr` :ref:`vm` - -FLAT -=========================== - -.. parsed-literal:: - - flat_atomic_add dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_add_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_and dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_and_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_cmpswap dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_cmpswap_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_dec dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_dec_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_inc dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_inc_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_or dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_or_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_smax dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_smax_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_smin dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_smin_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_sub dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_sub_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_swap dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_swap_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_umax dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_umax_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_umin dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_umin_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_xor dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_atomic_xor_x2 dst, src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_dword dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_dwordx2 dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_dwordx3 dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_dwordx4 dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_sbyte dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_sbyte_d16 dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_sbyte_d16_hi dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_short_d16 dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_short_d16_hi dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_sshort dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_ubyte dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_ubyte_d16 dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_ubyte_d16_hi dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_load_ushort dst, src0 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_store_byte src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_store_byte_d16_hi src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_store_dword src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_store_dwordx2 src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_store_dwordx3 src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_store_dwordx4 src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_store_short src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - flat_store_short_d16_hi src0, src1 :ref:`flat_offset12` :ref:`glc` :ref:`slc` - global_atomic_add dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_add_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_and dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_and_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_cmpswap dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_cmpswap_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_dec dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_dec_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_inc dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_inc_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_or dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_or_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_smax dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_smax_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_smin dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_smin_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_sub dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_sub_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_swap dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_swap_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_umax dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_umax_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_umin dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_umin_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_xor dst, src0, src1, src2 :ref:`flat_offset13` - global_atomic_xor_x2 dst, src0, src1, src2 :ref:`flat_offset13` - global_load_dword dst, src0, src1 :ref:`flat_offset13` - global_load_dwordx2 dst, src0, src1 :ref:`flat_offset13` - global_load_dwordx3 dst, src0, src1 :ref:`flat_offset13` - global_load_dwordx4 dst, src0, src1 :ref:`flat_offset13` - global_load_sbyte dst, src0, src1 :ref:`flat_offset13` - global_load_sbyte_d16 dst, src0, src1 :ref:`flat_offset13` - global_load_sbyte_d16_hi dst, src0, src1 :ref:`flat_offset13` - global_load_short_d16 dst, src0, src1 :ref:`flat_offset13` - global_load_short_d16_hi dst, src0, src1 :ref:`flat_offset13` - global_load_sshort dst, src0, src1 :ref:`flat_offset13` - global_load_ubyte dst, src0, src1 :ref:`flat_offset13` - global_load_ubyte_d16 dst, src0, src1 :ref:`flat_offset13` - global_load_ubyte_d16_hi dst, src0, src1 :ref:`flat_offset13` - global_load_ushort dst, src0, src1 :ref:`flat_offset13` - global_store_byte src0, src1, src2 :ref:`flat_offset13` - global_store_byte_d16_hi src0, src1, src2 :ref:`flat_offset13` - global_store_dword src0, src1, src2 :ref:`flat_offset13` - global_store_dwordx2 src0, src1, src2 :ref:`flat_offset13` - global_store_dwordx3 src0, src1, src2 :ref:`flat_offset13` - global_store_dwordx4 src0, src1, src2 :ref:`flat_offset13` - global_store_short src0, src1, src2 :ref:`flat_offset13` - global_store_short_d16_hi src0, src1, src2 :ref:`flat_offset13` - scratch_load_dword dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_dwordx2 dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_dwordx3 dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_dwordx4 dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_sbyte dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_sbyte_d16 dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_sbyte_d16_hi dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_short_d16 dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_short_d16_hi dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_sshort dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_ubyte dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_ubyte_d16 dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_ubyte_d16_hi dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_load_ushort dst, src0, src1 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_store_byte src0, src1, src2 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_store_byte_d16_hi src0, src1, src2 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_store_dword src0, src1, src2 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_store_dwordx2 src0, src1, src2 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_store_dwordx3 src0, src1, src2 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_store_dwordx4 src0, src1, src2 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_store_short src0, src1, src2 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - scratch_store_short_d16_hi src0, src1, src2 :ref:`flat_offset13` :ref:`glc` :ref:`slc` - -MIMG -=========================== - -.. parsed-literal:: - - image_atomic_add dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_and dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_cmpswap dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_dec dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_inc dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_or dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_smax dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_smin dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_sub dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_swap dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_umax dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_umin dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_atomic_xor dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_gather4 dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_b dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_c_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_l dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_lz_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_gather4_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_get_lod dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_get_resinfo dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_load_mip dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_load_mip_pck dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load_mip_pck_sgn dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load_pck dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_load_pck_sgn dst, src0, src1 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` - image_sample dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_b dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_c dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_c_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_cl dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_l dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_lz dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_lz_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_sample_o dst, src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`tfe` :ref:`lwe` :ref:`da` :ref:`d16` - image_store src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_store_mip src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` :ref:`d16` - image_store_mip_pck src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - image_store_pck src0, src1, src2 :ref:`dmask` :ref:`unorm` :ref:`glc` :ref:`slc` :ref:`lwe` :ref:`da` - -MUBUF -=========================== - -.. parsed-literal:: - - buffer_atomic_add dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_add_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_and dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_and_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_cmpswap dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_cmpswap_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_dec dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_dec_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_inc dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_inc_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_or dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_or_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smax dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smax_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smin dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_smin_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_sub dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_sub_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_swap dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_swap_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umax dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umax_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umin dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_umin_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_xor dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_atomic_xor_x2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_dword dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_dwordx2 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_dwordx3 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_dwordx4 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_d16_hi_x dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_d16_x dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_d16_xy dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_d16_xyz dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_d16_xyzw dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_x dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_format_xy dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_xyz dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_format_xyzw dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_sbyte dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_sbyte_d16 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_sbyte_d16_hi dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_short_d16 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_short_d16_hi dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_sshort dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_ubyte dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_load_ubyte_d16 dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_ubyte_d16_hi dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_load_ushort dst, src0, src1, src2 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` :ref:`lds` - buffer_store_byte src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_byte_d16_hi src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dword src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dwordx2 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dwordx3 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_dwordx4 src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_d16_hi_x src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_d16_x src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_d16_xy src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_d16_xyz src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_d16_xyzw src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_x src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_xy src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_xyz src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_format_xyzw src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_lds_dword src0, src1 :ref:`buf_offset12` :ref:`lds` - buffer_store_short src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_store_short_d16_hi src0, src1, src2, src3 :ref:`idxen` :ref:`offen` :ref:`buf_offset12` :ref:`glc` :ref:`slc` - buffer_wbinvl1 - buffer_wbinvl1_vol - -SMEM -=========================== - -.. parsed-literal:: - - s_atc_probe src0, src1, src2 - s_atc_probe_buffer src0, src1, src2 - s_atomic_add dst, src0, src1 :ref:`glc` - s_atomic_add_x2 dst, src0, src1 :ref:`glc` - s_atomic_and dst, src0, src1 :ref:`glc` - s_atomic_and_x2 dst, src0, src1 :ref:`glc` - s_atomic_cmpswap dst, src0, src1 :ref:`glc` - s_atomic_cmpswap_x2 dst, src0, src1 :ref:`glc` - s_atomic_dec dst, src0, src1 :ref:`glc` - s_atomic_dec_x2 dst, src0, src1 :ref:`glc` - s_atomic_inc dst, src0, src1 :ref:`glc` - s_atomic_inc_x2 dst, src0, src1 :ref:`glc` - s_atomic_or dst, src0, src1 :ref:`glc` - s_atomic_or_x2 dst, src0, src1 :ref:`glc` - s_atomic_smax dst, src0, src1 :ref:`glc` - s_atomic_smax_x2 dst, src0, src1 :ref:`glc` - s_atomic_smin dst, src0, src1 :ref:`glc` - s_atomic_smin_x2 dst, src0, src1 :ref:`glc` - s_atomic_sub dst, src0, src1 :ref:`glc` - s_atomic_sub_x2 dst, src0, src1 :ref:`glc` - s_atomic_swap dst, src0, src1 :ref:`glc` - s_atomic_swap_x2 dst, src0, src1 :ref:`glc` - s_atomic_umax dst, src0, src1 :ref:`glc` - s_atomic_umax_x2 dst, src0, src1 :ref:`glc` - s_atomic_umin dst, src0, src1 :ref:`glc` - s_atomic_umin_x2 dst, src0, src1 :ref:`glc` - s_atomic_xor dst, src0, src1 :ref:`glc` - s_atomic_xor_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_add dst, src0, src1 :ref:`glc` - s_buffer_atomic_add_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_and dst, src0, src1 :ref:`glc` - s_buffer_atomic_and_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_cmpswap dst, src0, src1 :ref:`glc` - s_buffer_atomic_cmpswap_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_dec dst, src0, src1 :ref:`glc` - s_buffer_atomic_dec_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_inc dst, src0, src1 :ref:`glc` - s_buffer_atomic_inc_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_or dst, src0, src1 :ref:`glc` - s_buffer_atomic_or_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_smax dst, src0, src1 :ref:`glc` - s_buffer_atomic_smax_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_smin dst, src0, src1 :ref:`glc` - s_buffer_atomic_smin_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_sub dst, src0, src1 :ref:`glc` - s_buffer_atomic_sub_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_swap dst, src0, src1 :ref:`glc` - s_buffer_atomic_swap_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_umax dst, src0, src1 :ref:`glc` - s_buffer_atomic_umax_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_umin dst, src0, src1 :ref:`glc` - s_buffer_atomic_umin_x2 dst, src0, src1 :ref:`glc` - s_buffer_atomic_xor dst, src0, src1 :ref:`glc` - s_buffer_atomic_xor_x2 dst, src0, src1 :ref:`glc` - s_buffer_load_dword dst, src0, src1 :ref:`glc` - s_buffer_load_dwordx16 dst, src0, src1 :ref:`glc` - s_buffer_load_dwordx2 dst, src0, src1 :ref:`glc` - s_buffer_load_dwordx4 dst, src0, src1 :ref:`glc` - s_buffer_load_dwordx8 dst, src0, src1 :ref:`glc` - s_buffer_store_dword src0, src1, src2 :ref:`glc` - s_buffer_store_dwordx2 src0, src1, src2 :ref:`glc` - s_buffer_store_dwordx4 src0, src1, src2 :ref:`glc` - s_dcache_discard src0, src1 - s_dcache_discard_x2 src0, src1 - s_dcache_inv - s_dcache_inv_vol - s_dcache_wb - s_dcache_wb_vol - s_load_dword dst, src0, src1 :ref:`glc` - s_load_dwordx16 dst, src0, src1 :ref:`glc` - s_load_dwordx2 dst, src0, src1 :ref:`glc` - s_load_dwordx4 dst, src0, src1 :ref:`glc` - s_load_dwordx8 dst, src0, src1 :ref:`glc` - s_memrealtime dst - s_memtime dst - s_scratch_load_dword dst, src0, src1 :ref:`glc` - s_scratch_load_dwordx2 dst, src0, src1 :ref:`glc` - s_scratch_load_dwordx4 dst, src0, src1 :ref:`glc` - s_scratch_store_dword src0, src1, src2 :ref:`glc` - s_scratch_store_dwordx2 src0, src1, src2 :ref:`glc` - s_scratch_store_dwordx4 src0, src1, src2 :ref:`glc` - s_store_dword src0, src1, src2 :ref:`glc` - s_store_dwordx2 src0, src1, src2 :ref:`glc` - s_store_dwordx4 src0, src1, src2 :ref:`glc` - -SOP1 -=========================== - -.. parsed-literal:: - - s_abs_i32 dst, src0 - s_and_saveexec_b64 dst, src0 - s_andn1_saveexec_b64 dst, src0 - s_andn1_wrexec_b64 dst, src0 - s_andn2_saveexec_b64 dst, src0 - s_andn2_wrexec_b64 dst, src0 - s_bcnt0_i32_b32 dst, src0 - s_bcnt0_i32_b64 dst, src0 - s_bcnt1_i32_b32 dst, src0 - s_bcnt1_i32_b64 dst, src0 - s_bitreplicate_b64_b32 dst, src0 - s_bitset0_b32 dst, src0 - s_bitset0_b64 dst, src0 - s_bitset1_b32 dst, src0 - s_bitset1_b64 dst, src0 - s_brev_b32 dst, src0 - s_brev_b64 dst, src0 - s_cbranch_join src0 - s_cmov_b32 dst, src0 - s_cmov_b64 dst, src0 - s_ff0_i32_b32 dst, src0 - s_ff0_i32_b64 dst, src0 - s_ff1_i32_b32 dst, src0 - s_ff1_i32_b64 dst, src0 - s_flbit_i32 dst, src0 - s_flbit_i32_b32 dst, src0 - s_flbit_i32_b64 dst, src0 - s_flbit_i32_i64 dst, src0 - s_getpc_b64 dst - s_mov_b32 dst, src0 - s_mov_b64 dst, src0 - s_mov_fed_b32 dst, src0 - s_movreld_b32 dst, src0 - s_movreld_b64 dst, src0 - s_movrels_b32 dst, src0 - s_movrels_b64 dst, src0 - s_nand_saveexec_b64 dst, src0 - s_nor_saveexec_b64 dst, src0 - s_not_b32 dst, src0 - s_not_b64 dst, src0 - s_or_saveexec_b64 dst, src0 - s_orn1_saveexec_b64 dst, src0 - s_orn2_saveexec_b64 dst, src0 - s_quadmask_b32 dst, src0 - s_quadmask_b64 dst, src0 - s_rfe_b64 src0 - s_set_gpr_idx_idx src0 - s_setpc_b64 src0 - s_sext_i32_i16 dst, src0 - s_sext_i32_i8 dst, src0 - s_swappc_b64 dst, src0 - s_wqm_b32 dst, src0 - s_wqm_b64 dst, src0 - s_xnor_saveexec_b64 dst, src0 - s_xor_saveexec_b64 dst, src0 - -SOP2 -=========================== - -.. parsed-literal:: - - s_absdiff_i32 dst, src0, src1 - s_add_i32 dst, src0, src1 - s_add_u32 dst, src0, src1 - s_addc_u32 dst, src0, src1 - s_and_b32 dst, src0, src1 - s_and_b64 dst, src0, src1 - s_andn2_b32 dst, src0, src1 - s_andn2_b64 dst, src0, src1 - s_ashr_i32 dst, src0, src1 - s_ashr_i64 dst, src0, src1 - s_bfe_i32 dst, src0, src1 - s_bfe_i64 dst, src0, src1 - s_bfe_u32 dst, src0, src1 - s_bfe_u64 dst, src0, src1 - s_bfm_b32 dst, src0, src1 - s_bfm_b64 dst, src0, src1 - s_cbranch_g_fork src0, src1 - s_cselect_b32 dst, src0, src1 - s_cselect_b64 dst, src0, src1 - s_lshl1_add_u32 dst, src0, src1 - s_lshl2_add_u32 dst, src0, src1 - s_lshl3_add_u32 dst, src0, src1 - s_lshl4_add_u32 dst, src0, src1 - s_lshl_b32 dst, src0, src1 - s_lshl_b64 dst, src0, src1 - s_lshr_b32 dst, src0, src1 - s_lshr_b64 dst, src0, src1 - s_max_i32 dst, src0, src1 - s_max_u32 dst, src0, src1 - s_min_i32 dst, src0, src1 - s_min_u32 dst, src0, src1 - s_mul_hi_i32 dst, src0, src1 - s_mul_hi_u32 dst, src0, src1 - s_mul_i32 dst, src0, src1 - s_nand_b32 dst, src0, src1 - s_nand_b64 dst, src0, src1 - s_nor_b32 dst, src0, src1 - s_nor_b64 dst, src0, src1 - s_or_b32 dst, src0, src1 - s_or_b64 dst, src0, src1 - s_orn2_b32 dst, src0, src1 - s_orn2_b64 dst, src0, src1 - s_pack_hh_b32_b16 dst, src0, src1 - s_pack_lh_b32_b16 dst, src0, src1 - s_pack_ll_b32_b16 dst, src0, src1 - s_rfe_restore_b64 src0, src1 - s_sub_i32 dst, src0, src1 - s_sub_u32 dst, src0, src1 - s_subb_u32 dst, src0, src1 - s_xnor_b32 dst, src0, src1 - s_xnor_b64 dst, src0, src1 - s_xor_b32 dst, src0, src1 - s_xor_b64 dst, src0, src1 - -SOPC -=========================== - -.. parsed-literal:: - - s_bitcmp0_b32 src0, src1 - s_bitcmp0_b64 src0, src1 - s_bitcmp1_b32 src0, src1 - s_bitcmp1_b64 src0, src1 - s_cmp_eq_i32 src0, src1 - s_cmp_eq_u32 src0, src1 - s_cmp_eq_u64 src0, src1 - s_cmp_ge_i32 src0, src1 - s_cmp_ge_u32 src0, src1 - s_cmp_gt_i32 src0, src1 - s_cmp_gt_u32 src0, src1 - s_cmp_le_i32 src0, src1 - s_cmp_le_u32 src0, src1 - s_cmp_lg_i32 src0, src1 - s_cmp_lg_u32 src0, src1 - s_cmp_lg_u64 src0, src1 - s_cmp_lt_i32 src0, src1 - s_cmp_lt_u32 src0, src1 - s_set_gpr_idx_on src0, src1 - s_setvskip src0, src1 - -SOPK -=========================== - -.. parsed-literal:: - - s_addk_i32 dst, src0 - s_call_b64 dst, src0 - s_cbranch_i_fork src0, src1 - s_cmovk_i32 dst, src0 - s_cmpk_eq_i32 src0, src1 - s_cmpk_eq_u32 src0, src1 - s_cmpk_ge_i32 src0, src1 - s_cmpk_ge_u32 src0, src1 - s_cmpk_gt_i32 src0, src1 - s_cmpk_gt_u32 src0, src1 - s_cmpk_le_i32 src0, src1 - s_cmpk_le_u32 src0, src1 - s_cmpk_lg_i32 src0, src1 - s_cmpk_lg_u32 src0, src1 - s_cmpk_lt_i32 src0, src1 - s_cmpk_lt_u32 src0, src1 - s_getreg_b32 dst, src0 - s_movk_i32 dst, src0 - s_mulk_i32 dst, src0 - s_setreg_b32 dst, src0 - s_setreg_imm32_b32 dst, src0 - -SOPP -=========================== - -.. parsed-literal:: - - s_barrier - s_branch src0 - s_cbranch_cdbgsys src0 - s_cbranch_cdbgsys_and_user src0 - s_cbranch_cdbgsys_or_user src0 - s_cbranch_cdbguser src0 - s_cbranch_execnz src0 - s_cbranch_execz src0 - s_cbranch_scc0 src0 - s_cbranch_scc1 src0 - s_cbranch_vccnz src0 - s_cbranch_vccz src0 - s_decperflevel src0 - s_endpgm - s_endpgm_ordered_ps_done - s_endpgm_saved - s_icache_inv - s_incperflevel src0 - s_nop src0 - s_sendmsg src0 - s_sendmsghalt src0 - s_set_gpr_idx_mode src0 - s_set_gpr_idx_off - s_sethalt src0 - s_setkill src0 - s_setprio src0 - s_sleep src0 - s_trap src0 - s_ttracedata - s_waitcnt src0 - s_wakeup - -VINTRP -=========================== - -.. parsed-literal:: - - v_interp_mov_f32 dst, src0, src1 - v_interp_p1_f32 dst, src0, src1 - v_interp_p2_f32 dst, src0, src1 - -VOP1 -=========================== - -.. parsed-literal:: - - v_bfrev_b32 dst, src0 - v_bfrev_b32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_bfrev_b32_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ceil_f16 dst, src0 - v_ceil_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ceil_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ceil_f32 dst, src0 - v_ceil_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ceil_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ceil_f64 dst, src0 - v_clrexcp - v_cos_f16 dst, src0 - v_cos_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cos_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cos_f32 dst, src0 - v_cos_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cos_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f16_f32 dst, src0 - v_cvt_f16_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f16_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f16_i16 dst, src0 - v_cvt_f16_i16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f16_i16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f16_u16 dst, src0 - v_cvt_f16_u16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f16_u16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_f16 dst, src0 - v_cvt_f32_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_f64 dst, src0 - v_cvt_f32_i32 dst, src0 - v_cvt_f32_i32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_i32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_u32 dst, src0 - v_cvt_f32_u32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_u32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_ubyte0 dst, src0 - v_cvt_f32_ubyte0_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_ubyte0_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_ubyte1 dst, src0 - v_cvt_f32_ubyte1_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_ubyte1_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_ubyte2 dst, src0 - v_cvt_f32_ubyte2_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_ubyte2_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f32_ubyte3 dst, src0 - v_cvt_f32_ubyte3_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_f32_ubyte3_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_f64_f32 dst, src0 - v_cvt_f64_i32 dst, src0 - v_cvt_f64_u32 dst, src0 - v_cvt_flr_i32_f32 dst, src0 - v_cvt_flr_i32_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_flr_i32_f32_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_i16_f16 dst, src0 - v_cvt_i16_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_i16_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_i32_f32 dst, src0 - v_cvt_i32_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_i32_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_i32_f64 dst, src0 - v_cvt_norm_i16_f16 dst, src0 - v_cvt_norm_i16_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_norm_i16_f16_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_norm_u16_f16 dst, src0 - v_cvt_norm_u16_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_norm_u16_f16_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_off_f32_i4 dst, src0 - v_cvt_off_f32_i4_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_off_f32_i4_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_rpi_i32_f32 dst, src0 - v_cvt_rpi_i32_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_rpi_i32_f32_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_u16_f16 dst, src0 - v_cvt_u16_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_u16_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_u32_f32 dst, src0 - v_cvt_u32_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cvt_u32_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_cvt_u32_f64 dst, src0 - v_exp_f16 dst, src0 - v_exp_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_exp_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_exp_f32 dst, src0 - v_exp_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_exp_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_exp_legacy_f32 dst, src0 - v_exp_legacy_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_exp_legacy_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ffbh_i32 dst, src0 - v_ffbh_i32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ffbh_i32_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ffbh_u32 dst, src0 - v_ffbh_u32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ffbh_u32_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_ffbl_b32 dst, src0 - v_ffbl_b32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ffbl_b32_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_floor_f16 dst, src0 - v_floor_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_floor_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_floor_f32 dst, src0 - v_floor_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_floor_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_floor_f64 dst, src0 - v_fract_f16 dst, src0 - v_fract_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_fract_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_fract_f32 dst, src0 - v_fract_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_fract_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_fract_f64 dst, src0 - v_frexp_exp_i16_f16 dst, src0 - v_frexp_exp_i16_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_frexp_exp_i16_f16_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_frexp_exp_i32_f32 dst, src0 - v_frexp_exp_i32_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_frexp_exp_i32_f32_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_frexp_exp_i32_f64 dst, src0 - v_frexp_mant_f16 dst, src0 - v_frexp_mant_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_frexp_mant_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_frexp_mant_f32 dst, src0 - v_frexp_mant_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_frexp_mant_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_frexp_mant_f64 dst, src0 - v_log_f16 dst, src0 - v_log_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_log_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_log_f32 dst, src0 - v_log_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_log_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_log_legacy_f32 dst, src0 - v_log_legacy_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_log_legacy_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_mov_b32 dst, src0 - v_mov_b32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mov_b32_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_mov_fed_b32 dst, src0 - v_mov_fed_b32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mov_fed_b32_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_nop - v_not_b32 dst, src0 - v_not_b32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_not_b32_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rcp_f16 dst, src0 - v_rcp_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rcp_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rcp_f32 dst, src0 - v_rcp_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rcp_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rcp_f64 dst, src0 - v_rcp_iflag_f32 dst, src0 - v_rcp_iflag_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rcp_iflag_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_readfirstlane_b32 dst, src0 - v_rndne_f16 dst, src0 - v_rndne_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rndne_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rndne_f32 dst, src0 - v_rndne_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rndne_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rndne_f64 dst, src0 - v_rsq_f16 dst, src0 - v_rsq_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rsq_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rsq_f32 dst, src0 - v_rsq_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_rsq_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_rsq_f64 dst, src0 - v_sat_pk_u8_i16 dst, src0 - v_sat_pk_u8_i16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sat_pk_u8_i16_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_screen_partition_4se_b32 dst, src0 - v_screen_partition_4se_b32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_screen_partition_4se_b32_sdwa dst, src0 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_sin_f16 dst, src0 - v_sin_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sin_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_sin_f32 dst, src0 - v_sin_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sin_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_sqrt_f16 dst, src0 - v_sqrt_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sqrt_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_sqrt_f32 dst, src0 - v_sqrt_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sqrt_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_sqrt_f64 dst, src0 - v_swap_b32 dst, src0 - v_trunc_f16 dst, src0 - v_trunc_f16_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_trunc_f16_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_trunc_f32 dst, src0 - v_trunc_f32_dpp dst, src0 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_trunc_f32_sdwa dst, src0 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` - v_trunc_f64 dst, src0 - -VOP2 -=========================== - -.. parsed-literal:: - - v_add_co_u32 dst0, dst1, src0, src1 - v_add_co_u32_dpp dst0, dst1, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_add_co_u32_sdwa dst0, dst1, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_add_f16 dst, src0, src1 - v_add_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_add_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_add_f32 dst, src0, src1 - v_add_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_add_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_add_u16 dst, src0, src1 - v_add_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_add_u16_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_add_u32 dst, src0, src1 - v_add_u32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_add_u32_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_addc_co_u32 dst0, dst1, src0, src1, src2 - v_addc_co_u32_dpp dst0, dst1, src0, src1, src2 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_addc_co_u32_sdwa dst0, dst1, src0, src1, src2 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_and_b32 dst, src0, src1 - v_and_b32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_and_b32_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_ashrrev_i16 dst, src0, src1 - v_ashrrev_i16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ashrrev_i16_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_ashrrev_i32 dst, src0, src1 - v_ashrrev_i32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ashrrev_i32_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_cndmask_b32 dst, src0, src1, src2 - v_cndmask_b32_dpp dst, src0, src1, src2 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_cndmask_b32_sdwa dst, src0, src1, src2 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_ldexp_f16 dst, src0, src1 - v_ldexp_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_ldexp_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_lshlrev_b16 dst, src0, src1 - v_lshlrev_b16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_lshlrev_b16_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_lshlrev_b32 dst, src0, src1 - v_lshlrev_b32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_lshlrev_b32_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_lshrrev_b16 dst, src0, src1 - v_lshrrev_b16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_lshrrev_b16_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_lshrrev_b32 dst, src0, src1 - v_lshrrev_b32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_lshrrev_b32_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mac_f16 dst, src0, src1 - v_mac_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mac_f32 dst, src0, src1 - v_mac_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_madak_f16 dst, src0, src1, src2 - v_madak_f32 dst, src0, src1, src2 - v_madmk_f16 dst, src0, src1, src2 - v_madmk_f32 dst, src0, src1, src2 - v_max_f16 dst, src0, src1 - v_max_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_max_f32 dst, src0, src1 - v_max_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_max_i16 dst, src0, src1 - v_max_i16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_i16_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_max_i32 dst, src0, src1 - v_max_i32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_i32_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_max_u16 dst, src0, src1 - v_max_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_u16_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_max_u32 dst, src0, src1 - v_max_u32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_max_u32_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_f16 dst, src0, src1 - v_min_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_f32 dst, src0, src1 - v_min_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_i16 dst, src0, src1 - v_min_i16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_i16_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_i32 dst, src0, src1 - v_min_i32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_i32_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_u16 dst, src0, src1 - v_min_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_u16_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_min_u32 dst, src0, src1 - v_min_u32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_min_u32_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_f16 dst, src0, src1 - v_mul_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_f32 dst, src0, src1 - v_mul_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_hi_i32_i24 dst, src0, src1 - v_mul_hi_i32_i24_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_hi_i32_i24_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_hi_u32_u24 dst, src0, src1 - v_mul_hi_u32_u24_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_hi_u32_u24_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_i32_i24 dst, src0, src1 - v_mul_i32_i24_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_i32_i24_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_legacy_f32 dst, src0, src1 - v_mul_legacy_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_legacy_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_lo_u16 dst, src0, src1 - v_mul_lo_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_lo_u16_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_mul_u32_u24 dst, src0, src1 - v_mul_u32_u24_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_mul_u32_u24_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_or_b32 dst, src0, src1 - v_or_b32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_or_b32_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_sub_co_u32 dst0, dst1, src0, src1 - v_sub_co_u32_dpp dst0, dst1, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sub_co_u32_sdwa dst0, dst1, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_sub_f16 dst, src0, src1 - v_sub_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sub_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_sub_f32 dst, src0, src1 - v_sub_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sub_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_sub_u16 dst, src0, src1 - v_sub_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sub_u16_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_sub_u32 dst, src0, src1 - v_sub_u32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_sub_u32_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subb_co_u32 dst0, dst1, src0, src1, src2 - v_subb_co_u32_dpp dst0, dst1, src0, src1, src2 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subb_co_u32_sdwa dst0, dst1, src0, src1, src2 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subbrev_co_u32 dst0, dst1, src0, src1, src2 - v_subbrev_co_u32_dpp dst0, dst1, src0, src1, src2 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subbrev_co_u32_sdwa dst0, dst1, src0, src1, src2 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subrev_co_u32 dst0, dst1, src0, src1 - v_subrev_co_u32_dpp dst0, dst1, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subrev_co_u32_sdwa dst0, dst1, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subrev_f16 dst, src0, src1 - v_subrev_f16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subrev_f16_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subrev_f32 dst, src0, src1 - v_subrev_f32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subrev_f32_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subrev_u16 dst, src0, src1 - v_subrev_u16_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subrev_u16_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_subrev_u32 dst, src0, src1 - v_subrev_u32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_subrev_u32_sdwa dst, src0, src1 :ref:`clamp` :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - v_xor_b32 dst, src0, src1 - v_xor_b32_dpp dst, src0, src1 :ref:`dpp_ctrl` :ref:`row_mask` :ref:`bank_mask` :ref:`bound_ctrl` - v_xor_b32_sdwa dst, src0, src1 :ref:`omod` :ref:`dst_sel` :ref:`dst_unused` :ref:`src0_sel` :ref:`src1_sel` - -VOP3 -=========================== - -.. parsed-literal:: - - v_add3_u32 dst, src0, src1, src2 :ref:`omod` - v_add_co_u32_e64 dst0, dst1, src0, src1 :ref:`omod` - v_add_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_add_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_add_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_add_i16 dst, src0, src1 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_add_i32 dst, src0, src1 :ref:`omod` - v_add_lshl_u32 dst, src0, src1, src2 :ref:`omod` - v_add_u16_e64 dst, src0, src1 :ref:`omod` - v_add_u32_e64 dst, src0, src1 :ref:`omod` - v_addc_co_u32_e64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_alignbit_b32 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`omod` - v_alignbyte_b32 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`omod` - v_and_b32_e64 dst, src0, src1 :ref:`omod` - v_and_or_b32 dst, src0, src1, src2 :ref:`omod` - v_ashrrev_i16_e64 dst, src0, src1 :ref:`omod` - v_ashrrev_i32_e64 dst, src0, src1 :ref:`omod` - v_ashrrev_i64 dst, src0, src1 :ref:`omod` - v_bcnt_u32_b32 dst, src0, src1 :ref:`omod` - v_bfe_i32 dst, src0, src1, src2 :ref:`omod` - v_bfe_u32 dst, src0, src1, src2 :ref:`omod` - v_bfi_b32 dst, src0, src1, src2 :ref:`omod` - v_bfm_b32 dst, src0, src1 :ref:`omod` - v_bfrev_b32_e64 dst, src0 :ref:`omod` - v_ceil_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_ceil_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_ceil_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_clrexcp_e64 :ref:`omod` - v_cmp_class_f16_e64 dst, src0, src1 :ref:`omod` - v_cmp_class_f32_e64 dst, src0, src1 :ref:`omod` - v_cmp_class_f64_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_eq_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_eq_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_eq_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_eq_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_f_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_f_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_f_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_f_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ge_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ge_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ge_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ge_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_gt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_gt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_gt_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_gt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_le_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_le_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_le_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_le_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_lg_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lg_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lg_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_lt_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_lt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_ne_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_neq_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_neq_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_neq_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nge_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nge_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nge_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ngt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ngt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_ngt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nle_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nle_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nle_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlg_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlg_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlg_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_nlt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_o_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_o_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_o_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_t_i16_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_i32_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_i64_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_u16_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_u32_e64 dst, src0, src1 :ref:`omod` - v_cmp_t_u64_e64 dst, src0, src1 :ref:`omod` - v_cmp_tru_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_tru_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_tru_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_u_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_u_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmp_u_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_class_f16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_class_f32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_class_f64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_eq_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_eq_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_eq_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_eq_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_f_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_f_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_f_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_f_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ge_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ge_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ge_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ge_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_gt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_gt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_gt_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_gt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_le_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_le_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_le_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_le_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lg_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lg_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lg_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_lt_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_lt_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_ne_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_neq_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_neq_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_neq_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nge_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nge_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nge_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ngt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ngt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_ngt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nle_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nle_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nle_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlg_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlg_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlg_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlt_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlt_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_nlt_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_o_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_o_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_o_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_t_i16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_i32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_i64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_u16_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_u32_e64 dst, src0, src1 :ref:`omod` - v_cmpx_t_u64_e64 dst, src0, src1 :ref:`omod` - v_cmpx_tru_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_tru_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_tru_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_u_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_u_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cmpx_u_f64_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_cndmask_b32_e64 dst, src0, src1, src2 :ref:`omod` - v_cos_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cos_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cubeid_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cubema_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cubesc_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cubetc_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_cvt_f16_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f16_i16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f16_u16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_i32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_u32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_ubyte0_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_ubyte1_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_ubyte2_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f32_ubyte3_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f64_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f64_i32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_f64_u32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_flr_i32_f32_e64 dst, src0 :ref:`omod` - v_cvt_i16_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_i32_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_i32_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_norm_i16_f16_e64 dst, src0 :ref:`omod` - v_cvt_norm_u16_f16_e64 dst, src0 :ref:`omod` - v_cvt_off_f32_i4_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_pk_i16_i32 dst, src0, src1 :ref:`omod` - v_cvt_pk_u16_u32 dst, src0, src1 :ref:`omod` - v_cvt_pk_u8_f32 dst, src0, src1, src2 :ref:`omod` - v_cvt_pkaccum_u8_f32 dst, src0, src1 :ref:`omod` - v_cvt_pknorm_i16_f16 dst, src0, src1 :ref:`vop3_op_sel` :ref:`omod` - v_cvt_pknorm_i16_f32 dst, src0, src1 :ref:`omod` - v_cvt_pknorm_u16_f16 dst, src0, src1 :ref:`vop3_op_sel` :ref:`omod` - v_cvt_pknorm_u16_f32 dst, src0, src1 :ref:`omod` - v_cvt_pkrtz_f16_f32 dst, src0, src1 :ref:`omod` - v_cvt_rpi_i32_f32_e64 dst, src0 :ref:`omod` - v_cvt_u16_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_u32_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_cvt_u32_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_div_fixup_f16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_div_fixup_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_fixup_f64 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_fixup_legacy_f16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_div_fmas_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_fmas_f64 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_div_scale_f32 dst0, dst1, src0, src1, src2 :ref:`omod` - v_div_scale_f64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_exp_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_exp_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_exp_legacy_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_ffbh_i32_e64 dst, src0 :ref:`omod` - v_ffbh_u32_e64 dst, src0 :ref:`omod` - v_ffbl_b32_e64 dst, src0 :ref:`omod` - v_floor_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_floor_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_floor_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_fma_f16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_fma_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_fma_f64 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_fma_legacy_f16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_fract_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_fract_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_fract_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_frexp_exp_i16_f16_e64 dst, src0 :ref:`omod` - v_frexp_exp_i32_f32_e64 dst, src0 :ref:`omod` - v_frexp_exp_i32_f64_e64 dst, src0 :ref:`omod` - v_frexp_mant_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_frexp_mant_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_frexp_mant_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_interp_mov_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_interp_p1_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_interp_p1ll_f16 dst, src0, src1 :ref:`high` :ref:`clamp` :ref:`omod` - v_interp_p1lv_f16 dst, src0, src1, src2 :ref:`high` :ref:`clamp` :ref:`omod` - v_interp_p2_f16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`high` :ref:`clamp` :ref:`omod` - v_interp_p2_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_interp_p2_legacy_f16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`high` :ref:`clamp` :ref:`omod` - v_ldexp_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_ldexp_f32 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_ldexp_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_lerp_u8 dst, src0, src1, src2 :ref:`omod` - v_log_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_log_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_log_legacy_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_lshl_add_u32 dst, src0, src1, src2 :ref:`omod` - v_lshl_or_b32 dst, src0, src1, src2 :ref:`omod` - v_lshlrev_b16_e64 dst, src0, src1 :ref:`omod` - v_lshlrev_b32_e64 dst, src0, src1 :ref:`omod` - v_lshlrev_b64 dst, src0, src1 :ref:`omod` - v_lshrrev_b16_e64 dst, src0, src1 :ref:`omod` - v_lshrrev_b32_e64 dst, src0, src1 :ref:`omod` - v_lshrrev_b64 dst, src0, src1 :ref:`omod` - v_mac_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mac_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mad_f16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_mad_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_i16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_mad_i32_i16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_mad_i32_i24 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_i64_i32 dst0, dst1, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_legacy_f16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_mad_legacy_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_legacy_i16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_mad_legacy_u16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_mad_u16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_mad_u32_u16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_mad_u32_u24 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mad_u64_u32 dst0, dst1, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_max3_f16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_max3_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_max3_i16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`omod` - v_max3_i32 dst, src0, src1, src2 :ref:`omod` - v_max3_u16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`omod` - v_max3_u32 dst, src0, src1, src2 :ref:`omod` - v_max_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_max_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_max_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_max_i16_e64 dst, src0, src1 :ref:`omod` - v_max_i32_e64 dst, src0, src1 :ref:`omod` - v_max_u16_e64 dst, src0, src1 :ref:`omod` - v_max_u32_e64 dst, src0, src1 :ref:`omod` - v_mbcnt_hi_u32_b32 dst, src0, src1 :ref:`omod` - v_mbcnt_lo_u32_b32 dst, src0, src1 :ref:`omod` - v_med3_f16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_med3_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_med3_i16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`omod` - v_med3_i32 dst, src0, src1, src2 :ref:`omod` - v_med3_u16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`omod` - v_med3_u32 dst, src0, src1, src2 :ref:`omod` - v_min3_f16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_min3_f32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_min3_i16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`omod` - v_min3_i32 dst, src0, src1, src2 :ref:`omod` - v_min3_u16 dst, src0, src1, src2 :ref:`vop3_op_sel` :ref:`omod` - v_min3_u32 dst, src0, src1, src2 :ref:`omod` - v_min_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_min_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_min_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_min_i16_e64 dst, src0, src1 :ref:`omod` - v_min_i32_e64 dst, src0, src1 :ref:`omod` - v_min_u16_e64 dst, src0, src1 :ref:`omod` - v_min_u32_e64 dst, src0, src1 :ref:`omod` - v_mov_b32_e64 dst, src0 :ref:`omod` - v_mov_fed_b32_e64 dst, src0 :ref:`omod` - v_mqsad_pk_u16_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mqsad_u32_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_msad_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_mul_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mul_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mul_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mul_hi_i32 dst, src0, src1 :ref:`omod` - v_mul_hi_i32_i24_e64 dst, src0, src1 :ref:`omod` - v_mul_hi_u32 dst, src0, src1 :ref:`omod` - v_mul_hi_u32_u24_e64 dst, src0, src1 :ref:`omod` - v_mul_i32_i24_e64 dst, src0, src1 :ref:`omod` - v_mul_legacy_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_mul_lo_u16_e64 dst, src0, src1 :ref:`omod` - v_mul_lo_u32 dst, src0, src1 :ref:`omod` - v_mul_u32_u24_e64 dst, src0, src1 :ref:`omod` - v_nop_e64 :ref:`omod` - v_not_b32_e64 dst, src0 :ref:`omod` - v_or3_b32 dst, src0, src1, src2 :ref:`omod` - v_or_b32_e64 dst, src0, src1 :ref:`omod` - v_pack_b32_f16 dst, src0, src1 :ref:`vop3_op_sel` :ref:`omod` - v_perm_b32 dst, src0, src1, src2 :ref:`omod` - v_qsad_pk_u16_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_rcp_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rcp_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rcp_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rcp_iflag_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_readlane_b32 dst, src0, src1 :ref:`omod` - v_rndne_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rndne_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rndne_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rsq_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rsq_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_rsq_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sad_hi_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_sad_u16 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_sad_u32 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_sad_u8 dst, src0, src1, src2 :ref:`clamp` :ref:`omod` - v_sat_pk_u8_i16_e64 dst, src0 :ref:`omod` - v_screen_partition_4se_b32_e64 dst, src0 :ref:`omod` - v_sin_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sin_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sqrt_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sqrt_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sqrt_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_sub_co_u32_e64 dst0, dst1, src0, src1 :ref:`omod` - v_sub_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_sub_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_sub_i16 dst, src0, src1 :ref:`vop3_op_sel` :ref:`clamp` :ref:`omod` - v_sub_i32 dst, src0, src1 :ref:`omod` - v_sub_u16_e64 dst, src0, src1 :ref:`omod` - v_sub_u32_e64 dst, src0, src1 :ref:`omod` - v_subb_co_u32_e64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_subbrev_co_u32_e64 dst0, dst1, src0, src1, src2 :ref:`omod` - v_subrev_co_u32_e64 dst0, dst1, src0, src1 :ref:`omod` - v_subrev_f16_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_subrev_f32_e64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_subrev_u16_e64 dst, src0, src1 :ref:`omod` - v_subrev_u32_e64 dst, src0, src1 :ref:`omod` - v_trig_preop_f64 dst, src0, src1 :ref:`clamp` :ref:`omod` - v_trunc_f16_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_trunc_f32_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_trunc_f64_e64 dst, src0 :ref:`clamp` :ref:`omod` - v_writelane_b32 dst, src0, src1 :ref:`omod` - v_xad_u32 dst, src0, src1, src2 :ref:`omod` - v_xor_b32_e64 dst, src0, src1 :ref:`omod` - -VOP3P -=========================== - -.. parsed-literal:: - - v_mad_mix_f32 dst, src0, src1, src2 :ref:`mad_mix_op_sel` :ref:`mad_mix_op_sel_hi` :ref:`clamp` - v_mad_mixhi_f16 dst, src0, src1, src2 :ref:`mad_mix_op_sel` :ref:`mad_mix_op_sel_hi` :ref:`clamp` - v_mad_mixlo_f16 dst, src0, src1, src2 :ref:`mad_mix_op_sel` :ref:`mad_mix_op_sel_hi` :ref:`clamp` - v_pk_add_f16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` :ref:`neg_lo` :ref:`neg_hi` :ref:`clamp` - v_pk_add_i16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` - v_pk_add_u16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` - v_pk_ashrrev_i16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` - v_pk_fma_f16 dst, src0, src1, src2 :ref:`op_sel` :ref:`op_sel_hi` :ref:`neg_lo` :ref:`neg_hi` :ref:`clamp` - v_pk_lshlrev_b16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` - v_pk_lshrrev_b16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` - v_pk_mad_i16 dst, src0, src1, src2 :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` - v_pk_mad_u16 dst, src0, src1, src2 :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` - v_pk_max_f16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` :ref:`neg_lo` :ref:`neg_hi` :ref:`clamp` - v_pk_max_i16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` - v_pk_max_u16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` - v_pk_min_f16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` :ref:`neg_lo` :ref:`neg_hi` :ref:`clamp` - v_pk_min_i16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` - v_pk_min_u16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` - v_pk_mul_f16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` :ref:`neg_lo` :ref:`neg_hi` :ref:`clamp` - v_pk_mul_lo_u16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` - v_pk_sub_i16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` - v_pk_sub_u16 dst, src0, src1 :ref:`op_sel` :ref:`op_sel_hi` :ref:`clamp` - -VOPC -=========================== - -.. parsed-literal:: - - v_cmp_class_f16 dst, src0, src1 - v_cmp_class_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_class_f32 dst, src0, src1 - v_cmp_class_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_class_f64 dst, src0, src1 - v_cmp_eq_f16 dst, src0, src1 - v_cmp_eq_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_f32 dst, src0, src1 - v_cmp_eq_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_f64 dst, src0, src1 - v_cmp_eq_i16 dst, src0, src1 - v_cmp_eq_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_i32 dst, src0, src1 - v_cmp_eq_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_i64 dst, src0, src1 - v_cmp_eq_u16 dst, src0, src1 - v_cmp_eq_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_u32 dst, src0, src1 - v_cmp_eq_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_eq_u64 dst, src0, src1 - v_cmp_f_f16 dst, src0, src1 - v_cmp_f_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_f32 dst, src0, src1 - v_cmp_f_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_f64 dst, src0, src1 - v_cmp_f_i16 dst, src0, src1 - v_cmp_f_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_i32 dst, src0, src1 - v_cmp_f_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_i64 dst, src0, src1 - v_cmp_f_u16 dst, src0, src1 - v_cmp_f_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_u32 dst, src0, src1 - v_cmp_f_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_f_u64 dst, src0, src1 - v_cmp_ge_f16 dst, src0, src1 - v_cmp_ge_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_f32 dst, src0, src1 - v_cmp_ge_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_f64 dst, src0, src1 - v_cmp_ge_i16 dst, src0, src1 - v_cmp_ge_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_i32 dst, src0, src1 - v_cmp_ge_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_i64 dst, src0, src1 - v_cmp_ge_u16 dst, src0, src1 - v_cmp_ge_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_u32 dst, src0, src1 - v_cmp_ge_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ge_u64 dst, src0, src1 - v_cmp_gt_f16 dst, src0, src1 - v_cmp_gt_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_f32 dst, src0, src1 - v_cmp_gt_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_f64 dst, src0, src1 - v_cmp_gt_i16 dst, src0, src1 - v_cmp_gt_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_i32 dst, src0, src1 - v_cmp_gt_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_i64 dst, src0, src1 - v_cmp_gt_u16 dst, src0, src1 - v_cmp_gt_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_u32 dst, src0, src1 - v_cmp_gt_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_gt_u64 dst, src0, src1 - v_cmp_le_f16 dst, src0, src1 - v_cmp_le_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_f32 dst, src0, src1 - v_cmp_le_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_f64 dst, src0, src1 - v_cmp_le_i16 dst, src0, src1 - v_cmp_le_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_i32 dst, src0, src1 - v_cmp_le_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_i64 dst, src0, src1 - v_cmp_le_u16 dst, src0, src1 - v_cmp_le_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_u32 dst, src0, src1 - v_cmp_le_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_le_u64 dst, src0, src1 - v_cmp_lg_f16 dst, src0, src1 - v_cmp_lg_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lg_f32 dst, src0, src1 - v_cmp_lg_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lg_f64 dst, src0, src1 - v_cmp_lt_f16 dst, src0, src1 - v_cmp_lt_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_f32 dst, src0, src1 - v_cmp_lt_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_f64 dst, src0, src1 - v_cmp_lt_i16 dst, src0, src1 - v_cmp_lt_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_i32 dst, src0, src1 - v_cmp_lt_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_i64 dst, src0, src1 - v_cmp_lt_u16 dst, src0, src1 - v_cmp_lt_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_u32 dst, src0, src1 - v_cmp_lt_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_lt_u64 dst, src0, src1 - v_cmp_ne_i16 dst, src0, src1 - v_cmp_ne_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ne_i32 dst, src0, src1 - v_cmp_ne_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ne_i64 dst, src0, src1 - v_cmp_ne_u16 dst, src0, src1 - v_cmp_ne_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ne_u32 dst, src0, src1 - v_cmp_ne_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ne_u64 dst, src0, src1 - v_cmp_neq_f16 dst, src0, src1 - v_cmp_neq_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_neq_f32 dst, src0, src1 - v_cmp_neq_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_neq_f64 dst, src0, src1 - v_cmp_nge_f16 dst, src0, src1 - v_cmp_nge_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nge_f32 dst, src0, src1 - v_cmp_nge_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nge_f64 dst, src0, src1 - v_cmp_ngt_f16 dst, src0, src1 - v_cmp_ngt_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ngt_f32 dst, src0, src1 - v_cmp_ngt_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_ngt_f64 dst, src0, src1 - v_cmp_nle_f16 dst, src0, src1 - v_cmp_nle_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nle_f32 dst, src0, src1 - v_cmp_nle_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nle_f64 dst, src0, src1 - v_cmp_nlg_f16 dst, src0, src1 - v_cmp_nlg_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nlg_f32 dst, src0, src1 - v_cmp_nlg_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nlg_f64 dst, src0, src1 - v_cmp_nlt_f16 dst, src0, src1 - v_cmp_nlt_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nlt_f32 dst, src0, src1 - v_cmp_nlt_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_nlt_f64 dst, src0, src1 - v_cmp_o_f16 dst, src0, src1 - v_cmp_o_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_o_f32 dst, src0, src1 - v_cmp_o_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_o_f64 dst, src0, src1 - v_cmp_t_i16 dst, src0, src1 - v_cmp_t_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_t_i32 dst, src0, src1 - v_cmp_t_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_t_i64 dst, src0, src1 - v_cmp_t_u16 dst, src0, src1 - v_cmp_t_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_t_u32 dst, src0, src1 - v_cmp_t_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_t_u64 dst, src0, src1 - v_cmp_tru_f16 dst, src0, src1 - v_cmp_tru_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_tru_f32 dst, src0, src1 - v_cmp_tru_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_tru_f64 dst, src0, src1 - v_cmp_u_f16 dst, src0, src1 - v_cmp_u_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_u_f32 dst, src0, src1 - v_cmp_u_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmp_u_f64 dst, src0, src1 - v_cmpx_class_f16 dst, src0, src1 - v_cmpx_class_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_class_f32 dst, src0, src1 - v_cmpx_class_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_class_f64 dst, src0, src1 - v_cmpx_eq_f16 dst, src0, src1 - v_cmpx_eq_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_f32 dst, src0, src1 - v_cmpx_eq_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_f64 dst, src0, src1 - v_cmpx_eq_i16 dst, src0, src1 - v_cmpx_eq_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_i32 dst, src0, src1 - v_cmpx_eq_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_i64 dst, src0, src1 - v_cmpx_eq_u16 dst, src0, src1 - v_cmpx_eq_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_u32 dst, src0, src1 - v_cmpx_eq_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_eq_u64 dst, src0, src1 - v_cmpx_f_f16 dst, src0, src1 - v_cmpx_f_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_f32 dst, src0, src1 - v_cmpx_f_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_f64 dst, src0, src1 - v_cmpx_f_i16 dst, src0, src1 - v_cmpx_f_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_i32 dst, src0, src1 - v_cmpx_f_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_i64 dst, src0, src1 - v_cmpx_f_u16 dst, src0, src1 - v_cmpx_f_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_u32 dst, src0, src1 - v_cmpx_f_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_f_u64 dst, src0, src1 - v_cmpx_ge_f16 dst, src0, src1 - v_cmpx_ge_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_f32 dst, src0, src1 - v_cmpx_ge_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_f64 dst, src0, src1 - v_cmpx_ge_i16 dst, src0, src1 - v_cmpx_ge_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_i32 dst, src0, src1 - v_cmpx_ge_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_i64 dst, src0, src1 - v_cmpx_ge_u16 dst, src0, src1 - v_cmpx_ge_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_u32 dst, src0, src1 - v_cmpx_ge_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ge_u64 dst, src0, src1 - v_cmpx_gt_f16 dst, src0, src1 - v_cmpx_gt_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_f32 dst, src0, src1 - v_cmpx_gt_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_f64 dst, src0, src1 - v_cmpx_gt_i16 dst, src0, src1 - v_cmpx_gt_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_i32 dst, src0, src1 - v_cmpx_gt_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_i64 dst, src0, src1 - v_cmpx_gt_u16 dst, src0, src1 - v_cmpx_gt_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_u32 dst, src0, src1 - v_cmpx_gt_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_gt_u64 dst, src0, src1 - v_cmpx_le_f16 dst, src0, src1 - v_cmpx_le_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_f32 dst, src0, src1 - v_cmpx_le_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_f64 dst, src0, src1 - v_cmpx_le_i16 dst, src0, src1 - v_cmpx_le_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_i32 dst, src0, src1 - v_cmpx_le_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_i64 dst, src0, src1 - v_cmpx_le_u16 dst, src0, src1 - v_cmpx_le_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_u32 dst, src0, src1 - v_cmpx_le_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_le_u64 dst, src0, src1 - v_cmpx_lg_f16 dst, src0, src1 - v_cmpx_lg_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lg_f32 dst, src0, src1 - v_cmpx_lg_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lg_f64 dst, src0, src1 - v_cmpx_lt_f16 dst, src0, src1 - v_cmpx_lt_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_f32 dst, src0, src1 - v_cmpx_lt_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_f64 dst, src0, src1 - v_cmpx_lt_i16 dst, src0, src1 - v_cmpx_lt_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_i32 dst, src0, src1 - v_cmpx_lt_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_i64 dst, src0, src1 - v_cmpx_lt_u16 dst, src0, src1 - v_cmpx_lt_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_u32 dst, src0, src1 - v_cmpx_lt_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_lt_u64 dst, src0, src1 - v_cmpx_ne_i16 dst, src0, src1 - v_cmpx_ne_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ne_i32 dst, src0, src1 - v_cmpx_ne_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ne_i64 dst, src0, src1 - v_cmpx_ne_u16 dst, src0, src1 - v_cmpx_ne_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ne_u32 dst, src0, src1 - v_cmpx_ne_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ne_u64 dst, src0, src1 - v_cmpx_neq_f16 dst, src0, src1 - v_cmpx_neq_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_neq_f32 dst, src0, src1 - v_cmpx_neq_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_neq_f64 dst, src0, src1 - v_cmpx_nge_f16 dst, src0, src1 - v_cmpx_nge_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nge_f32 dst, src0, src1 - v_cmpx_nge_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nge_f64 dst, src0, src1 - v_cmpx_ngt_f16 dst, src0, src1 - v_cmpx_ngt_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ngt_f32 dst, src0, src1 - v_cmpx_ngt_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_ngt_f64 dst, src0, src1 - v_cmpx_nle_f16 dst, src0, src1 - v_cmpx_nle_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nle_f32 dst, src0, src1 - v_cmpx_nle_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nle_f64 dst, src0, src1 - v_cmpx_nlg_f16 dst, src0, src1 - v_cmpx_nlg_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nlg_f32 dst, src0, src1 - v_cmpx_nlg_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nlg_f64 dst, src0, src1 - v_cmpx_nlt_f16 dst, src0, src1 - v_cmpx_nlt_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nlt_f32 dst, src0, src1 - v_cmpx_nlt_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_nlt_f64 dst, src0, src1 - v_cmpx_o_f16 dst, src0, src1 - v_cmpx_o_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_o_f32 dst, src0, src1 - v_cmpx_o_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_o_f64 dst, src0, src1 - v_cmpx_t_i16 dst, src0, src1 - v_cmpx_t_i16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_t_i32 dst, src0, src1 - v_cmpx_t_i32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_t_i64 dst, src0, src1 - v_cmpx_t_u16 dst, src0, src1 - v_cmpx_t_u16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_t_u32 dst, src0, src1 - v_cmpx_t_u32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_t_u64 dst, src0, src1 - v_cmpx_tru_f16 dst, src0, src1 - v_cmpx_tru_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_tru_f32 dst, src0, src1 - v_cmpx_tru_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_tru_f64 dst, src0, src1 - v_cmpx_u_f16 dst, src0, src1 - v_cmpx_u_f16_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_u_f32 dst, src0, src1 - v_cmpx_u_f32_sdwa dst, src0, src1 :ref:`src0_sel` :ref:`src1_sel` - v_cmpx_u_f64 dst, src0, src1 diff --git a/docs/AMDGPUInstructionNotation.rst b/docs/AMDGPUInstructionNotation.rst new file mode 100644 index 0000000000000000000000000000000000000000..2b41d5b81949f4a478ac6e625a5eb5951e2d0cbf --- /dev/null +++ b/docs/AMDGPUInstructionNotation.rst @@ -0,0 +1,110 @@ +============================ +AMDGPU Instructions Notation +============================ + +.. contents:: + :local: + +.. _amdgpu_syn_instruction_notation: + +Introduction +============ + +This is an overview of notation used to describe the syntax of AMDGPU assembler instructions. + +This notation mimics the :ref:`syntax of assembler instructions` +except that instead of real operands and modifiers it provides references to their description. + +Instructions +============ + +Notation +~~~~~~~~ + +This is the notation used to describe AMDGPU instructions: + + ``<``\ :ref:`opcode description`\ ``> <``\ :ref:`operands description`\ ``> <``\ :ref:`modifiers description`\ ``>`` + +.. _amdgpu_syn_opcode_notation: + +Opcode +====== + +Notation +~~~~~~~~ + +TBD + +.. _amdgpu_syn_instruction_operands_notation: + +Operands +======== + +An instruction may have zero or more *operands*. They are comma-separated in the description: + + ``<``\ :ref:`description of operand 0`\ ``>, <``\ :ref:`description of operand 1`\ ``>, ...`` + +The order of *operands* is fixed. *Operands* cannot be omitted +except for special cases described below. + +.. _amdgpu_syn_instruction_operand_notation: + +Notation +~~~~~~~~ + +An operand is described using the following notation: + + *...* + +Where: + +* *name* is a link to a description of the operand. +* *tags* are optional. They are used to indicate special operand properties: + +.. _amdgpu_syn_instruction_operand_tags: + + ============== ================================================================================= + Operand tag Meaning + ============== ================================================================================= + :opt An optional operand. + :m An operand which may be used with + :ref:`VOP3 operand modifiers` or + :ref:`SDWA operand modifiers`. + :dst An input operand which may also serve as a destination + if :ref:`glc` modifier is specified. + :fx This is an *f32* or *f16* operand depending on + :ref:`mad_mix_op_sel_hi` modifier. + : Operand *type* differs from *type* + :ref:`implied by the opcode name`. + This tag specifies actual operand *type*. + ============== ================================================================================= + +Examples: + +.. parsed-literal:: + + src1:m // src1 operand may be used with operand modifiers + vdata:dst // vdata operand may be used as both source and destination + vdst:u32 // vdst operand has u32 type + +.. _amdgpu_syn_instruction_modifiers_notation: + +Modifiers +========= + +An instruction may have zero or more optional *modifiers*. They are space-separated in the description: + + ``<``\ :ref:`description of modifier 0`\ ``> <``\ :ref:`description of modifier 1`\ ``> ...`` + +The order of *modifiers* is fixed. + +.. _amdgpu_syn_instruction_modifier_notation: + +Notation +~~~~~~~~ + +A *modifier* is described using the following notation: + + ** + +Where *name* is a link to a description of the *modifier*. diff --git a/docs/AMDGPUInstructionSyntax.rst b/docs/AMDGPUInstructionSyntax.rst new file mode 100644 index 0000000000000000000000000000000000000000..90ad54aade839479e026d730ef85805a8736c30c --- /dev/null +++ b/docs/AMDGPUInstructionSyntax.rst @@ -0,0 +1,170 @@ +========================= +AMDGPU Instruction Syntax +========================= + +.. contents:: + :local: + +.. _amdgpu_syn_instructions: + +Instructions +============ + +Syntax +~~~~~~ + +An instruction has the following syntax: + + ``<``\ *opcode mnemonic*\ ``> <``\ *operand0*\ ``>, <``\ *operand1*\ ``>,... <``\ *modifier0*\ ``> <``\ *modifier1*\ ``>...`` + +:doc:`Operands` are normally comma-separated while +:doc:`modifiers` are space-separated. + +The order of *operands* and *modifiers* is fixed. +Most *modifiers* are optional and may be omitted. + +.. _amdgpu_syn_instruction_mnemo: + +Opcode Mnemonic +~~~~~~~~~~~~~~~ + +Opcode mnemonic describes opcode semantics and may include one or more suffices in this order: + +* :ref:`Destination operand type suffix`. +* :ref:`Source operand type suffix`. +* :ref:`Encoding suffix`. + +.. _amdgpu_syn_instruction_type: + +Type and Size Suffices +~~~~~~~~~~~~~~~~~~~~~~ + +Instructions which operate with data have an implied type of *data* operands. +This data type is specified as a suffix of instruction mnemonic. + +There are instructions which have 2 type suffices: +the first is the data type of the destination operand, +the second is the data type of source *data* operand(s). + +Note that data type specified by an instruction does not apply +to other kinds of operands such as *addresses*, *offsets* and so on. + +The following table enumerates the most frequently used type suffices. + + ============================================ ======================= ================= + Type Suffices Packed instruction? Data Type + ============================================ ======================= ================= + _b512, _b256, _b128, _b64, _b32, _b16, _b8 No Bits. + _u64, _u32, _u16, _u8 No Unsigned integer. + _i64, _i32, _i16, _i8 No Signed integer. + _f64, _f32, _f16 No Floating-point. + _b16, _u16, _i16, _f16 Yes Packed. + ============================================ ======================= ================= + +Instructions which have no type suffices are assumed to operate with typeless data. +The size of data is specified by size suffices: + + ================= =================== ===================================== + Size Suffix Implied data type Required register size in dwords + ================= =================== ===================================== + \- b32 1 + x2 b64 2 + x3 b96 3 + x4 b128 4 + x8 b256 8 + x16 b512 16 + x b32 1 + xy b64 2 + xyz b96 3 + xyzw b128 4 + d16_x b16 1 + d16_xy b16x2 2 for GFX8.0, 1 for GFX8.1 and GFX9 + d16_xyz b16x3 3 for GFX8.0, 2 for GFX8.1 and GFX9 + d16_xyzw b16x4 4 for GFX8.0, 2 for GFX8.1 and GFX9 + ================= =================== ===================================== + +.. WARNING:: + There are exceptions from rules described above. + Operands which have type different from type specified by the opcode are + :ref:`tagged` in the description. + +Examples of instructions with different types of source and destination operands: + +.. parsed-literal:: + + s_bcnt0_i32_b64 + v_cvt_f32_u32 + +Examples of instructions with one data type: + +.. parsed-literal:: + + v_max3_f32 + v_max3_i16 + +Examples of instructions which operate with packed data: + +.. parsed-literal:: + + v_pk_add_u16 + v_pk_add_i16 + v_pk_add_f16 + +Examples of typeless instructions which operate on b128 data: + +.. parsed-literal:: + + buffer_store_dwordx4 + flat_load_dwordx4 + +.. _amdgpu_syn_instruction_enc: + +Encoding Suffices +~~~~~~~~~~~~~~~~~ + +Most *VOP1*, *VOP2* and *VOPC* instructions have several variants: +they may also be encoded in *VOP3*, *DPP* and *SDWA* formats. + +The assembler will automatically use optimal encoding based on instruction operands. +To force specific encoding, one can add a suffix to the opcode of the instruction: + + =================================================== ================= + Encoding Encoding Suffix + =================================================== ================= + Native 32-bit encoding (*VOP1*, *VOP2* or *VOPC*) _e32 + *VOP3* (64-bit) encoding _e64 + *DPP* encoding _dpp + *SDWA* encoding _sdwa + =================================================== ================= + +These suffices are used in this reference to indicate the assumed encoding. +When no suffix is specified, a native encoding is implied. + +Operands +======== + +Syntax +~~~~~~ + +Syntax of most operands is described :doc:`in this document`. + +For detailed information about operands follow *operand links* in GPU-specific documents: + +* :doc:`GFX7` +* :doc:`GFX8` +* :doc:`GFX9` + +Modifiers +========= + +Syntax +~~~~~~ + +Syntax of modifiers is described :doc:`in this document`. + +Information about modifiers supported for individual instructions may be found in GPU-specific documents: + +* :doc:`GFX7` +* :doc:`GFX8` +* :doc:`GFX9` + diff --git a/docs/AMDGPUModifierSyntax.rst b/docs/AMDGPUModifierSyntax.rst new file mode 100644 index 0000000000000000000000000000000000000000..e2b8bb3f952a7c6d05c97db09c142db0cfe67ad7 --- /dev/null +++ b/docs/AMDGPUModifierSyntax.rst @@ -0,0 +1,1248 @@ +====================================== +Syntax of AMDGPU Instruction Modifiers +====================================== + +.. contents:: + :local: + +Conventions +=========== + +The following notation is used throughout this document: + + =================== ============================================================= + Notation Description + =================== ============================================================= + {0..N} Any integer value in the range from 0 to N (inclusive). + Syntax and meaning of *x* is explained elsewhere. + =================== ============================================================= + +.. _amdgpu_syn_modifiers: + +Modifiers +========= + +DS Modifiers +------------ + +.. _amdgpu_synid_ds_offset8: + +ds_offset8 +~~~~~~~~~~ + +Specifies an immediate unsigned 8-bit offset, in bytes. The default value is 0. + +Used with DS instructions which have 2 addresses. + + =================== ===================================================== + Syntax Description + =================== ===================================================== + offset:{0..0xFF} Specifies an unsigned 8-bit offset as a positive + :ref:`integer number `. + =================== ===================================================== + +Examples: + +.. parsed-literal:: + + offset:255 + offset:0xff + +.. _amdgpu_synid_ds_offset16: + +ds_offset16 +~~~~~~~~~~~ + +Specifies an immediate unsigned 16-bit offset, in bytes. The default value is 0. + +Used with DS instructions which have 1 address. + + ==================== ====================================================== + Syntax Description + ==================== ====================================================== + offset:{0..0xFFFF} Specifies an unsigned 16-bit offset as a positive + :ref:`integer number `. + ==================== ====================================================== + +Examples: + +.. parsed-literal:: + + offset:65535 + offset:0xffff + +.. _amdgpu_synid_sw_offset16: + +sw_offset16 +~~~~~~~~~~~ + +This is a special modifier which may be used with *ds_swizzle_b32* instruction only. +It specifies a swizzle pattern in numeric or symbolic form. The default value is 0. + +See AMD documentation for more information. + + ======================================================= =========================================================== + Syntax Description + ======================================================= =========================================================== + offset:{0..0xFFFF} Specifies a 16-bit swizzle pattern. + offset:swizzle(QUAD_PERM,{0..3},{0..3},{0..3},{0..3}) Specifies a quad permute mode pattern + + Each number is a lane *id*. + offset:swizzle(BITMASK_PERM, "") Specifies a bitmask permute mode pattern. + + The pattern converts a 5-bit lane *id* to another + lane *id* with which the lane interacts. + + *mask* is a 5 character sequence which + specifies how to transform the bits of the + lane *id*. + + The following characters are allowed: + + * "0" - set bit to 0. + + * "1" - set bit to 1. + + * "p" - preserve bit. + + * "i" - inverse bit. + + offset:swizzle(BROADCAST,{2..32},{0..N}) Specifies a broadcast mode. + + Broadcasts the value of any particular lane to + all lanes in its group. + + The first numeric parameter is a group + size and must be equal to 2, 4, 8, 16 or 32. + + The second numeric parameter is an index of the + lane being broadcasted. + + The index must not exceed group size. + offset:swizzle(SWAP,{1..16}) Specifies a swap mode. + + Swaps the neighboring groups of + 1, 2, 4, 8 or 16 lanes. + offset:swizzle(REVERSE,{2..32}) Specifies a reverse mode. + + Reverses the lanes for groups of 2, 4, 8, 16 or 32 lanes. + ======================================================= =========================================================== + +Numeric parameters may be specified as either :ref:`integer numbers` or +:ref:`absolute expressions`. + +Examples: + +.. parsed-literal:: + + offset:255 + offset:0xffff + offset:swizzle(QUAD_PERM, 0, 1, 2 ,3) + offset:swizzle(BITMASK_PERM, "01pi0") + offset:swizzle(BROADCAST, 2, 0) + offset:swizzle(SWAP, 8) + offset:swizzle(REVERSE, 30 + 2) + +.. _amdgpu_synid_gds: + +gds +~~~ + +Specifies whether to use GDS or LDS memory (LDS is the default). + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + gds Use GDS memory. + ======================================== ================================================ + + +EXP Modifiers +------------- + +.. _amdgpu_synid_done: + +done +~~~~ + +Specifies if this is the last export from the shader to the target. By default, current +instruction does not finish an export sequence. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + done Indicates the last export operation. + ======================================== ================================================ + +.. _amdgpu_synid_compr: + +compr +~~~~~ + +Indicates if the data are compressed (data are not compressed by default). + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + compr Data are compressed. + ======================================== ================================================ + +.. _amdgpu_synid_vm: + +vm +~~ + +Specifies valid mask flag state (off by default). + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + vm Set valid mask flag. + ======================================== ================================================ + +FLAT Modifiers +-------------- + +.. _amdgpu_synid_flat_offset12: + +flat_offset12 +~~~~~~~~~~~~~ + +Specifies an immediate unsigned 12-bit offset, in bytes. The default value is 0. + +Cannot be used with *global/scratch* opcodes. GFX9 only. + + ================= ====================================================== + Syntax Description + ================= ====================================================== + offset:{0..4095} Specifies a 12-bit unsigned offset as a positive + :ref:`integer number `. + ================= ====================================================== + +Examples: + +.. parsed-literal:: + + offset:4095 + offset:0xff + +.. _amdgpu_synid_flat_offset13: + +flat_offset13 +~~~~~~~~~~~~~ + +Specifies an immediate signed 13-bit offset, in bytes. The default value is 0. + +Can be used with *global/scratch* opcodes only. GFX9 only. + + ============================ ======================================================= + Syntax Description + ============================ ======================================================= + offset:{-4096..+4095} Specifies a 13-bit signed offset as an + :ref:`integer number `. + ============================ ======================================================= + +Examples: + +.. parsed-literal:: + + offset:-4000 + offset:0x10 + +glc +~~~ + +See a description :ref:`here`. + +slc +~~~ + +See a description :ref:`here`. + +tfe +~~~ + +See a description :ref:`here`. + +nv +~~ + +See a description :ref:`here`. + +MIMG Modifiers +-------------- + +.. _amdgpu_synid_dmask: + +dmask +~~~~~ + +Specifies which channels (image components) are used by the operation. By default, no channels +are used. + + =============== ===================================================== + Syntax Description + =============== ===================================================== + dmask:{0..15} Specifies image channels as a positive + :ref:`integer number `. + + Each bit corresponds to one of 4 image + components (RGBA). + + If the specified bit value + is 0, the component is not used, value 1 means + that the component is used. + =============== ===================================================== + +This modifier has some limitations depending on instruction kind: + + =================================================== ======================== + Instruction Kind Valid dmask Values + =================================================== ======================== + 32-bit atomic *cmpswap* 0x3 + 32-bit atomic instructions except for *cmpswap* 0x1 + 64-bit atomic *cmpswap* 0xF + 64-bit atomic instructions except for *cmpswap* 0x3 + *gather4* 0x1, 0x2, 0x4, 0x8 + Other instructions any value + =================================================== ======================== + +Examples: + +.. parsed-literal:: + + dmask:0xf + dmask:0b1111 + dmask:3 + +.. _amdgpu_synid_unorm: + +unorm +~~~~~ + +Specifies whether the address is normalized or not (the address is normalized by default). + + ======================== ======================================== + Syntax Description + ======================== ======================================== + unorm Force the address to be unnormalized. + ======================== ======================================== + +glc +~~~ + +See a description :ref:`here`. + +slc +~~~ + +See a description :ref:`here`. + +.. _amdgpu_synid_r128: + +r128 +~~~~ + +Specifies texture resource size. The default size is 256 bits. + +GFX7 and GFX8 only. + + =================== ================================================ + Syntax Description + =================== ================================================ + r128 Specifies 128 bits texture resource size. + =================== ================================================ + +.. WARNING:: Using this modifier should descrease *rsrc* register size from 8 to 4 dwords, but assembler does not currently support this feature. + +tfe +~~~ + +See a description :ref:`here`. + +.. _amdgpu_synid_lwe: + +lwe +~~~ + +Specifies LOD warning status (LOD warning is disabled by default). + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + lwe Enables LOD warning. + ======================================== ================================================ + +.. _amdgpu_synid_da: + +da +~~ + +Specifies if an array index must be sent to TA. By default, array index is not sent. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + da Send an array-index to TA. + ======================================== ================================================ + +.. _amdgpu_synid_d16: + +d16 +~~~ + +Specifies data size: 16 or 32 bits (32 bits by default). Not supported by GFX7. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + d16 Enables 16-bits data mode. + + On loads, convert data in memory to 16-bit + format before storing it in VGPRs. + + For stores, convert 16-bit data in VGPRs to + 32 bits before going to memory. + + Note that GFX8.0 does not support data packing. + Each 16-bit data element occupies 1 VGPR. + + GFX8.1 and GFX9 support data packing. + Each pair of 16-bit data elements + occupies 1 VGPR. + ======================================== ================================================ + +.. _amdgpu_synid_a16: + +a16 +~~~ + +Specifies size of image address components: 16 or 32 bits (32 bits by default). GFX9 only. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + a16 Enables 16-bits image address components. + ======================================== ================================================ + +Miscellaneous Modifiers +----------------------- + +.. _amdgpu_synid_glc: + +glc +~~~ + +This modifier has different meaning for loads, stores, and atomic operations. +The default value is off (0). + +See AMD documentation for details. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + glc Set glc bit to 1. + ======================================== ================================================ + +.. _amdgpu_synid_slc: + +slc +~~~ + +Specifies cache policy. The default value is off (0). + +See AMD documentation for details. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + slc Set slc bit to 1. + ======================================== ================================================ + +.. _amdgpu_synid_tfe: + +tfe +~~~ + +Controls access to partially resident textures. The default value is off (0). + +See AMD documentation for details. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + tfe Set tfe bit to 1. + ======================================== ================================================ + +.. _amdgpu_synid_nv: + +nv +~~ + +Specifies if instruction is operating on non-volatile memory. By default, memory is volatile. + +GFX9 only. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + nv Indicates that instruction operates on + non-volatile memory. + ======================================== ================================================ + +MUBUF/MTBUF Modifiers +--------------------- + +.. _amdgpu_synid_idxen: + +idxen +~~~~~ + +Specifies whether address components include an index. By default, no components are used. + +Can be used together with :ref:`offen`. + +Cannot be used with :ref:`addr64`. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + idxen Address components include an index. + ======================================== ================================================ + +.. _amdgpu_synid_offen: + +offen +~~~~~ + +Specifies whether address components include an offset. By default, no components are used. + +Can be used together with :ref:`idxen`. + +Cannot be used with :ref:`addr64`. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + offen Address components include an offset. + ======================================== ================================================ + +.. _amdgpu_synid_addr64: + +addr64 +~~~~~~ + +Specifies whether a 64-bit address is used. By default, no address is used. + +GFX7 only. Cannot be used with :ref:`offen` and +:ref:`idxen` modifiers. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + addr64 A 64-bit address is used. + ======================================== ================================================ + +.. _amdgpu_synid_buf_offset12: + +buf_offset12 +~~~~~~~~~~~~ + +Specifies an immediate unsigned 12-bit offset, in bytes. The default value is 0. + + =============================== ====================================================== + Syntax Description + =============================== ====================================================== + offset:{0..0xFFF} Specifies a 12-bit unsigned offset as a positive + :ref:`integer number `. + =============================== ====================================================== + +Examples: + +.. parsed-literal:: + + offset:0 + offset:0x10 + +glc +~~~ + +See a description :ref:`here`. + +slc +~~~ + +See a description :ref:`here`. + +.. _amdgpu_synid_lds: + +lds +~~~ + +Specifies where to store the result: VGPRs or LDS (VGPRs by default). + + ======================================== =========================== + Syntax Description + ======================================== =========================== + lds Store result in LDS. + ======================================== =========================== + +tfe +~~~ + +See a description :ref:`here`. + +.. _amdgpu_synid_dfmt: + +dfmt +~~~~ + +TBD + +.. _amdgpu_synid_nfmt: + +nfmt +~~~~ + +TBD + +SMRD/SMEM Modifiers +------------------- + +glc +~~~ + +See a description :ref:`here`. + +nv +~~ + +See a description :ref:`here`. + +VINTRP Modifiers +---------------- + +.. _amdgpu_synid_high: + +high +~~~~ + +Specifies which half of the LDS word to use. Low half of LDS word is used by default. +GFX9 only. + + ======================================== ================================ + Syntax Description + ======================================== ================================ + high Use high half of LDS word. + ======================================== ================================ + +VOP1/VOP2 DPP Modifiers +----------------------- + +GFX8 and GFX9 only. + +.. _amdgpu_synid_dpp_ctrl: + +dpp_ctrl +~~~~~~~~ + +Specifies how data are shared between threads. This is a mandatory modifier. +There is no default value. + +Note. The lanes of a wavefront are organized in four banks and four rows. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + quad_perm:[{0..3},{0..3},{0..3},{0..3}] Full permute of 4 threads. + row_mirror Mirror threads within row. + row_half_mirror Mirror threads within 1/2 row (8 threads). + row_bcast:15 Broadcast 15th thread of each row to next row. + row_bcast:31 Broadcast thread 31 to rows 2 and 3. + wave_shl:1 Wavefront left shift by 1 thread. + wave_rol:1 Wavefront left rotate by 1 thread. + wave_shr:1 Wavefront right shift by 1 thread. + wave_ror:1 Wavefront right rotate by 1 thread. + row_shl:{1..15} Row shift left by 1-15 threads. + row_shr:{1..15} Row shift right by 1-15 threads. + row_ror:{1..15} Row rotate right by 1-15 threads. + ======================================== ================================================ + +Note: Numeric parameters may be specified as either +:ref:`integer numbers` or +:ref:`absolute expressions`. + +Examples: + +.. parsed-literal:: + + quad_perm:[0, 1, 2, 3] + row_shl:3 + +.. _amdgpu_synid_row_mask: + +row_mask +~~~~~~~~ + +Controls which rows are enabled for data sharing. By default, all rows are enabled. + +Note. The lanes of a wavefront are organized in four banks and four rows. + + ======================================== ===================================================== + Syntax Description + ======================================== ===================================================== + row_mask:{0..15} Specifies a *row mask* as a positive + :ref:`integer number `. + + Each of 4 bits in the mask controls one + row (0 - disabled, 1 - enabled). + ======================================== ===================================================== + +Examples: + +.. parsed-literal:: + + row_mask:0xf + row_mask:0b1010 + row_mask:0b1111 + +.. _amdgpu_synid_bank_mask: + +bank_mask +~~~~~~~~~ + +Controls which banks are enabled for data sharing. By default, all banks are enabled. + +Note. The lanes of a wavefront are organized in four banks and four rows. + + ======================================== ======================================================= + Syntax Description + ======================================== ======================================================= + bank_mask:{0..15} Specifies a *bank mask* as a positive + :ref:`integer number `. + + Each of 4 bits in the mask controls one + bank (0 - disabled, 1 - enabled). + ======================================== ======================================================= + +Examples: + +.. parsed-literal:: + + bank_mask:0x3 + bank_mask:0b0011 + bank_mask:0b1111 + +.. _amdgpu_synid_bound_ctrl: + +bound_ctrl +~~~~~~~~~~ + +Controls data sharing when accessing an invalid lane. By default, data sharing with +invalid lanes is disabled. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + bound_ctrl:0 Enables data sharing with invalid lanes. + + Accessing data from an invalid lane will + return zero. + ======================================== ================================================ + +VOP1/VOP2/VOPC SDWA Modifiers +----------------------------- + +GFX8 and GFX9 only. + +clamp +~~~~~ + +See a description :ref:`here`. + +omod +~~~~ + +See a description :ref:`here`. + +GFX9 only. + +.. _amdgpu_synid_dst_sel: + +dst_sel +~~~~~~~ + +Selects which bits in the destination are affected. By default, all bits are affected. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + dst_sel:DWORD Use bits 31:0. + dst_sel:BYTE_0 Use bits 7:0. + dst_sel:BYTE_1 Use bits 15:8. + dst_sel:BYTE_2 Use bits 23:16. + dst_sel:BYTE_3 Use bits 31:24. + dst_sel:WORD_0 Use bits 15:0. + dst_sel:WORD_1 Use bits 31:16. + ======================================== ================================================ + + +.. _amdgpu_synid_dst_unused: + +dst_unused +~~~~~~~~~~ + +Controls what to do with the bits in the destination which are not selected +by :ref:`dst_sel`. +By default, unused bits are preserved. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + dst_unused:UNUSED_PAD Pad with zeros. + dst_unused:UNUSED_SEXT Sign-extend upper bits, zero lower bits. + dst_unused:UNUSED_PRESERVE Preserve bits. + ======================================== ================================================ + +.. _amdgpu_synid_src0_sel: + +src0_sel +~~~~~~~~ + +Controls which bits in the src0 are used. By default, all bits are used. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + src0_sel:DWORD Use bits 31:0. + src0_sel:BYTE_0 Use bits 7:0. + src0_sel:BYTE_1 Use bits 15:8. + src0_sel:BYTE_2 Use bits 23:16. + src0_sel:BYTE_3 Use bits 31:24. + src0_sel:WORD_0 Use bits 15:0. + src0_sel:WORD_1 Use bits 31:16. + ======================================== ================================================ + +.. _amdgpu_synid_src1_sel: + +src1_sel +~~~~~~~~ + +Controls which bits in the src1 are used. By default, all bits are used. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + src1_sel:DWORD Use bits 31:0. + src1_sel:BYTE_0 Use bits 7:0. + src1_sel:BYTE_1 Use bits 15:8. + src1_sel:BYTE_2 Use bits 23:16. + src1_sel:BYTE_3 Use bits 31:24. + src1_sel:WORD_0 Use bits 15:0. + src1_sel:WORD_1 Use bits 31:16. + ======================================== ================================================ + +.. _amdgpu_synid_sdwa_operand_modifiers: + +VOP1/VOP2/VOPC SDWA Operand Modifiers +------------------------------------- + +Operand modifiers are not used separately. They are applied to source operands. + +GFX8 and GFX9 only. + +abs +~~~ + +See a description :ref:`here`. + +neg +~~~ + +See a description :ref:`here`. + +.. _amdgpu_synid_sext: + +sext +~~~~ + +Sign-extends value of a (sub-dword) operand to fill all 32 bits. +Has no effect for 32-bit operands. + +Valid for integer operands only. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + sext() Sign-extend operand value. + ======================================== ================================================ + +Examples: + +.. parsed-literal:: + + sext(v4) + sext(v255) + +VOP3 Modifiers +-------------- + +.. _amdgpu_synid_vop3_op_sel: + +vop3_op_sel +~~~~~~~~~~~ + +Selects the low [15:0] or high [31:16] operand bits for source and destination operands. +By default, low bits are used for all operands. + +The number of values specified with the op_sel modifier must match the number of instruction +operands (both source and destination). First value controls src0, second value controls src1 +and so on, except that the last value controls destination. +The value 0 selects the low bits, while 1 selects the high bits. + +Note. op_sel modifier affects 16-bit operands only. For 32-bit operands the value specified +by op_sel must be 0. + +GFX9 only. + + ======================================== ============================================================ + Syntax Description + ======================================== ============================================================ + op_sel:[{0..1},{0..1}] Select operand bits for instructions with 1 source operand. + op_sel:[{0..1},{0..1},{0..1}] Select operand bits for instructions with 2 source operands. + op_sel:[{0..1},{0..1},{0..1},{0..1}] Select operand bits for instructions with 3 source operands. + ======================================== ============================================================ + +Examples: + +.. parsed-literal:: + + op_sel:[0,0] + op_sel:[0,1] + +.. _amdgpu_synid_clamp: + +clamp +~~~~~ + +Clamp meaning depends on instruction. + +For *v_cmp* instructions, clamp modifier indicates that the compare signals +if a floating point exception occurs. By default, signaling is disabled. +Not supported by GFX7. + +For integer operations, clamp modifier indicates that the result must be clamped +to the largest and smallest representable value. By default, there is no clamping. +Integer clamping is not supported by GFX7. + +For floating point operations, clamp modifier indicates that the result must be clamped +to the range [0.0, 1.0]. By default, there is no clamping. + +Note. Clamp modifier is applied after :ref:`output modifiers` (if any). + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + clamp Enables clamping (or signaling). + ======================================== ================================================ + +.. _amdgpu_synid_omod: + +omod +~~~~ + +Specifies if an output modifier must be applied to the result. +By default, no output modifiers are applied. + +Note. Output modifiers are applied before :ref:`clamping` (if any). + +Output modifiers are valid for f32 and f64 floating point results only. +They must not be used with f16. + +Note. *v_cvt_f16_f32* is an exception. This instruction produces f16 result +but accepts output modifiers. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + mul:2 Multiply the result by 2. + mul:4 Multiply the result by 4. + div:2 Multiply the result by 0.5. + ======================================== ================================================ + +.. _amdgpu_synid_vop3_operand_modifiers: + +VOP3 Operand Modifiers +---------------------- + +Operand modifiers are not used separately. They are applied to source operands. + +.. _amdgpu_synid_abs: + +abs +~~~ + +Computes absolute value of its operand. Applied before :ref:`neg` (if any). +Valid for floating point operands only. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + abs() Get absolute value of operand. + \|| The same as above. + ======================================== ================================================ + +Examples: + +.. parsed-literal:: + + abs(v36) + \|v36| + +.. _amdgpu_synid_neg: + +neg +~~~ + +Computes negative value of its operand. Applied after :ref:`abs` (if any). +Valid for floating point operands only. + + ======================================== ================================================ + Syntax Description + ======================================== ================================================ + neg() Get negative value of operand. + - The same as above. + ======================================== ================================================ + +Examples: + +.. parsed-literal:: + + neg(v[0]) + -v4 + +VOP3P Modifiers +--------------- + +This section describes modifiers of *regular* VOP3P instructions. + +*v_mad_mix_f32*, *v_mad_mixhi_f16* and *v_mad_mixlo_f16* +instructions use these modifiers :ref:`in a special manner`. + +GFX9 only. + +.. _amdgpu_synid_op_sel: + +op_sel +~~~~~~ + +Selects the low [15:0] or high [31:16] operand bits as input to the operation +which results in the lower-half of the destination. +By default, low bits are used for all operands. + +The number of values specified by the *op_sel* modifier must match the number of source +operands. First value controls src0, second value controls src1 and so on. + +The value 0 selects the low bits, while 1 selects the high bits. + + ================================= ============================================================= + Syntax Description + ================================= ============================================================= + op_sel:[{0..1}] Select operand bits for instructions with 1 source operand. + op_sel:[{0..1},{0..1}] Select operand bits for instructions with 2 source operands. + op_sel:[{0..1},{0..1},{0..1}] Select operand bits for instructions with 3 source operands. + ================================= ============================================================= + +Examples: + +.. parsed-literal:: + + op_sel:[0,0] + op_sel:[0,1,0] + +.. _amdgpu_synid_op_sel_hi: + +op_sel_hi +~~~~~~~~~ + +Selects the low [15:0] or high [31:16] operand bits as input to the operation +which results in the upper-half of the destination. +By default, high bits are used for all operands. + +The number of values specified by the *op_sel_hi* modifier must match the number of source +operands. First value controls src0, second value controls src1 and so on. + +The value 0 selects the low bits, while 1 selects the high bits. + + =================================== ============================================================= + Syntax Description + =================================== ============================================================= + op_sel_hi:[{0..1}] Select operand bits for instructions with 1 source operand. + op_sel_hi:[{0..1},{0..1}] Select operand bits for instructions with 2 source operands. + op_sel_hi:[{0..1},{0..1},{0..1}] Select operand bits for instructions with 3 source operands. + =================================== ============================================================= + +Examples: + +.. parsed-literal:: + + op_sel_hi:[0,0] + op_sel_hi:[0,0,1] + +.. _amdgpu_synid_neg_lo: + +neg_lo +~~~~~~ + +Specifies whether to change sign of operand values selected by +:ref:`op_sel`. These values are then used +as input to the operation which results in the upper-half of the destination. + +The number of values specified by this modifier must match the number of source +operands. First value controls src0, second value controls src1 and so on. + +The value 0 indicates that the corresponding operand value is used unmodified, +the value 1 indicates that negative value of the operand must be used. + +By default, operand values are used unmodified. + +This modifier is valid for floating point operands only. + + ================================ ================================================================== + Syntax Description + ================================ ================================================================== + neg_lo:[{0..1}] Select affected operands for instructions with 1 source operand. + neg_lo:[{0..1},{0..1}] Select affected operands for instructions with 2 source operands. + neg_lo:[{0..1},{0..1},{0..1}] Select affected operands for instructions with 3 source operands. + ================================ ================================================================== + +Examples: + +.. parsed-literal:: + + neg_lo:[0] + neg_lo:[0,1] + +.. _amdgpu_synid_neg_hi: + +neg_hi +~~~~~~ + +Specifies whether to change sign of operand values selected by +:ref:`op_sel_hi`. These values are then used +as input to the operation which results in the upper-half of the destination. + +The number of values specified by this modifier must match the number of source +operands. First value controls src0, second value controls src1 and so on. + +The value 0 indicates that the corresponding operand value is used unmodified, +the value 1 indicates that negative value of the operand must be used. + +By default, operand values are used unmodified. + +This modifier is valid for floating point operands only. + + =============================== ================================================================== + Syntax Description + =============================== ================================================================== + neg_hi:[{0..1}] Select affected operands for instructions with 1 source operand. + neg_hi:[{0..1},{0..1}] Select affected operands for instructions with 2 source operands. + neg_hi:[{0..1},{0..1},{0..1}] Select affected operands for instructions with 3 source operands. + =============================== ================================================================== + +Examples: + +.. parsed-literal:: + + neg_hi:[1,0] + neg_hi:[0,1,1] + +clamp +~~~~~ + +See a description :ref:`here`. + +.. _amdgpu_synid_mad_mix: + +VOP3P V_MAD_MIX Modifiers +------------------------- + +*v_mad_mix_f32*, *v_mad_mixhi_f16* and *v_mad_mixlo_f16* instructions +use *op_sel* and *op_sel_hi* modifiers +in a manner different from *regular* VOP3P instructions. + +See a description below. + +GFX9 only. + +.. _amdgpu_synid_mad_mix_op_sel: + +mad_mix_op_sel +~~~~~~~~~~~~~~ + +This operand has meaning only for 16-bit source operands as indicated by +:ref:`mad_mix_op_sel_hi`. +It specifies to select either the low [15:0] or high [31:16] operand bits +as input to the operation. + +The number of values specified by the *op_sel* modifier must match the number of source +operands. First value controls src0, second value controls src1 and so on. + +The value 0 indicates the low bits, the value 1 indicates the high 16 bits. + +By default, low bits are used for all operands. + + =============================== ================================================ + Syntax Description + =============================== ================================================ + op_sel:[{0..1},{0..1},{0..1}] Select location of each 16-bit source operand. + =============================== ================================================ + +Examples: + +.. parsed-literal:: + + op_sel:[0,1] + +.. _amdgpu_synid_mad_mix_op_sel_hi: + +mad_mix_op_sel_hi +~~~~~~~~~~~~~~~~~ + +Selects the size of source operands: either 32 bits or 16 bits. +By default, 32 bits are used for all source operands. + +The number of values specified by the *op_sel_hi* modifier must match the number of source +operands. First value controls src0, second value controls src1 and so on. + +The value 0 indicates 32 bits, the value 1 indicates 16 bits. + +The location of 16 bits in the operand may be specified by +:ref:`mad_mix_op_sel`. + + ======================================== ==================================== + Syntax Description + ======================================== ==================================== + op_sel_hi:[{0..1},{0..1},{0..1}] Select size of each source operand. + ======================================== ==================================== + +Examples: + +.. parsed-literal:: + + op_sel_hi:[1,1,1] + +abs +~~~ + +See a description :ref:`here`. + +neg +~~~ + +See a description :ref:`here`. + +clamp +~~~~~ + +See a description :ref:`here`. diff --git a/docs/AMDGPUOperandSyntax.rst b/docs/AMDGPUOperandSyntax.rst index 4f3536eed40d05b6475e7ba146fc8b4458b5c2ec..5f5d822526530f04cc34265caf50ae63d0589e2b 100644 --- a/docs/AMDGPUOperandSyntax.rst +++ b/docs/AMDGPUOperandSyntax.rst @@ -1,6 +1,6 @@ -================================================= -Syntax of AMDGPU Assembler Operands and Modifiers -================================================= +===================================== +Syntax of AMDGPU Instruction Operands +===================================== .. contents:: :local: @@ -8,1048 +8,1056 @@ Syntax of AMDGPU Assembler Operands and Modifiers Conventions =========== -The following conventions are used in syntax description: +The following notation is used throughout this document: - =================== ============================================================= + =================== ============================================================================= Notation Description - =================== ============================================================= + =================== ============================================================================= {0..N} Any integer value in the range from 0 to N (inclusive). - Unless stated otherwise, this value may be specified as - either a literal or an llvm expression. - Syntax and meaning of ** is explained elsewhere. - =================== ============================================================= + Syntax and meaning of *x* is explained elsewhere. + =================== ============================================================================= .. _amdgpu_syn_operands: Operands ======== -TBD +.. _amdgpu_synid_v: -.. _amdgpu_syn_modifiers: +v +- -Modifiers -========= +Vector registers. There are 256 32-bit vector registers. -DS Modifiers ------------- - -.. _amdgpu_synid_ds_offset8: - -ds_offset8 -~~~~~~~~~~ - -Specifies an immediate unsigned 8-bit offset, in bytes. The default value is 0. - -Used with DS instructions which have 2 addresses. - - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - offset:{0..0xFF} Specifies a 8-bit offset. - ======================================== ================================================ - -.. _amdgpu_synid_ds_offset16: - -ds_offset16 -~~~~~~~~~~~ - -Specifies an immediate unsigned 16-bit offset, in bytes. The default value is 0. - -Used with DS instructions which have 1 address. - - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - offset:{0..0xFFFF} Specifies a 16-bit offset. - ======================================== ================================================ - -.. _amdgpu_synid_sw_offset16: - -sw_offset16 -~~~~~~~~~~~ - -This is a special modifier which may be used with *ds_swizzle_b32* instruction only. -Specifies a sizzle pattern in numeric or symbolic form. The default value is 0. - -See AMD documentation for more information. - - ======================================================= =================================================== - Syntax Description - ======================================================= =================================================== - offset:{0..0xFFFF} Specifies a 16-bit swizzle pattern - in a numeric form. - offset:swizzle(QUAD_PERM,{0..3},{0..3},{0..3},{0..3}) Specifies a quad permute mode pattern; each - number is a lane id. - offset:swizzle(BITMASK_PERM, "") Specifies a bitmask permute mode pattern - which converts a 5-bit lane id to another - lane id with which the lane interacts. - - is a 5 character sequence which - specifies how to transform the bits of the - lane id. The following characters are allowed: - - * "0" - set bit to 0. - - * "1" - set bit to 1. +A sequence of *vector* registers may be used to operate with more than 32 bits of data. - * "p" - preserve bit. +Assembler currently supports sequences of 1, 2, 3, 4, 8 and 16 *vector* registers. - * "i" - inverse bit. + =================================================== ==================================================================== + Syntax Description + =================================================== ==================================================================== + **v**\ A single 32-bit *vector* register. - offset:swizzle(BROADCAST,{2..32},{0..N}) Specifies a broadcast mode. - Broadcasts the value of any particular lane to - all lanes in its group. + *N* must be a decimal integer number. + **v[**\ \ **]** A single 32-bit *vector* register. - The first numeric parameter is a group - size and must be equal to 2, 4, 8, 16 or 32. + *N* may be specified as an + :ref:`integer number` + or an :ref:`absolute expression`. + **v[**\ :\ **]** A sequence of (\ *K-N+1*\ ) *vector* registers. - The second numeric parameter is an index of the - lane being broadcasted. The index must not exceed - group size. - offset:swizzle(SWAP,{1..16}) Specifies a swap mode. - Swaps the neighboring groups of - 1, 2, 4, 8 or 16 lanes. - offset:swizzle(REVERSE,{2..32}) Specifies a reverse mode. Reverses - the lanes for groups of 2, 4, 8, 16 or 32 lanes. - ======================================================= =================================================== + *N* and *K* may be specified as + :ref:`integer numbers` + or :ref:`absolute expressions`. + **[v**\ , \ **v**\ , ... **v**\ \ **]** A sequence of (\ *K-N+1*\ ) *vector* registers. -.. _amdgpu_synid_gds: + Register indices must be specified as decimal integer numbers. + =================================================== ==================================================================== -gds -~~~ +Note. *N* and *K* must satisfy the following conditions: -Specifies whether to use GDS or LDS memory (LDS is the default). +* *N* <= *K*. +* 0 <= *N* <= 255. +* 0 <= *K* <= 255. +* *K-N+1* must be equal to 1, 2, 3, 4, 8 or 16. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - gds Use GDS memory. - ======================================== ================================================ +Examples: +.. parsed-literal:: -EXP Modifiers -------------- + v255 + v[0] + v[0:1] + v[1:1] + v[0:3] + v[2*2] + v[1-1:2-1] + [v252] + [v252,v253,v254,v255] -.. _amdgpu_synid_done: +.. _amdgpu_synid_s: -done -~~~~ +s +- -Specifies if this is the last export from the shader to the target. By default, current -instruction does not finish an export sequence. +Scalar 32-bit registers. The number of available *scalar* registers depends on GPU: - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - done Indicates the last export operation. - ======================================== ================================================ + ======= ============================ + GPU Number of *scalar* registers + ======= ============================ + GFX7 104 + GFX8 102 + GFX9 102 + ======= ============================ -.. _amdgpu_synid_compr: +A sequence of *scalar* registers may be used to operate with more than 32 bits of data. +Assembler currently supports sequences of 1, 2, 4, 8 and 16 *scalar* registers. -compr -~~~~~ +Pairs of *scalar* registers must be even-aligned (the first register must be even). +Sequences of 4 and more *scalar* registers must be quad-aligned. -Indicates if the data are compressed (not compressed by default). + ======================================================== ==================================================================== + Syntax Description + ======================================================== ==================================================================== + **s**\ A single 32-bit *scalar* register. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - compr Data are compressed. - ======================================== ================================================ + *N* must be a decimal integer number. + **s[**\ \ **]** A single 32-bit *scalar* register. -.. _amdgpu_synid_vm: + *N* may be specified as an + :ref:`integer number` + or an :ref:`absolute expression`. + **s[**\ :\ **]** A sequence of (\ *K-N+1*\ ) *scalar* registers. -vm -~~ + *N* and *K* may be specified as + :ref:`integer numbers` + or :ref:`absolute expressions`. + **[s**\ , \ **s**\ , ... **s**\ \ **]** A sequence of (\ *K-N+1*\ ) *scalar* registers. -Specifies valid mask flag state (off by default). + Register indices must be specified as decimal integer numbers. + ======================================================== ==================================================================== - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - vm Set valid mask flag. - ======================================== ================================================ +Note. *N* and *K* must satisfy the following conditions: -FLAT Modifiers --------------- +* *N* must be properly aligned based on sequence size. +* *N* <= *K*. +* 0 <= *N* < *SMAX*\ , where *SMAX* is the number of available *scalar* registers. +* 0 <= *K* < *SMAX*\ , where *SMAX* is the number of available *scalar* registers. +* *K-N+1* must be equal to 1, 2, 4, 8 or 16. -.. _amdgpu_synid_flat_offset12: +Examples: -flat_offset12 -~~~~~~~~~~~~~ +.. parsed-literal:: -Specifies an immediate unsigned 12-bit offset, in bytes. The default value is 0. + s0 + s[0] + s[0:1] + s[1:1] + s[0:3] + s[2*2] + s[1-1:2-1] + [s4] + [s4,s5,s6,s7] -Cannot be used with *global/scratch* opcodes. GFX9 only. +Examples of *scalar* registers with an invalid alignment: - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - offset:{0..4095} Specifies a 12-bit unsigned offset. - ======================================== ================================================ +.. parsed-literal:: -.. _amdgpu_synid_flat_offset13: + s[1:2] + s[2:5] -flat_offset13 -~~~~~~~~~~~~~ +.. _amdgpu_synid_trap: -Specifies an immediate signed 13-bit offset, in bytes. The default value is 0. +trap +---- -Can be used with *global/scratch* opcodes only. GFX9 only. +A set of trap handler registers: - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - offset:{-4096..+4095} Specifies a 13-bit signed offset. - ======================================== ================================================ +* :ref:`ttmp` +* :ref:`tba` +* :ref:`tma` -glc -~~~ +.. _amdgpu_synid_ttmp: -See a description :ref:`here`. +ttmp +---- -slc -~~~ +Trap handler temporary scalar registers, 32-bits wide. +The number of available *ttmp* registers depends on GPU: -See a description :ref:`here`. + ======= =========================== + GPU Number of *ttmp* registers + ======= =========================== + GFX7 12 + GFX8 12 + GFX9 16 + ======= =========================== -tfe -~~~ +A sequence of *ttmp* registers may be used to operate with more than 32 bits of data. +Assembler currently supports sequences of 1, 2, 4, 8 and 16 *ttmp* registers. -See a description :ref:`here`. +Pairs of *ttmp* registers must be even-aligned (the first register must be even). +Sequences of 4 and more *ttmp* registers must be quad-aligned. -nv -~~ + ============================================================= ==================================================================== + Syntax Description + ============================================================= ==================================================================== + **ttmp**\ A single 32-bit *ttmp* register. -See a description :ref:`here`. + *N* must be a decimal integer number. + **ttmp[**\ \ **]** A single 32-bit *ttmp* register. -MIMG Modifiers --------------- + *N* may be specified as an + :ref:`integer number` + or an :ref:`absolute expression`. + **ttmp[**\ :\ **]** A sequence of (\ *K-N+1*\ ) *ttmp* registers. -.. _amdgpu_synid_dmask: + *N* and *K* may be specified as + :ref:`integer numbers` + or :ref:`absolute expressions`. + **[ttmp**\ , \ **ttmp**\ , ... **ttmp**\ \ **]** A sequence of (\ *K-N+1*\ ) *ttmp* registers. -dmask -~~~~~ + Register indices must be specified as decimal integer numbers. + ============================================================= ==================================================================== -Specifies which channels (image components) are used by the operation. By default, no channels -are used. +Note. *N* and *K* must satisfy the following conditions: - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - dmask:{0..15} Each bit corresponds to one of 4 image - components (RGBA). If the specified bit value - is 0, the component is not used, value 1 means - that the component is used. - ======================================== ================================================ +* *N* must be properly aligned based on sequence size. +* *N* <= *K*. +* 0 <= *N* < *TMAX*, where *TMAX* is the number of available *ttmp* registers. +* 0 <= *K* < *TMAX*, where *TMAX* is the number of available *ttmp* registers. +* *K-N+1* must be equal to 1, 2, 4, 8 or 16. -This modifier has some limitations depending on instruction kind: +Examples: - ======================================== ================================================ - Instruction Kind Valid dmask Values - ======================================== ================================================ - 32-bit atomic cmpswap 0x3 - other 32-bit atomic instructions 0x1 - 64-bit atomic cmpswap 0xF - other 64-bit atomic instructions 0x3 - GATHER4 0x1, 0x2, 0x4, 0x8 - Other instructions any value - ======================================== ================================================ +.. parsed-literal:: -.. _amdgpu_synid_unorm: + ttmp0 + ttmp[0] + ttmp[0:1] + ttmp[1:1] + ttmp[0:3] + ttmp[2*2] + ttmp[1-1:2-1] + [ttmp4] + [ttmp4,ttmp5,ttmp6,ttmp7] -unorm -~~~~~ +Examples of *ttmp* registers with an invalid alignment: -Specifies whether address is normalized or not (normalized by default). +.. parsed-literal:: - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - unorm Force address to be un-normalized. - ======================================== ================================================ + ttmp[1:2] + ttmp[2:5] -glc -~~~ +.. _amdgpu_synid_tba: -See a description :ref:`here`. +tba +--- -slc -~~~ +Trap base address, 64-bits wide. Holds the pointer to the current trap handler program. -See a description :ref:`here`. + ================== ======================================================================= ============= + Syntax Description Availability + ================== ======================================================================= ============= + tba 64-bit *trap base address* register. GFX7, GFX8 + [tba] 64-bit *trap base address* register (an alternative syntax). GFX7, GFX8 + [tba_lo,tba_hi] 64-bit *trap base address* register (an alternative syntax). GFX7, GFX8 + ================== ======================================================================= ============= -.. _amdgpu_synid_r128: +High and low 32 bits of *trap base address* may be accessed as separate registers: -r128 -~~~~ + ================== ======================================================================= ============= + Syntax Description Availability + ================== ======================================================================= ============= + tba_lo Low 32 bits of *trap base address* register. GFX7, GFX8 + tba_hi High 32 bits of *trap base address* register. GFX7, GFX8 + [tba_lo] Low 32 bits of *trap base address* register (an alternative syntax). GFX7, GFX8 + [tba_hi] High 32 bits of *trap base address* register (an alternative syntax). GFX7, GFX8 + ================== ======================================================================= ============= -Specifies texture resource size. The default size is 256 bits. +Note that *tba*, *tba_lo* and *tba_hi* are not accessible as assembler registers in GFX9, +but *tba* is readable/writable with the help of *s_get_reg* and *s_set_reg* instructions. -GFX7 and GFX8 only. +.. _amdgpu_synid_tma: - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - r128 Specifies 128 bits texture resource size. - ======================================== ================================================ +tma +--- -tfe -~~~ +Trap memory address, 64-bits wide. -See a description :ref:`here`. + ================= ======================================================================= ================== + Syntax Description Availability + ================= ======================================================================= ================== + tma 64-bit *trap memory address* register. GFX7, GFX8 + [tma] 64-bit *trap memory address* register (an alternative syntax). GFX7, GFX8 + [tma_lo,tma_hi] 64-bit *trap memory address* register (an alternative syntax). GFX7, GFX8 + ================= ======================================================================= ================== -.. _amdgpu_synid_lwe: +High and low 32 bits of *trap memory address* may be accessed as separate registers: -lwe -~~~ + ================= ======================================================================= ================== + Syntax Description Availability + ================= ======================================================================= ================== + tma_lo Low 32 bits of *trap memory address* register. GFX7, GFX8 + tma_hi High 32 bits of *trap memory address* register. GFX7, GFX8 + [tma_lo] Low 32 bits of *trap memory address* register (an alternative syntax). GFX7, GFX8 + [tma_hi] High 32 bits of *trap memory address* register (an alternative syntax). GFX7, GFX8 + ================= ======================================================================= ================== -Specifies LOD warning status (LOD warning is disabled by default). +Note that *tma*, *tma_lo* and *tma_hi* are not accessible as assembler registers in GFX9, +but *tma* is readable/writable with the help of *s_get_reg* and *s_set_reg* instructions. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - lwe Enables LOD warning. - ======================================== ================================================ +.. _amdgpu_synid_flat_scratch: -.. _amdgpu_synid_da: - -da -~~ - -Specifies if an array index must be sent to TA. By default, array index is not sent. - - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - da Send an array-index to TA. - ======================================== ================================================ - -.. _amdgpu_synid_d16: +flat_scratch +------------ -d16 -~~~ +Flat scratch address, 64-bits wide. Holds the base address of scratch memory. -Specifies data size: 16 or 32 bits (32 bits by default). Not supported by GFX7. + ================================== ================================================================ + Syntax Description + ================================== ================================================================ + flat_scratch 64-bit *flat scratch* address register. + [flat_scratch] 64-bit *flat scratch* address register (an alternative syntax). + [flat_scratch_lo,flat_scratch_hi] 64-bit *flat scratch* address register (an alternative syntax). + ================================== ================================================================ + +High and low 32 bits of *flat scratch* address may be accessed as separate registers: + + ========================= ========================================================================= + Syntax Description + ========================= ========================================================================= + flat_scratch_lo Low 32 bits of *flat scratch* address register. + flat_scratch_hi High 32 bits of *flat scratch* address register. + [flat_scratch_lo] Low 32 bits of *flat scratch* address register (an alternative syntax). + [flat_scratch_hi] High 32 bits of *flat scratch* address register (an alternative syntax). + ========================= ========================================================================= + +.. _amdgpu_synid_xnack: + +xnack +----- + +Xnack mask, 64-bits wide. Holds a 64-bit mask of which threads +received an *XNACK* due to a vector memory operation. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - d16 Enables 16-bits data mode. +.. WARNING:: GFX7 does not support *xnack* feature. Not all GFX8 and GFX9 :ref:`processors` support *xnack* feature. - On loads, convert data in memory to 16-bit - format before storing it in VGPRs. +\ - For stores, convert 16-bit data in VGPRs to - 32 bits before going to memory. + ============================== ===================================================== + Syntax Description + ============================== ===================================================== + xnack_mask 64-bit *xnack mask* register. + [xnack_mask] 64-bit *xnack mask* register (an alternative syntax). + [xnack_mask_lo,xnack_mask_hi] 64-bit *xnack mask* register (an alternative syntax). + ============================== ===================================================== - Note that 16-bit data are stored in VGPRs - unpacked in GFX8.0. In GFX8.1 and GFX9 16-bit - data are packed. - ======================================== ================================================ +High and low 32 bits of *xnack mask* may be accessed as separate registers: -.. _amdgpu_synid_a16: + ===================== ============================================================== + Syntax Description + ===================== ============================================================== + xnack_mask_lo Low 32 bits of *xnack mask* register. + xnack_mask_hi High 32 bits of *xnack mask* register. + [xnack_mask_lo] Low 32 bits of *xnack mask* register (an alternative syntax). + [xnack_mask_hi] High 32 bits of *xnack mask* register (an alternative syntax). + ===================== ============================================================== -a16 -~~~ +.. _amdgpu_synid_vcc: -Specifies size of image address components: 16 or 32 bits (32 bits by default). GFX9 only. +vcc +--- - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - a16 Enables 16-bits image address components. - ======================================== ================================================ +Vector condition code, 64-bits wide. A bit mask with one bit per thread; +it holds the result of a vector compare operation. -Miscellaneous Modifiers ------------------------ + ================ ========================================================================= + Syntax Description + ================ ========================================================================= + vcc 64-bit *vector condition code* register. + [vcc] 64-bit *vector condition code* register (an alternative syntax). + [vcc_lo,vcc_hi] 64-bit *vector condition code* register (an alternative syntax). + ================ ========================================================================= -.. _amdgpu_synid_glc: +High and low 32 bits of *vector condition code* may be accessed as separate registers: -glc -~~~ + ================ ========================================================================= + Syntax Description + ================ ========================================================================= + vcc_lo Low 32 bits of *vector condition code* register. + vcc_hi High 32 bits of *vector condition code* register. + [vcc_lo] Low 32 bits of *vector condition code* register (an alternative syntax). + [vcc_hi] High 32 bits of *vector condition code* register (an alternative syntax). + ================ ========================================================================= -This modifier has different meaning for loads, stores, and atomic operations. -The default value is off (0). +.. _amdgpu_synid_m0: -See AMD documentation for details. +m0 +-- - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - glc Set glc bit to 1. - ======================================== ================================================ +A 32-bit memory register. It has various uses, +including register indexing and bounds checking. -.. _amdgpu_synid_slc: + =========== =================================================== + Syntax Description + =========== =================================================== + m0 A 32-bit *memory* register. + [m0] A 32-bit *memory* register (an alternative syntax). + =========== =================================================== -slc -~~~ +.. _amdgpu_synid_exec: -Specifies cache policy. The default value is off (0). +exec +---- -See AMD documentation for details. +Execute mask, 64-bits wide. A bit mask with one bit per thread, +which is applied to vector instructions and controls which threads execute +and which ignore the instruction. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - slc Set slc bit to 1. - ======================================== ================================================ + ===================== ================================================================= + Syntax Description + ===================== ================================================================= + exec 64-bit *execute mask* register. + [exec] 64-bit *execute mask* register (an alternative syntax). + [exec_lo,exec_hi] 64-bit *execute mask* register (an alternative syntax). + ===================== ================================================================= -.. _amdgpu_synid_tfe: +High and low 32 bits of *execute mask* may be accessed as separate registers: -tfe -~~~ + ===================== ================================================================= + Syntax Description + ===================== ================================================================= + exec_lo Low 32 bits of *execute mask* register. + exec_hi High 32 bits of *execute mask* register. + [exec_lo] Low 32 bits of *execute mask* register (an alternative syntax). + [exec_hi] High 32 bits of *execute mask* register (an alternative syntax). + ===================== ================================================================= -Controls access to partially resident textures. The default value is off (0). +.. _amdgpu_synid_vccz: -See AMD documentation for details. +vccz +---- - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - tfe Set tfe bit to 1. - ======================================== ================================================ +A single bit-flag indicating that the :ref:`vcc` is all zeros. -.. _amdgpu_synid_nv: +.. WARNING:: This operand is not currently supported by AMDGPU assembler. -nv -~~ +.. _amdgpu_synid_execz: -Specifies if instruction is operating on non-volatile memory. By default, memory is volatile. +execz +----- -GFX9 only. +A single bit flag indicating that the :ref:`exec` is all zeros. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - nv Indicates that instruction operates on - non-volatile memory. - ======================================== ================================================ +.. WARNING:: This operand is not currently supported by AMDGPU assembler. -MUBUF/MTBUF Modifiers ---------------------- +.. _amdgpu_synid_scc: -.. _amdgpu_synid_idxen: +scc +--- -idxen -~~~~~ +A single bit flag indicating the result of a scalar compare operation. -Specifies whether address components include an index. By default, no components are used. +.. WARNING:: This operand is not currently supported by AMDGPU assembler. -Can be used together with :ref:`offen`. +lds_direct +---------- -Cannot be used with :ref:`addr64`. +A special operand which supplies a 32-bit value +fetched from *LDS* memory using :ref:`m0` as an address. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - idxen Address components include an index. - ======================================== ================================================ +.. WARNING:: This operand is not currently supported by AMDGPU assembler. -.. _amdgpu_synid_offen: +.. _amdgpu_synid_constant: -offen -~~~~~ +constant +-------- -Specifies whether address components include an offset. By default, no components are used. +A set of integer and floating-point *inline constants*: -Can be used together with :ref:`idxen`. +* :ref:`iconst` +* :ref:`fconst` -Cannot be used with :ref:`addr64`. +These operands are encoded as a part of instruction. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - offen Address components include an offset. - ======================================== ================================================ +If a number may be encoded as either +a :ref:`literal` or +an :ref:`inline constant`, +assembler selects the latter encoding as more efficient. -.. _amdgpu_synid_addr64: +.. _amdgpu_synid_iconst: -addr64 -~~~~~~ +iconst +------ -Specifies whether a 64-bit address is used. By default, no address is used. +An :ref:`integer number` +encoded as an *inline constant*. -GFX7 only. Cannot be used with :ref:`offen` and -:ref:`idxen` modifiers. +Only a small fraction of integer numbers may be encoded as *inline constants*. +They are enumerated in the table below. +Other integer numbers have to be encoded as :ref:`literals`. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - addr64 A 64-bit address is used. - ======================================== ================================================ +Integer *inline constants* are converted to +:ref:`expected operand type` +as described :ref:`here`. -.. _amdgpu_synid_buf_offset12: + ================================== ==================================== + Value Note + ================================== ==================================== + {0..64} Positive integer inline constants. + {-16..-1} Negative integer inline constants. + ================================== ==================================== -buf_offset12 -~~~~~~~~~~~~ +.. WARNING:: GFX7 does not support inline constants for *f16* operands. -Specifies an immediate unsigned 12-bit offset, in bytes. The default value is 0. +There are also symbolic inline constants which provide read-only access to H/W registers. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - offset:{0..0xFFF} Specifies a 12-bit unsigned offset. - ======================================== ================================================ +.. WARNING:: These inline constants are not currently supported by AMDGPU assembler. -glc -~~~ +\ -See a description :ref:`here`. + ======================== ================================================ ============= + Syntax Note Availability + ======================== ================================================ ============= + shared_base Base address of shared memory region. GFX9 + shared_limit Address of the end of shared memory region. GFX9 + private_base Base address of private memory region. GFX9 + private_limit Address of the end of private memory region. GFX9 + pops_exiting_wave_id A dedicated counter for POPS. GFX9 + ======================== ================================================ ============= -slc -~~~ +.. _amdgpu_synid_fconst: -See a description :ref:`here`. +fconst +------ -.. _amdgpu_synid_lds: +A :ref:`floating-point number` +encoded as an *inline constant*. -lds -~~~ +Only a small fraction of floating-point numbers may be encoded as *inline constants*. +They are enumerated in the table below. +Other floating-point numbers have to be encoded as :ref:`literals`. -Specifies where to store the result: VGPRs or LDS (VGPRs by default). +Floating-point *inline constants* are converted to +:ref:`expected operand type` +as described :ref:`here`. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - lds Store result in LDS. - ======================================== ================================================ + ================================== ===================================================== ================== + Value Note Availability + ================================== ===================================================== ================== + 0.0 The same as integer constant 0. All GPUs + 0.5 Floating-point constant 0.5 All GPUs + 1.0 Floating-point constant 1.0 All GPUs + 2.0 Floating-point constant 2.0 All GPUs + 4.0 Floating-point constant 4.0 All GPUs + -0.5 Floating-point constant -0.5 All GPUs + -1.0 Floating-point constant -1.0 All GPUs + -2.0 Floating-point constant -2.0 All GPUs + -4.0 Floating-point constant -4.0 All GPUs + 0.1592 1.0/(2.0*pi). Use only for 16-bit operands. GFX8, GFX9 + 0.15915494 1.0/(2.0*pi). Use only for 16- and 32-bit operands. GFX8, GFX9 + 0.159154943091895317852646485335 1.0/(2.0*pi). GFX8, GFX9 + ================================== ===================================================== ================== -tfe -~~~ +.. WARNING:: GFX7 does not support inline constants for *f16* operands. -See a description :ref:`here`. +.. _amdgpu_synid_literal: -.. _amdgpu_synid_dfmt: +literal +------- -dfmt -~~~~ +A literal is a 64-bit value which is encoded as a separate 32-bit dword in the instruction stream. -TBD +If a number may be encoded as either +a :ref:`literal` or +an :ref:`inline constant`, +assembler selects the latter encoding as more efficient. -.. _amdgpu_synid_nfmt: +Literals may be specified as :ref:`integer numbers`, +:ref:`floating-point numbers` or +:ref:`expressions` +(expressions are currently supported for 32-bit operands only). -nfmt -~~~~ +A 64-bit literal value is converted by assembler +to an :ref:`expected operand type` +as described :ref:`here`. -TBD +An instruction may use only one literal but several operands may refer the same literal. -SMRD/SMEM Modifiers -------------------- +.. _amdgpu_synid_uimm8: -glc -~~~ +uimm8 +----- -See a description :ref:`here`. +A 8-bit positive :ref:`integer number`. +The value is encoded as part of the opcode so it is free to use. -nv -~~ +.. _amdgpu_synid_uimm32: -See a description :ref:`here`. +uimm32 +------ -VINTRP Modifiers ----------------- +A 32-bit positive :ref:`integer number`. +The value is stored as a separate 32-bit dword in the instruction stream. -.. _amdgpu_synid_high: +.. _amdgpu_synid_uimm20: -high -~~~~ +uimm20 +------ -Specifies which half of the LDS word to use. Low half of LDS word is used by default. -GFX9 only. +A 20-bit positive :ref:`integer number`. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - high Use high half of LDS word. - ======================================== ================================================ +.. _amdgpu_synid_uimm21: -VOP1/VOP2 DPP Modifiers ------------------------ +uimm21 +------ -GFX8 and GFX9 only. +A 21-bit positive :ref:`integer number`. -.. _amdgpu_synid_dpp_ctrl: +.. WARNING:: Assembler currently supports 20-bit offsets only. Use :ref:`uimm20` as a replacement. -dpp_ctrl -~~~~~~~~ +.. _amdgpu_synid_simm21: -Specifies how data are shared between threads. This is a mandatory modifier. -There is no default value. +simm21 +------ -Note. The lanes of a wavefront are organized in four banks and four rows. +A 21-bit :ref:`integer number`. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - quad_perm:[{0..3},{0..3},{0..3},{0..3}] Full permute of 4 threads. - row_mirror Mirror threads within row. - row_half_mirror Mirror threads within 1/2 row (8 threads). - row_bcast:15 Broadcast 15th thread of each row to next row. - row_bcast:31 Broadcast thread 31 to rows 2 and 3. - wave_shl:1 Wavefront left shift by 1 thread. - wave_rol:1 Wavefront left rotate by 1 thread. - wave_shr:1 Wavefront right shift by 1 thread. - wave_ror:1 Wavefront right rotate by 1 thread. - row_shl:{1..15} Row shift left by 1-15 threads. - row_shr:{1..15} Row shift right by 1-15 threads. - row_ror:{1..15} Row rotate right by 1-15 threads. - ======================================== ================================================ +.. WARNING:: Assembler currently supports 20-bit unsigned offsets only .Use :ref:`uimm20` as a replacement. -.. _amdgpu_synid_row_mask: +.. _amdgpu_synid_off: -row_mask -~~~~~~~~ +off +--- -Controls which rows are enabled for data sharing. By default, all rows are enabled. +A special entity which indicates that the value of this operand is not used. -Note. The lanes of a wavefront are organized in four banks and four rows. + ================================== =================================================== + Syntax Description + ================================== =================================================== + off Indicates an unused operand. + ================================== =================================================== - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - row_mask:{0..15} Each of 4 bits in the mask controls one - row (0 - disabled, 1 - enabled). - ======================================== ================================================ -.. _amdgpu_synid_bank_mask: +.. _amdgpu_synid_number: -bank_mask -~~~~~~~~~ +Numbers +======= -Controls which banks are enabled for data sharing. By default, all banks are enabled. +.. _amdgpu_synid_integer_number: -Note. The lanes of a wavefront are organized in four banks and four rows. +Integer Numbers +--------------- - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - bank_mask:{0..15} Each of 4 bits in the mask controls one - bank (0 - disabled, 1 - enabled). - ======================================== ================================================ +Integer numbers are 64 bits wide. +They may be specified in binary, octal, hexadecimal and decimal formats: -.. _amdgpu_synid_bound_ctrl: + ============== ==================================== + Format Syntax + ============== ==================================== + Decimal [-]?[1-9][0-9]* + Binary [-]?0b[01]+ + Octal [-]?0[0-7]+ + Hexadecimal [-]?0x[0-9a-fA-F]+ + \ [-]?[0x]?[0-9][0-9a-fA-F]*[hH] + ============== ==================================== -bound_ctrl -~~~~~~~~~~ +Examples: -Controls data sharing when accessing an invalid lane. By default, data sharing with -invalid lanes is disabled. +.. parsed-literal:: - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - bound_ctrl:0 Enables data sharing with invalid lanes. - Accessing data from an invalid lane will - return zero. - ======================================== ================================================ + -1234 + 0b1010 + 010 + 0xff + 0ffh -VOP1/VOP2/VOPC SDWA Modifiers ------------------------------ +.. _amdgpu_synid_floating-point_number: -GFX8 and GFX9 only. +Floating-Point Numbers +---------------------- -clamp -~~~~~ +All floating-point numbers are handled as double (64 bits wide). -See a description :ref:`here`. +Floating-point numbers may be specified in hexadecimal and decimal formats: -omod -~~~~ + ============== ======================================================== ======================================================== + Format Syntax Note + ============== ======================================================== ======================================================== + Decimal [-]?[0-9]*[.][0-9]*([eE][+-]?[0-9]*)? Must include either a decimal separator or an exponent. + Hexadecimal [-]0x[0-9a-fA-F]*(.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ + ============== ======================================================== ======================================================== -See a description :ref:`here`. +Examples: -GFX9 only. +.. parsed-literal:: -.. _amdgpu_synid_dst_sel: + -1.234 + 234e2 + -0x1afp-10 + 0x.1afp10 -dst_sel -~~~~~~~ +.. _amdgpu_synid_expression: -Selects which bits in the destination are affected. By default, all bits are affected. +Expressions +=========== - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - dst_sel:DWORD Use bits 31:0. - dst_sel:BYTE_0 Use bits 7:0. - dst_sel:BYTE_1 Use bits 15:8. - dst_sel:BYTE_2 Use bits 23:16. - dst_sel:BYTE_3 Use bits 31:24. - dst_sel:WORD_0 Use bits 15:0. - dst_sel:WORD_1 Use bits 31:16. - ======================================== ================================================ +An expression specifies an address or a numeric value. +There are two kinds of expressions: +* :ref:`Absolute`. +* :ref:`Relocatable`. -.. _amdgpu_synid_dst_unused: +.. _amdgpu_synid_absolute_expression: -dst_unused -~~~~~~~~~~ +Absolute Expressions +-------------------- -Controls what to do with the bits in the destination which are not selected -by :ref:`dst_sel`. -By default, unused bits are preserved. +The value of an absolute expression remains the same after program relocation. +Absolute expressions must not include unassigned and relocatable values +such as labels. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - dst_unused:UNUSED_PAD Pad with zeros. - dst_unused:UNUSED_SEXT Sign-extend upper bits, zero lower bits. - dst_unused:UNUSED_PRESERVE Preserve bits. - ======================================== ================================================ +Examples: -.. _amdgpu_synid_src0_sel: +.. parsed-literal:: -src0_sel -~~~~~~~~ + x = -1 + y = x + 10 -Controls which bits in the src0 are used. By default, all bits are used. +.. _amdgpu_synid_relocatable_expression: - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - src0_sel:DWORD Use bits 31:0. - src0_sel:BYTE_0 Use bits 7:0. - src0_sel:BYTE_1 Use bits 15:8. - src0_sel:BYTE_2 Use bits 23:16. - src0_sel:BYTE_3 Use bits 31:24. - src0_sel:WORD_0 Use bits 15:0. - src0_sel:WORD_1 Use bits 31:16. - ======================================== ================================================ +Relocatable Expressions +----------------------- -.. _amdgpu_synid_src1_sel: +The value of a relocatable expression depends on program relocation. -src1_sel -~~~~~~~~ +Note that use of relocatable expressions is limited with branch targets +and 32-bit :ref:`literals`. -Controls which bits in the src1 are used. By default, all bits are used. +Addition information about relocation may be found :ref:`here`. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - src1_sel:DWORD Use bits 31:0. - src1_sel:BYTE_0 Use bits 7:0. - src1_sel:BYTE_1 Use bits 15:8. - src1_sel:BYTE_2 Use bits 23:16. - src1_sel:BYTE_3 Use bits 31:24. - src1_sel:WORD_0 Use bits 15:0. - src1_sel:WORD_1 Use bits 31:16. - ======================================== ================================================ +Examples: -VOP1/VOP2/VOPC SDWA Operand Modifiers -------------------------------------- +.. parsed-literal:: -Operand modifiers are not used separately. They are applied to source operands. + y = x + 10 // x is not yet defined. Undefined symbols are assumed to be PC-relative. + z = . -GFX8 and GFX9 only. +Expression Data Type +-------------------- -abs -~~~ +Expressions and operands of expressions are interpreted as 64-bit integers. -See a description :ref:`here`. +Expressions may include 64-bit :ref:`floating-point numbers` (double). +However these operands are also handled as 64-bit integers +using binary representation of specified floating-point numbers. +No conversion from floating-point to integer is performed. -neg -~~~ +Examples: -See a description :ref:`here`. +.. parsed-literal:: -.. _amdgpu_synid_sext: + x = 0.1 // x is assigned an integer 4591870180066957722 which is a binary representation of 0.1. + y = x + x // y is a sum of two integer values; it is not equal to 0.2! -sext -~~~~ +Syntax +------ -Sign-extends value of a (sub-dword) operand to fill all 32 bits. -Has no effect for 32-bit operands. +Expressions are composed of +:ref:`symbols`, +:ref:`integer numbers`, +:ref:`floating-point numbers`, +:ref:`binary operators`, +:ref:`unary operators` and subexpressions. -Valid for integer operands only. +Expressions may also use "." which is a reference to the current PC (program counter). - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - sext() Sign-extend operand value. - ======================================== ================================================ +The syntax of expressions is shown below:: -VOP3 Modifiers --------------- + expr ::= expr binop expr | primaryexpr ; -.. _amdgpu_synid_vop3_op_sel: + primaryexpr ::= '(' expr ')' | symbol | number | '.' | unop primaryexpr ; -vop3_op_sel -~~~~~~~~~~~ + binop ::= '&&' + | '||' + | '|' + | '^' + | '&' + | '!' + | '==' + | '!=' + | '<>' + | '<' + | '<=' + | '>' + | '>=' + | '<<' + | '>>' + | '+' + | '-' + | '*' + | '/' + | '%' ; -Selects the low [15:0] or high [31:16] operand bits for source and destination operands. -By default, low bits are used for all operands. + unop ::= '~' + | '+' + | '-' + | '!' ; -The number of values specified with the op_sel modifier must match the number of instruction -operands (both source and destination). First value controls src0, second value controls src1 -and so on, except that the last value controls destination. -The value 0 selects the low bits, while 1 selects the high bits. +.. _amdgpu_synid_expression_bin_op: -Note. op_sel modifier affects 16-bit operands only. For 32-bit operands the value specified -by op_sel must be 0. +Binary Operators +---------------- -GFX9 only. +Binary operators are described in the following table. +They operate on and produce 64-bit integers. +Operators with higher priority are performed first. + + ========== ========= =============================================== + Operator Priority Meaning + ========== ========= =============================================== + \* 5 Integer multiplication. + / 5 Integer division. + % 5 Integer signed remainder. + \+ 4 Integer addition. + \- 4 Integer subtraction. + << 3 Integer shift left. + >> 3 Logical shift right. + == 2 Equality comparison. + != 2 Inequality comparison. + <> 2 Inequality comparison. + < 2 Signed less than comparison. + <= 2 Signed less than or equal comparison. + > 2 Signed greater than comparison. + >= 2 Signed greater than or equal comparison. + \| 1 Bitwise or. + ^ 1 Bitwise xor. + & 1 Bitwise and. + && 0 Logical and. + || 0 Logical or. + ========== ========= =============================================== + +.. _amdgpu_synid_expression_un_op: + +Unary Operators +--------------- - ======================================== ============================================================ - Syntax Description - ======================================== ============================================================ - op_sel:[{0..1},{0..1}] Select operand bits for instructions with 1 source operand. - op_sel:[{0..1},{0..1},{0..1}] Select operand bits for instructions with 2 source operands. - op_sel:[{0..1},{0..1},{0..1},{0..1}] Select operand bits for instructions with 3 source operands. - ======================================== ============================================================ +Unary operators are described in the following table. +They operate on and produce 64-bit integers. -.. _amdgpu_synid_clamp: + ========== =============================================== + Operator Meaning + ========== =============================================== + ! Logical negation. + ~ Bitwise negation. + \+ Integer unary plus. + \- Integer unary minus. + ========== =============================================== -clamp -~~~~~ +.. _amdgpu_synid_symbol: -Clamp meaning depends on instruction. +Symbols +------- -For *v_cmp* instructions, clamp modifier indicates that the compare signals -if a floating point exception occurs. By default, signaling is disabled. -Not supported by GFX7. +A symbol is a named 64-bit value, representing a relocatable +address or an absolute (non-relocatable) number. -For integer operations, clamp modifier indicates that the result must be clamped -to the largest and smallest representable value. By default, there is no clamping. -Integer clamping is not supported by GFX7. +Symbol names have the following syntax: + ``[a-zA-Z_.][a-zA-Z0-9_$.@]*`` -For floating point operations, clamp modifier indicates that the result must be clamped -to the range [0.0, 1.0]. By default, there is no clamping. +The table below provides several examples of syntax used for symbol definition. -Note. Clamp modifier is applied after :ref:`output modifiers` (if any). + ================ ========================================================== + Syntax Meaning + ================ ========================================================== + .globl Declares a global symbol S without assigning it a value. + .set , Assigns the value of an expression E to a symbol S. + = Assigns the value of an expression E to a symbol S. + : Declares a label S and assigns it the current PC value. + ================ ========================================================== - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - clamp Enables clamping (or signaling). - ======================================== ================================================ +A symbol may be used before it is declared or assigned; +unassigned symbols are assumed to be PC-relative. -.. _amdgpu_synid_omod: +Addition information about symbols may be found :ref:`here`. -omod -~~~~ +.. _amdgpu_synid_conv: -Specifies if an output modifier must be applied to the result. -By default, no output modifiers are applied. +Conversions +=========== -Note. Output modifiers are applied before :ref:`clamping` (if any). +This section describes what happens when a 64-bit +:ref:`integer number`, a +:ref:`floating-point numbers` or a +:ref:`symbol` +is used for an operand which has a different type or size. -Output modifiers are valid for f32 and f64 floating point results only. -They must not be used with f16. +Depending on operand kind, this conversion is performed by either assembler or AMDGPU H/W: -Note. *v_cvt_f16_f32* is an exception. This instruction produces f16 result -but accepts output modifiers. +* Values encoded as :ref:`inline constants` are handled by H/W. +* Values encoded as :ref:`literals` are converted by assembler. - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - mul:2 Multiply the result by 2. - mul:4 Multiply the result by 4. - div:2 Multiply the result by 0.5. - ======================================== ================================================ +.. _amdgpu_synid_const_conv: -VOP3 Operand Modifiers ----------------------- +Inline Constants +---------------- -Operand modifiers are not used separately. They are applied to source operands. +.. _amdgpu_synid_int_const_conv: -.. _amdgpu_synid_abs: +Integer Inline Constants +~~~~~~~~~~~~~~~~~~~~~~~~ -abs -~~~ +Integer :ref:`inline constants` +may be thought of as 64-bit +:ref:`integer numbers`; +when used as operands they are truncated to the size of +:ref:`expected operand type`. +No data type conversions are performed. -Computes absolute value of its operand. Applied before :ref:`neg` (if any). -Valid for floating point operands only. +Examples: - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - abs() Get absolute value of operand. - \|| The same as above. - ======================================== ================================================ +.. parsed-literal:: -.. _amdgpu_synid_neg: + // GFX9 -neg -~~~ + v_add_u16 v0, -1, 0 // v0 = 0xFFFF + v_add_f16 v0, -1, 0 // v0 = 0xFFFF (NaN) -Computes negative value of its operand. Applied after :ref:`abs` (if any). -Valid for floating point operands only. + v_add_u32 v0, -1, 0 // v0 = 0xFFFFFFFF + v_add_f32 v0, -1, 0 // v0 = 0xFFFFFFFF (NaN) - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - neg() Get negative value of operand. - - The same as above. - ======================================== ================================================ +.. _amdgpu_synid_fp_const_conv: -VOP3P Modifiers ---------------- +Floating-Point Inline Constants +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This section describes modifiers of regular VOP3P instructions. -*v_mad_mix* modifiers are described :ref:`in a separate section`. +Floating-point :ref:`inline constants` +may be thought of as 64-bit +:ref:`floating-point numbers`; +when used as operands they are converted to a floating-point number of +:ref:`expected operand size`. -GFX9 only. +Examples: -.. _amdgpu_synid_op_sel: +.. parsed-literal:: -op_sel -~~~~~~ + // GFX9 -Selects the low [15:0] or high [31:16] operand bits as input to the operation -which results in the lower-half of the destination. -By default, low bits are used for all operands. + v_add_f16 v0, 1.0, 0 // v0 = 0x3C00 (1.0) + v_add_u16 v0, 1.0, 0 // v0 = 0x3C00 -The number of values specified with the op_sel modifier must match the number of source -operands. First value controls src0, second value controls src1 and so on. -The value 0 selects the low bits, while 1 selects the high bits. + v_add_f32 v0, 1.0, 0 // v0 = 0x3F800000 (1.0) + v_add_u32 v0, 1.0, 0 // v0 = 0x3F800000 - ======================================== ============================================================= - Syntax Description - ======================================== ============================================================= - op_sel:[{0..1}] Select operand bits for instructions with 1 source operand. - op_sel:[{0..1},{0..1}] Select operand bits for instructions with 2 source operands. - op_sel:[{0..1},{0..1},{0..1}] Select operand bits for instructions with 3 source operands. - ======================================== ============================================================= -.. _amdgpu_synid_op_sel_hi: +.. _amdgpu_synid_lit_conv: -op_sel_hi -~~~~~~~~~ +Literals +-------- -Selects the low [15:0] or high [31:16] operand bits as input to the operation -which results in the upper-half of the destination. -By default, high bits are used for all operands. +.. _amdgpu_synid_int_lit_conv: -The number of values specified with the op_sel_hi modifier must match the number of source -operands. First value controls src0, second value controls src1 and so on. -The value 0 selects the low bits, while 1 selects the high bits. +Integer Literals +~~~~~~~~~~~~~~~~ - ======================================== ============================================================= - Syntax Description - ======================================== ============================================================= - op_sel_hi:[{0..1}] Select operand bits for instructions with 1 source operand. - op_sel_hi:[{0..1},{0..1}] Select operand bits for instructions with 2 source operands. - op_sel_hi:[{0..1},{0..1},{0..1}] Select operand bits for instructions with 3 source operands. - ======================================== ============================================================= +Integer :ref:`literals` +are specified as 64-bit :ref:`integer numbers`. -.. _amdgpu_synid_neg_lo: +When used as operands they are converted to +:ref:`expected operand type` as described below. -neg_lo -~~~~~~ + ============== ============== =============== ==================================================================== + Expected type Condition Result Note + ============== ============== =============== ==================================================================== + i16, u16, b16 cond(num, 16) num.u16 Truncate to 16 bits. + i32, u32, b32 cond(num, 32) num.u32 Truncate to 32 bits. + i64 cond(num, 32) {-1, num.i32} Truncate to 32 bits and then sign-extend the result to 64 bits. + u64, b64 cond(num, 32) { 0, num.u32} Truncate to 32 bits and then zero-extend the result to 64 bits. + f16 cond(num, 16) num.u16 Use low 16 bits as an f16 value. + f32 cond(num, 32) num.u32 Use low 32 bits as an f32 value. + f64 cond(num, 32) {num.u32, 0} Use low 32 bits of the number as high 32 bits + of the result; low 32 bits of the result are zeroed. + ============== ============== =============== ==================================================================== -Specifies whether to change sign of operand values selected by -:ref:`op_sel`. These values are then used -as input to the operation which results in the upper-half of the destination. +The condition *cond(X,S)* indicates if a 64-bit number *X* +can be converted to a smaller size *S* by truncation of upper bits. +There are two cases when the conversion is possible: -The number of values specified with this modifier must match the number of source -operands. First value controls src0, second value controls src1 and so on. +* The truncated bits are all 0. +* The truncated bits are all 1 and the value after truncation has its MSB bit set. -The value 0 indicates that the corresponding operand value is used unmodified, -the value 1 indicates that negative value of the operand must be used. +Examples of valid literals: -By default, operand values are used unmodified. +.. parsed-literal:: -This modifier is valid for floating point operands only. + // GFX9 - ======================================== ================================================================== - Syntax Description - ======================================== ================================================================== - neg_lo:[{0..1}] Select affected operands for instructions with 1 source operand. - neg_lo:[{0..1},{0..1}] Select affected operands for instructions with 2 source operands. - neg_lo:[{0..1},{0..1},{0..1}] Select affected operands for instructions with 3 source operands. - ======================================== ================================================================== + v_add_u16 v0, 0xff00, v0 // value after conversion: 0xff00 + v_add_u16 v0, 0xffffffffffffff00, v0 // value after conversion: 0xff00 + v_add_u16 v0, -256, v0 // value after conversion: 0xff00 -.. _amdgpu_synid_neg_hi: + s_bfe_i64 s[0:1], 0xffefffff, s3 // value after conversion: 0xffffffffffefffff + s_bfe_u64 s[0:1], 0xffefffff, s3 // value after conversion: 0x00000000ffefffff + v_ceil_f64_e32 v[0:1], 0xffefffff // value after conversion: 0xffefffff00000000 (-1.7976922776554302e308) -neg_hi -~~~~~~ +Examples of invalid literals: -Specifies whether to change sign of operand values selected by -:ref:`op_sel_hi`. These values are then used -as input to the operation which results in the upper-half of the destination. +.. parsed-literal:: -The number of values specified with this modifier must match the number of source -operands. First value controls src0, second value controls src1 and so on. + // GFX9 -The value 0 indicates that the corresponding operand value is used unmodified, -the value 1 indicates that negative value of the operand must be used. + v_add_u16 v0, 0x1ff00, v0 // conversion is not possible as truncated bits are not all 0 or 1 + v_add_u16 v0, 0xffffffffffff00ff, v0 // conversion is not possible as truncated bits do not match MSB of the result -By default, operand values are used unmodified. +.. _amdgpu_synid_fp_lit_conv: -This modifier is valid for floating point operands only. +Floating-Point Literals +~~~~~~~~~~~~~~~~~~~~~~~ - ======================================== ================================================================== - Syntax Description - ======================================== ================================================================== - neg_hi:[{0..1}] Select affected operands for instructions with 1 source operand. - neg_hi:[{0..1},{0..1}] Select affected operands for instructions with 2 source operands. - neg_hi:[{0..1},{0..1},{0..1}] Select affected operands for instructions with 3 source operands. - ======================================== ================================================================== +Floating-point :ref:`literals` are specified as 64-bit +:ref:`floating-point numbers`. -clamp -~~~~~ +When used as operands they are converted to +:ref:`expected operand type` as described below. -See a description :ref:`here`. + ============== ============== ================= ================================================================= + Expected type Condition Result Note + ============== ============== ================= ================================================================= + i16, u16, b16 cond(num, 16) f16(num) Convert to f16 and use bits of the result as an integer value. + i32, u32, b32 cond(num, 32) f32(num) Convert to f32 and use bits of the result as an integer value. + i64, u64, b64 false \- Conversion disabled because of an unclear semantics. + f16 cond(num, 16) f16(num) Convert to f16. + f32 cond(num, 32) f32(num) Convert to f32. + f64 true {num.u32.hi, 0} Use high 32 bits of the number as high 32 bits of the result; + zero-fill low 32 bits of the result. -.. _amdgpu_synid_mad_mix: + Note that the result may differ from the original number. + ============== ============== ================= ================================================================= -VOP3P V_MAD_MIX Modifiers -------------------------- +The condition *cond(X,S)* indicates if an f64 number *X* can be converted +to a smaller *S*-bit floating-point type without overflow or underflow. +Precision lost is allowed. -These instructions use VOP3P format but have different modifiers. +Examples of valid literals: -GFX9 only. +.. parsed-literal:: -.. _amdgpu_synid_mad_mix_op_sel: + // GFX9 -mad_mix_op_sel -~~~~~~~~~~~~~~ + v_add_f16 v1, 65500.0, v2 + v_add_f32 v1, 65600.0, v2 -This operand has meaning only for 16-bit source operands as indicated by -:ref:`mad_mix_op_sel_hi`. -It specifies to select either the low [15:0] or high [31:16] operand bits -as input to the operation. + // value before conversion: 0x7fefffffffffffff (1.7976931348623157e308) + v_ceil_f64 v[0:1], 1.7976931348623157e308 // value after conversion: 0x7fefffff00000000 (1.7976922776554302e308) -The value 0 indicates the low bits, the value 1 indicates the high 16 bits. -By default, low bits are used for all operands. +Examples of invalid literals: - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - op_sel:[{0..1},{0..1},{0..1}] Select location of each 16-bit source operand. - ======================================== ================================================ +.. parsed-literal:: -.. _amdgpu_synid_mad_mix_op_sel_hi: + // GFX9 -mad_mix_op_sel_hi -~~~~~~~~~~~~~~~~~ + v_add_f16 v1, 65600.0, v2 // cannot be converted to f16 because of overflow -Selects the size of source operands: either 32 bits or 16 bits. -By default, 32 bits are used for all source operands. +.. _amdgpu_synid_exp_conv: -The value 0 indicates 32 bits, the value 1 indicates 16 bits. -The location of 16 bits in the operand may be specified by -:ref:`mad_mix_op_sel`. +Expressions +~~~~~~~~~~~ - ======================================== ================================================ - Syntax Description - ======================================== ================================================ - op_sel_hi:[{0..1},{0..1},{0..1}] Select size of each source operand. - ======================================== ================================================ +Expressions operate with and result in 64-bit integers. -abs -~~~ +When used as operands they are truncated to +:ref:`expected operand size`. +No data type conversions are performed. -See a description :ref:`here`. +Examples: -neg -~~~ +.. parsed-literal:: -See a description :ref:`here`. + // GFX9 -clamp -~~~~~ + x = 0.1 + v_sqrt_f32 v0, x // v0 = [low 32 bits of 0.1 (double)] + v_sqrt_f32 v0, (0.1 + 0) // the same as above + v_sqrt_f32 v0, 0.1 // v0 = [0.1 (double) converted to float] -See a description :ref:`here`. diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst index 03685f9e352834d826da2f6196fbe0f0b7bc3f27..c60d60af5a82cf1204fb9854816f2762ff8c2ecc 100644 --- a/docs/AMDGPUUsage.rst +++ b/docs/AMDGPUUsage.rst @@ -679,17 +679,18 @@ aligned. In addition, minimal zero byte padding must be generated to ensure the ``desc`` field size is a multiple of 4 bytes. The ``sh_addralign`` field of the ``.note`` section must be at least 4 to indicate at least 8 byte alignment. -The AMDGPU backend code object uses the following ELF note records in the -``.note`` section. The *Description* column specifies the layout of the note -record's ``desc`` field. All fields are consecutive bytes. Note records with -variable size strings have a corresponding ``*_size`` field that specifies the -number of bytes, including the terminating null character, in the string. The -string(s) come immediately after the preceding fields. +.. _amdgpu-note-records-v2: + +Code Object V2 Note Records (-mattr=-code-object-v3) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The AMDGPU backend code object uses the following ELF note record in the +``.note`` section. Additional note records can be present. - .. table:: AMDGPU ELF Note Records - :name: amdgpu-elf-note-records-table + .. table:: AMDGPU Code Object V2 ELF Note Records + :name: amdgpu-elf-note-records-table-v2 ===== ============================== ====================================== Name Type Description @@ -699,8 +700,8 @@ Additional note records can be present. .. - .. table:: AMDGPU ELF Note Record Enumeration Values - :name: amdgpu-elf-note-record-enumeration-values-table + .. table:: AMDGPU Code Object V2 ELF Note Record Enumeration Values + :name: amdgpu-elf-note-record-enumeration-values-table-v2 ============================== ===== Name Value @@ -714,9 +715,47 @@ Additional note records can be present. Specifies extensible metadata associated with the code objects executed on HSA [HSA]_ compatible runtimes such as AMD's ROCm [AMD-ROCm]_. It is required when the target triple OS is ``amdhsa`` (see :ref:`amdgpu-target-triples`). See - :ref:`amdgpu-amdhsa-code-object-metadata` for the syntax of the code + :ref:`amdgpu-amdhsa-code-object-metadata-v2` for the syntax of the code object metadata string. +.. _amdgpu-note-records-v3: + +Code Object V3 Note Records (-mattr=+code-object-v3) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The AMDGPU backend code object uses the following ELF note record in the +``.note`` section. + +Additional note records can be present. + + .. table:: AMDGPU Code Object V3 ELF Note Records + :name: amdgpu-elf-note-records-table-v3 + + ======== ============================== ====================================== + Name Type Description + ======== ============================== ====================================== + "AMDGPU" ``NT_AMDGPU_METADATA`` Metadata in Message Pack [MsgPack]_ + binary format. + ======== ============================== ====================================== + +.. + + .. table:: AMDGPU Code Object V3 ELF Note Record Enumeration Values + :name: amdgpu-elf-note-record-enumeration-values-table-v3 + + ============================== ===== + Name Value + ============================== ===== + *reserved* 0-31 + ``NT_AMDGPU_METADATA`` 32 + ============================== ===== + +``NT_AMDGPU_METADATA`` + Specifies extensible metadata associated with an AMDGPU code + object. It is encoded as a map in the Message Pack [MsgPack]_ binary + data format. See :ref:`amdgpu-amdhsa-code-object-metadata-v3` for the + map keys defined for the ``amdhsa`` OS. + .. _amdgpu-symbols: Symbols @@ -1009,13 +1048,21 @@ Code Object Metadata The code object metadata specifies extensible metadata associated with the code objects executed on HSA [HSA]_ compatible runtimes such as AMD's ROCm -[AMD-ROCm]_. It is specified by the ``NT_AMD_AMDGPU_HSA_METADATA`` note record -(see :ref:`amdgpu-note-records`) and is required when the target triple OS is -``amdhsa`` (see :ref:`amdgpu-target-triples`). It must contain the minimum -information necessary to support the ROCM kernel queries. For example, the -segment sizes needed in a dispatch packet. In addition, a high level language -runtime may require other information to be included. For example, the AMD -OpenCL runtime records kernel argument information. +[AMD-ROCm]_. It is specified in a note record (see :ref:`amdgpu-note-records`) +and is required when the target triple OS is ``amdhsa`` (see +:ref:`amdgpu-target-triples`). It must contain the minimum information +necessary to support the ROCM kernel queries. For example, the segment sizes +needed in a dispatch packet. In addition, a high level language runtime may +require other information to be included. For example, the AMD OpenCL runtime +records kernel argument information. + +.. _amdgpu-amdhsa-code-object-metadata-v2: + +Code Object V2 Metadata (-mattr=-code-object-v3) +++++++++++++++++++++++++++++++++++++++++++++++++ + +Code object V2 metadata is specified by the ``NT_AMD_AMDGPU_METADATA`` note +record (see :ref:`amdgpu-note-records-v2`). The metadata is specified as a YAML formatted string (see [YAML]_ and :doc:`YamlIO`). @@ -1025,7 +1072,7 @@ The metadata is specified as a YAML formatted string (see [YAML]_ and contain null characters, otherwise it should be. The metadata is represented as a single YAML document comprised of the mapping -defined in table :ref:`amdgpu-amdhsa-code-object-metadata-mapping-table` and +defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v2` and referenced tables. For boolean values, the string values of ``false`` and ``true`` are used for @@ -1034,8 +1081,8 @@ false and true respectively. Additional information can be added to the mappings. To avoid conflicts, any non-AMD key names should be prefixed by "*vendor-name*.". - .. table:: AMDHSA Code Object Metadata Mapping - :name: amdgpu-amdhsa-code-object-metadata-mapping-table + .. table:: AMDHSA Code Object V2 Metadata Map + :name: amdgpu-amdhsa-code-object-metadata-map-table-v2 ========== ============== ========= ======================================= String Key Value Type Required? Description @@ -1072,14 +1119,14 @@ non-AMD key names should be prefixed by "*vendor-name*.". printf function call. "Kernels" sequence of Required Sequence of the mappings for each mapping kernel in the code object. See - :ref:`amdgpu-amdhsa-code-object-kernel-metadata-mapping-table` + :ref:`amdgpu-amdhsa-code-object-kernel-metadata-map-table-v2` for the definition of the mapping. ========== ============== ========= ======================================= .. - .. table:: AMDHSA Code Object Kernel Metadata Mapping - :name: amdgpu-amdhsa-code-object-kernel-metadata-mapping-table + .. table:: AMDHSA Code Object V2 Kernel Metadata Map + :name: amdgpu-amdhsa-code-object-kernel-metadata-map-table-v2 ================= ============== ========= ================================ String Key Value Type Required? Description @@ -1101,22 +1148,22 @@ non-AMD key names should be prefixed by "*vendor-name*.". minor version. "Attrs" mapping Mapping of kernel attributes. See - :ref:`amdgpu-amdhsa-code-object-kernel-attribute-metadata-mapping-table` + :ref:`amdgpu-amdhsa-code-object-kernel-attribute-metadata-map-table-v2` for the mapping definition. "Args" sequence of Sequence of mappings of the mapping kernel arguments. See - :ref:`amdgpu-amdhsa-code-object-kernel-argument-metadata-mapping-table` + :ref:`amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v2` for the definition of the mapping. "CodeProps" mapping Mapping of properties related to the kernel code. See - :ref:`amdgpu-amdhsa-code-object-kernel-code-properties-metadata-mapping-table` + :ref:`amdgpu-amdhsa-code-object-kernel-code-properties-metadata-map-table-v2` for the mapping definition. ================= ============== ========= ================================ .. - .. table:: AMDHSA Code Object Kernel Attribute Metadata Mapping - :name: amdgpu-amdhsa-code-object-kernel-attribute-metadata-mapping-table + .. table:: AMDHSA Code Object V2 Kernel Attribute Metadata Map + :name: amdgpu-amdhsa-code-object-kernel-attribute-metadata-map-table-v2 =================== ============== ========= ============================== String Key Value Type Required? Description @@ -1156,8 +1203,8 @@ non-AMD key names should be prefixed by "*vendor-name*.". .. - .. table:: AMDHSA Code Object Kernel Argument Metadata Mapping - :name: amdgpu-amdhsa-code-object-kernel-argument-metadata-mapping-table + .. table:: AMDHSA Code Object V2 Kernel Argument Metadata Map + :name: amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v2 ================= ============== ========= ================================ String Key Value Type Required? Description @@ -1354,8 +1401,8 @@ non-AMD key names should be prefixed by "*vendor-name*.". .. - .. table:: AMDHSA Code Object Kernel Code Properties Metadata Mapping - :name: amdgpu-amdhsa-code-object-kernel-code-properties-metadata-mapping-table + .. table:: AMDHSA Code Object V2 Kernel Code Properties Metadata Map + :name: amdgpu-amdhsa-code-object-kernel-code-properties-metadata-map-table-v2 ============================ ============== ========= ===================== String Key Value Type Required? Description @@ -1433,6 +1480,414 @@ non-AMD key names should be prefixed by "*vendor-name*.". location. ============================ ============== ========= ===================== +.. _amdgpu-amdhsa-code-object-metadata-v3: + +Code Object V3 Metadata (-mattr=+code-object-v3) +++++++++++++++++++++++++++++++++++++++++++++++++ + +Code object V3 metadata is specified by the ``NT_AMDGPU_METADATA`` note record +(see :ref:`amdgpu-note-records-v3`). + +The metadata is represented as Message Pack formatted binary data (see +[MsgPack]_). The top level is a Message Pack map that includes the +keys defined in table +:ref:`amdgpu-amdhsa-code-object-metadata-map-table-v3` and referenced +tables. + +Additional information can be added to the maps. To avoid conflicts, +any key names should be prefixed by "*vendor-name*." where +``vendor-name`` can be the the name of the vendor and specific vendor +tool that generates the information. The prefix is abbreviated to +simply "." when it appears within a map that has been added by the +same *vendor-name*. + + .. table:: AMDHSA Code Object V3 Metadata Map + :name: amdgpu-amdhsa-code-object-metadata-map-table-v3 + + ================= ============== ========= ======================================= + String Key Value Type Required? Description + ================= ============== ========= ======================================= + "amdhsa.version" sequence of Required - The first integer is the major + 2 integers version. Currently 1. + - The second integer is the minor + version. Currently 0. + "amdhsa.printf" sequence of Each string is encoded information + strings about a printf function call. The + encoded information is organized as + fields separated by colon (':'): + + ``ID:N:S[0]:S[1]:...:S[N-1]:FormatString`` + + where: + + ``ID`` + A 32 bit integer as a unique id for + each printf function call + + ``N`` + A 32 bit integer equal to the number + of arguments of printf function call + minus 1 + + ``S[i]`` (where i = 0, 1, ... , N-1) + 32 bit integers for the size in bytes + of the i-th FormatString argument of + the printf function call + + FormatString + The format string passed to the + printf function call. + "amdhsa.kernels" sequence of Required Sequence of the maps for each + map kernel in the code object. See + :ref:`amdgpu-amdhsa-code-object-kernel-metadata-map-table-v3` + for the definition of the keys included + in that map. + ================= ============== ========= ======================================= + +.. + + .. table:: AMDHSA Code Object V3 Kernel Metadata Map + :name: amdgpu-amdhsa-code-object-kernel-metadata-map-table-v3 + + =================================== ============== ========= ================================ + String Key Value Type Required? Description + =================================== ============== ========= ================================ + ".name" string Required Source name of the kernel. + ".symbol" string Required Name of the kernel + descriptor ELF symbol. + ".language" string Source language of the kernel. + Values include: + + - "OpenCL C" + - "OpenCL C++" + - "HCC" + - "HIP" + - "OpenMP" + - "Assembler" + + ".language_version" sequence of - The first integer is the major + 2 integers version. + - The second integer is the + minor version. + ".args" sequence of Sequence of maps of the + map kernel arguments. See + :ref:`amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v3` + for the definition of the keys + included in that map. + ".reqd_workgroup_size" sequence of If not 0, 0, 0 then all values + 3 integers must be >=1 and the dispatch + work-group size X, Y, Z must + correspond to the specified + values. Defaults to 0, 0, 0. + + Corresponds to the OpenCL + ``reqd_work_group_size`` + attribute. + ".workgroup_size_hint" sequence of The dispatch work-group size + 3 integers X, Y, Z is likely to be the + specified values. + + Corresponds to the OpenCL + ``work_group_size_hint`` + attribute. + ".vec_type_hint" string The name of a scalar or vector + type. + + Corresponds to the OpenCL + ``vec_type_hint`` attribute. + + ".device_enqueue_symbol" string The external symbol name + associated with a kernel. + OpenCL runtime allocates a + global buffer for the symbol + and saves the kernel's address + to it, which is used for + device side enqueueing. Only + available for device side + enqueued kernels. + ".kernarg_segment_size" integer Required The size in bytes of + the kernarg segment + that holds the values + of the arguments to + the kernel. + ".group_segment_fixed_size" integer Required The amount of group + segment memory + required by a + work-group in + bytes. This does not + include any + dynamically allocated + group segment memory + that may be added + when the kernel is + dispatched. + ".private_segment_fixed_size" integer Required The amount of fixed + private address space + memory required for a + work-item in + bytes. If the kernel + uses a dynamic call + stack then additional + space must be added + to this value for the + call stack. + ".kernarg_segment_align" integer Required The maximum byte + alignment of + arguments in the + kernarg segment. Must + be a power of 2. + ".wavefront_size" integer Required Wavefront size. Must + be a power of 2. + ".sgpr_count" integer Required Number of scalar + registers required by a + wavefront for + GFX6-GFX9. A register + is required if it is + used explicitly, or + if a higher numbered + register is used + explicitly. This + includes the special + SGPRs for VCC, Flat + Scratch (GFX7-GFX9) + and XNACK (for + GFX8-GFX9). It does + not include the 16 + SGPR added if a trap + handler is + enabled. It is not + rounded up to the + allocation + granularity. + ".vgpr_count" integer Required Number of vector + registers required by + each work-item for + GFX6-GFX9. A register + is required if it is + used explicitly, or + if a higher numbered + register is used + explicitly. + ".max_flat_workgroup_size" integer Required Maximum flat + work-group size + supported by the + kernel in work-items. + Must be >=1 and + consistent with + ReqdWorkGroupSize if + not 0, 0, 0. + ".sgpr_spill_count" integer Number of stores from + a scalar register to + a register allocator + created spill + location. + ".vgpr_spill_count" integer Number of stores from + a vector register to + a register allocator + created spill + location. + =================================== ============== ========= ================================ + +.. + + .. table:: AMDHSA Code Object V3 Kernel Argument Metadata Map + :name: amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v3 + + ====================== ============== ========= ================================ + String Key Value Type Required? Description + ====================== ============== ========= ================================ + ".name" string Kernel argument name. + ".type_name" string Kernel argument type name. + ".size" integer Required Kernel argument size in bytes. + ".offset" integer Required Kernel argument offset in + bytes. The offset must be a + multiple of the alignment + required by the argument. + ".value_kind" string Required Kernel argument kind that + specifies how to set up the + corresponding argument. + Values include: + + "by_value" + The argument is copied + directly into the kernarg. + + "global_buffer" + A global address space pointer + to the buffer data is passed + in the kernarg. + + "dynamic_shared_pointer" + A group address space pointer + to dynamically allocated LDS + is passed in the kernarg. + + "sampler" + A global address space + pointer to a S# is passed in + the kernarg. + + "image" + A global address space + pointer to a T# is passed in + the kernarg. + + "pipe" + A global address space pointer + to an OpenCL pipe is passed in + the kernarg. + + "queue" + A global address space pointer + to an OpenCL device enqueue + queue is passed in the + kernarg. + + "hidden_global_offset_x" + The OpenCL grid dispatch + global offset for the X + dimension is passed in the + kernarg. + + "hidden_global_offset_y" + The OpenCL grid dispatch + global offset for the Y + dimension is passed in the + kernarg. + + "hidden_global_offset_z" + The OpenCL grid dispatch + global offset for the Z + dimension is passed in the + kernarg. + + "hidden_none" + An argument that is not used + by the kernel. Space needs to + be left for it, but it does + not need to be set up. + + "hidden_printf_buffer" + A global address space pointer + to the runtime printf buffer + is passed in kernarg. + + "hidden_default_queue" + A global address space pointer + to the OpenCL device enqueue + queue that should be used by + the kernel by default is + passed in the kernarg. + + "hidden_completion_action" + A global address space pointer + to help link enqueued kernels into + the ancestor tree for determining + when the parent kernel has finished. + + ".value_type" string Required Kernel argument value type. Only + present if ".value_kind" is + "by_value". For vector data + types, the value is for the + element type. Values include: + + - "struct" + - "i8" + - "u8" + - "i16" + - "u16" + - "f16" + - "i32" + - "u32" + - "f32" + - "i64" + - "u64" + - "f64" + + .. TODO + How can it be determined if a + vector type, and what size + vector? + ".pointee_align" integer Alignment in bytes of pointee + type for pointer type kernel + argument. Must be a power + of 2. Only present if + ".value_kind" is + "dynamic_shared_pointer". + ".address_space" string Kernel argument address space + qualifier. Only present if + ".value_kind" is "global_buffer" or + "dynamic_shared_pointer". Values + are: + + - "private" + - "global" + - "constant" + - "local" + - "generic" + - "region" + + .. TODO + Is "global_buffer" only "global" + or "constant"? Is + "dynamic_shared_pointer" always + "local"? Can HCC allow "generic"? + How can "private" or "region" + ever happen? + ".access" string Kernel argument access + qualifier. Only present if + ".value_kind" is "image" or + "pipe". Values + are: + + - "read_only" + - "write_only" + - "read_write" + + .. TODO + Does this apply to + "global_buffer"? + ".actual_access" string The actual memory accesses + performed by the kernel on the + kernel argument. Only present if + ".value_kind" is "global_buffer", + "image", or "pipe". This may be + more restrictive than indicated + by ".access" to reflect what the + kernel actual does. If not + present then the runtime must + assume what is implied by + ".access" and ".is_const" . Values + are: + + - "read_only" + - "write_only" + - "read_write" + + ".is_const" boolean Indicates if the kernel argument + is const qualified. Only present + if ".value_kind" is + "global_buffer". + + ".is_restrict" boolean Indicates if the kernel argument + is restrict qualified. Only + present if ".value_kind" is + "global_buffer". + + ".is_volatile" boolean Indicates if the kernel argument + is volatile qualified. Only + present if ".value_kind" is + "global_buffer". + + ".is_pipe" boolean Indicates if the kernel argument + is pipe qualified. Only present + if ".value_kind" is "pipe". + + .. TODO + Can "global_buffer" be pipe + qualified? + ====================== ============== ========= ================================ + .. Kernel Dispatch @@ -4103,21 +4558,26 @@ Instructions .. toctree:: :hidden: - AMDGPUAsmGFX7 - AMDGPUAsmGFX8 - AMDGPUAsmGFX9 + AMDGPU/AMDGPUAsmGFX7 + AMDGPU/AMDGPUAsmGFX8 + AMDGPU/AMDGPUAsmGFX9 + AMDGPUModifierSyntax AMDGPUOperandSyntax + AMDGPUInstructionSyntax + AMDGPUInstructionNotation -An instruction has the following syntax: +An instruction has the following :doc:`syntax`: - * , ,... ...* + ``<``\ *opcode*\ ``> <``\ *operand0*\ ``>, <``\ *operand1*\ ``>,... <``\ *modifier0*\ ``> <``\ *modifier1*\ ``>...`` -Note that operands are normally comma-separated while modifiers are space-separated. +:doc:`Operands` are normally comma-separated while +:doc:`modifiers` are space-separated. -The order of operands and modifiers is fixed. Most modifiers are optional and may be omitted. +The order of *operands* and *modifiers* is fixed. +Most *modifiers* are optional and may be omitted. -See detailed instruction syntax description for :doc:`GFX7`, -:doc:`GFX8` and :doc:`GFX9`. +See detailed instruction syntax description for :doc:`GFX7`, +:doc:`GFX8` and :doc:`GFX9`. Note that features under development are not included in this description. @@ -4128,22 +4588,12 @@ operands, refer to one of instruction set architecture manuals Operands ~~~~~~~~ -The following syntax for register operands is supported: - -* SGPR registers: s0, ... or s[0], ... -* VGPR registers: v0, ... or v[0], ... -* TTMP registers: ttmp0, ... or ttmp[0], ... -* Special registers: exec (exec_lo, exec_hi), vcc (vcc_lo, vcc_hi), flat_scratch (flat_scratch_lo, flat_scratch_hi) -* Special trap registers: tba (tba_lo, tba_hi), tma (tma_lo, tma_hi) -* Register pairs, quads, etc: s[2:3], v[10:11], ttmp[5:6], s[4:7], v[12:15], ttmp[4:7], s[8:15], ... -* Register lists: [s0, s1], [ttmp0, ttmp1, ttmp2, ttmp3] -* Register index expressions: v[2*2], s[1-1:2-1] -* 'off' indicates that an operand is not enabled +Detailed description of operands may be found :doc:`here`. Modifiers ~~~~~~~~~ -Detailed description of modifiers may be found :doc:`here`. +Detailed description of modifiers may be found :doc:`here`. Instruction Examples ~~~~~~~~~~~~~~~~~~~~ @@ -4373,7 +4823,7 @@ used. The default value for all keys is 0, with the following exceptions: - *wavefront_size* defaults to 6. - *kernarg_segment_alignment*, *group_segment_alignment*, and *private_segment_alignment* default to 4. Note that alignments are specified - as a power of two, so a value of **n** means an alignment of 2^ **n**. + as a power of 2, so a value of **n** means an alignment of 2^ **n**. The *.amd_kernel_code_t* directive must be placed immediately after the function label and before any instructions. @@ -4586,6 +5036,17 @@ terminated by an ``.end_amdhsa_kernel`` directive. :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx9-table`. ======================================================== ================ ============ =================== +.amdgpu_metadata +++++++++++++++++ + +Optional directive which declares the contents of the ``NT_AMDGPU_METADATA`` +note record (see :ref:`amdgpu-elf-note-records-table-v3`). + +The contents must be in the [YAML]_ markup format, with the same structure and +semantics described in :ref:`amdgpu-amdhsa-code-object-metadata-v3`. + +This directive is terminated by an ``.end_amdgpu_metadata`` directive. + Example HSA Source Code (-mattr=+code-object-v3) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -4618,6 +5079,24 @@ Here is an example of a minimal assembly source file, defining one HSA kernel: .amdhsa_next_free_sgpr .amdgcn.next_free_sgpr .end_amdhsa_kernel + .amdgpu_metadata + --- + amdhsa.version: + - 1 + - 0 + amdhsa.kernels: + - .name: hello_world + .symbol: hello_world.kd + .kernarg_segment_size: 48 + .group_segment_fixed_size: 0 + .private_segment_fixed_size: 0 + .kernarg_segment_align: 4 + .wavefront_size: 64 + .sgpr_count: 2 + .vgpr_count: 3 + .max_flat_workgroup_size: 256 + ... + .end_amdgpu_metadata Additional Documentation ======================== @@ -4636,6 +5115,7 @@ Additional Documentation .. [ELF] `Executable and Linkable Format (ELF) `__ .. [DWARF] `DWARF Debugging Information Format `__ .. [YAML] `YAML Ain't Markup Language (YAML™) Version 1.2 `__ +.. [MsgPack] `Message Pack `__ .. [OpenCL] `The OpenCL Specification Version 2.0 `__ .. [HRF] `Heterogeneous-race-free Memory Models `__ .. [CLANG-ATTR] `Attributes in Clang `__ diff --git a/docs/AdvancedBuilds.rst b/docs/AdvancedBuilds.rst index d2a2ef58b23e4c85a6bea3a2f16baa9a49e36afa..695dcfb62a1fd21fa8dc97e415f36c05ab155640 100644 --- a/docs/AdvancedBuilds.rst +++ b/docs/AdvancedBuilds.rst @@ -51,6 +51,15 @@ CMake option, each variable separated by a ";". As example: $ cmake -G Ninja -DCLANG_ENABLE_BOOTSTRAP=On -DCLANG_BOOTSTRAP_PASSTHROUGH="CMAKE_INSTALL_PREFIX;CMAKE_VERBOSE_MAKEFILE" $ ninja stage2 +CMake options starting by ``BOOTSTRAP_`` will be passed only to the stage2 build. +This gives the opportunity to use Clang specific build flags. +For example, the following CMake call will enabled '-fno-addrsig' only during +the stage2 build for C and C++. + +.. code-block:: console + + $ cmake [..] -DBOOTSTRAP_CMAKE_CXX_FLAGS='-fno-addrsig' -DBOOTSTRAP_CMAKE_C_FLAGS='-fno-addrsig' [..] + The clang build system refers to builds as stages. A stage1 build is a standard build using the compiler installed on the host, and a stage2 build is built using the stage1 compiler. This nomenclature holds up to more stages too. In diff --git a/docs/Atomics.rst b/docs/Atomics.rst index 4961348d0c97d7915f13c3afa51511a97e1283c3..450b5b36f63abeaf2aaa3fb8864d5aa5c9a7636d 100644 --- a/docs/Atomics.rst +++ b/docs/Atomics.rst @@ -461,8 +461,24 @@ atomic constructs. Here are some lowerings it can do: * atomic rmw -> loop with cmpxchg or load-linked/store-conditional by overriding ``expandAtomicRMWInIR()`` * expansion to __atomic_* libcalls for unsupported sizes. - -For an example of all of these, look at the ARM backend. +* part-word atomicrmw/cmpxchg -> target-specific intrinsic by overriding + ``shouldExpandAtomicRMWInIR``, ``emitMaskedAtomicRMWIntrinsic``, + ``shouldExpandAtomicCmpXchgInIR``, and ``emitMaskedAtomicCmpXchgIntrinsic``. + +For an example of these look at the ARM (first five lowerings) or RISC-V (last +lowering) backend. + +AtomicExpandPass supports two strategies for lowering atomicrmw/cmpxchg to +load-linked/store-conditional (LL/SC) loops. The first expands the LL/SC loop +in IR, calling target lowering hooks to emit intrinsics for the LL and SC +operations. However, many architectures have strict requirements for LL/SC +loops to ensure forward progress, such as restrictions on the number and type +of instructions in the loop. It isn't possible to enforce these restrictions +when the loop is expanded in LLVM IR, and so affected targets may prefer to +expand to LL/SC loops at a very late stage (i.e. after register allocation). +AtomicExpandPass can help support lowering of part-word atomicrmw or cmpxchg +using this strategy by producing IR for any shifting and masking that can be +performed outside of the LL/SC loop. Libcalls: __atomic_* ==================== diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst index 92cad91a08e44b7824fe2da9a419bca4a721fb3b..88d8ff804d981f333e57c64dff88dab3ca69c813 100644 --- a/docs/CodingStandards.rst +++ b/docs/CodingStandards.rst @@ -305,6 +305,21 @@ useful to use C style (``/* */``) comments however: #. When writing a source file that is used by a tool that only accepts C style comments. +#. When documenting the significance of constants used as actual parameters in + a call. This is most helpful for ``bool`` parameters, or passing ``0`` or + ``nullptr``. Typically you add the formal parameter name, which ought to be + meaningful. For example, it's not clear what the parameter means in this call: + + .. code-block:: c++ + + Object.emitName(nullptr); + + An in-line C-style comment makes the intent obvious: + + .. code-block:: c++ + + Object.emitName(/*Prefix=*/nullptr); + Commenting out large blocks of code is discouraged, but if you really have to do this (for documentation purposes or as a suggestion for debug printing), use ``#if 0`` and ``#endif``. These nest properly and are better behaved in general diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst index 6581b33ba1c716ccc6e0f53c39b5b2fb3745907e..721d2c2e782e8c2949691b6074e4dc97d575c2c7 100644 --- a/docs/CommandGuide/FileCheck.rst +++ b/docs/CommandGuide/FileCheck.rst @@ -80,9 +80,16 @@ and from the command line. -verify``. With this option FileCheck will verify that input does not contain warnings not covered by any ``CHECK:`` patterns. +.. option:: --dump-input + + Dump input to stderr, adding annotations representing currently enabled + diagnostics. Do this either 'always', on 'fail', or 'never'. Specify 'help' + to explain the dump format and quit. + .. option:: --dump-input-on-failure - When the check fails, dump all of the original input. + When the check fails, dump all of the original input. This option is + deprecated in favor of `--dump-input=fail`. .. option:: --enable-var-scope @@ -311,6 +318,29 @@ can be used: ; CHECK: ret i8 } +The "CHECK-COUNT:" directive +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you need to match multiple lines with the same pattern over and over again +you can repeat a plain ``CHECK:`` as many times as needed. If that looks too +boring you can instead use a counted check "``CHECK-COUNT-:``", where +```` is a positive decimal number. It will match the pattern exactly +```` times, no more and no less. If you specified a custom check prefix, +just use "``-COUNT-:``" for the same effect. +Here is a simple example: + +.. code-block:: text + + Loop at depth 1 + Loop at depth 1 + Loop at depth 1 + Loop at depth 1 + Loop at depth 2 + Loop at depth 3 + + ; CHECK-COUNT-6: Loop at depth {{[0-9]+}} + ; CHECK-NOT: Loop at depth {{[0-9]+}} + The "CHECK-DAG:" directive ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/CommandGuide/llvm-cov.rst b/docs/CommandGuide/llvm-cov.rst index 6f1b6e46c48ae869a0ef943b32dff41fdc2b8acd..71924e997d934a6f0725852a7cb3328f903da6ff 100644 --- a/docs/CommandGuide/llvm-cov.rst +++ b/docs/CommandGuide/llvm-cov.rst @@ -374,9 +374,15 @@ SYNOPSIS DESCRIPTION ^^^^^^^^^^^ -The :program:`llvm-cov export` command exports regions, functions, expansions, -and summaries of the coverage of the binaries *BIN*,... using the profile data -*PROFILE* as JSON. It can optionally be filtered to only export the coverage +The :program:`llvm-cov export` command exports coverage data of the binaries +*BIN*,... using the profile data *PROFILE* in either JSON or lcov trace file +format. + +When exporting JSON, the regions, functions, expansions, and summaries of the +coverage data will be exported. When exporting an lcov trace file, the +line-based coverage and summaries will be exported. + +The exported data can optionally be filtered to only export the coverage for the files listed in *SOURCES*. For information on compiling programs for coverage and generating profile data, @@ -392,12 +398,18 @@ OPTIONS universal binary or to use an architecture that does not match a non-universal binary. +.. option:: -format= + + Use the specified output format. The supported formats are: "text" (JSON), + "lcov". + .. option:: -summary-only Export only summary information for each file in the coverage data. This mode will not export coverage information for smaller units such as individual - functions or regions. The result will be the same as produced by :program: - `llvm-cov report` command, but presented in JSON format rather than text. + functions or regions. The result will contain the same information as produced + by the :program:`llvm-cov report` command, but presented in JSON or lcov + format rather than text. .. option:: -ignore-filename-regex= diff --git a/docs/CommandGuide/llvm-mca.rst b/docs/CommandGuide/llvm-mca.rst index 100136e4d1734c9cd72cb49e16c58f9ccff22ef7..bc50794e0cbc613755610ac0918080a7cfbbee93 100644 --- a/docs/CommandGuide/llvm-mca.rst +++ b/docs/CommandGuide/llvm-mca.rst @@ -516,6 +516,10 @@ sections. 1, 102 (16.7%) 2, 399 (65.4%) + Total ROB Entries: 64 + Max Used ROB Entries: 35 ( 54.7% ) + Average Used ROB Entries per cy: 32 ( 50.0% ) + Register File statistics: Total number of mappings created: 900 diff --git a/docs/CompileCudaWithLLVM.rst b/docs/CompileCudaWithLLVM.rst index 6ad8652cfc1d527b0ca3b2c8cacdbcbc4aeb712f..95d6b0d9b82a19fef66520d88d92ed5dda32ade9 100644 --- a/docs/CompileCudaWithLLVM.rst +++ b/docs/CompileCudaWithLLVM.rst @@ -22,21 +22,21 @@ Compiling CUDA Code Prerequisites ------------- -CUDA is supported in llvm 3.9, but it's still in active development, so we -recommend you `compile clang/LLVM from HEAD -`_. - -Before you build CUDA code, you'll need to have installed the appropriate -driver for your nvidia GPU and the CUDA SDK. See `NVIDIA's CUDA installation -guide `_ -for details. Note that clang `does not support -`_ the CUDA toolkit as installed -by many Linux package managers; you probably need to install nvidia's package. - -You will need CUDA 7.0, 7.5, or 8.0 to compile with clang. - -CUDA compilation is supported on Linux, on MacOS as of 2016-11-18, and on -Windows as of 2017-01-05. +CUDA is supported since llvm 3.9. Current release of clang (7.0.0) supports CUDA +7.0 through 9.2. If you need support for CUDA 10, you will need to use clang +built from r342924 or newer. + +Before you build CUDA code, you'll need to have installed the appropriate driver +for your nvidia GPU and the CUDA SDK. See `NVIDIA's CUDA installation guide +`_ for +details. Note that clang `does not support +`_ the CUDA toolkit as installed by +many Linux package managers; you probably need to install CUDA in a single +directory from NVIDIA's package. + +CUDA compilation is supported on Linux. Compilation on MacOS and Windows may or +may not work and currently have no maintainers. Compilation with CUDA-9.x is +`currently broken on Windows `_. Invoking clang -------------- @@ -73,7 +73,9 @@ run your program. Pass e.g. ``-L/usr/local/cuda/lib64`` if compiling in 64-bit mode; otherwise, pass e.g. ``-L/usr/local/cuda/lib``. (In CUDA, the device code and host code always have the same pointer widths, so if you're compiling 64-bit code for - the host, you're also compiling 64-bit code for the device.) + the host, you're also compiling 64-bit code for the device.) Note that as of + v10.0 CUDA SDK `no longer supports compilation of 32-bit + applications `_. * ```` -- the `compute capability `_ of your GPU. For example, if you @@ -89,8 +91,7 @@ run your program. The `-L` and `-l` flags only need to be passed when linking. When compiling, you may also need to pass ``--cuda-path=/path/to/cuda`` if you didn't install -the CUDA SDK into ``/usr/local/cuda``, ``/usr/local/cuda-7.0``, or -``/usr/local/cuda-7.5``. +the CUDA SDK into ``/usr/local/cuda`` or ``/usr/local/cuda-X.Y``. Flags that control numerical code --------------------------------- @@ -548,9 +549,9 @@ The relevant tools are now just vanilla clang/LLVM. | Jingyue Wu, Artem Belevich, Eli Bendersky, Mark Heffernan, Chris Leary, Jacques Pienaar, Bjarke Roune, Rob Springer, Xuetian Weng, Robert Hundt | *Proceedings of the 2016 International Symposium on Code Generation and Optimization (CGO 2016)* | -| `Slides from the CGO talk `_ +| `Slides from the CGO talk `_ | -| `Tutorial given at CGO `_ +| `Tutorial given at CGO `_ Obtaining Help ============== diff --git a/docs/GarbageCollection.rst b/docs/GarbageCollection.rst index e4f5802f887949058b6c3cd476ad42c0a2603122..5e671bc10b6988f4a7f3a398184fa7dea8af5049 100644 --- a/docs/GarbageCollection.rst +++ b/docs/GarbageCollection.rst @@ -835,45 +835,23 @@ for collector plugins which implement reference counting or a shadow stack. .. _init-roots: -Initializing roots to null: ``InitRoots`` ------------------------------------------ - -.. code-block:: c++ - - MyGC::MyGC() { - InitRoots = true; - } - -When set, LLVM will automatically initialize each root to ``null`` upon entry to -the function. This prevents the GC's sweep phase from visiting uninitialized -pointers, which will almost certainly cause it to crash. This initialization -occurs before custom lowering, so the two may be used together. - -Since LLVM does not yet compute liveness information, there is no means of -distinguishing an uninitialized stack root from an initialized one. Therefore, -this feature should be used by all GC plugins. It is enabled by default. +Initializing roots to null +--------------------------- -Custom lowering of intrinsics: ``CustomRoots``, ``CustomReadBarriers``, and ``CustomWriteBarriers`` ---------------------------------------------------------------------------------------------------- +It is recommended that frontends initialize roots explicitly to avoid +potentially confusing the optimizer. This prevents the GC from visiting +uninitialized pointers, which will almost certainly cause it to crash. -For GCs which use barriers or unusual treatment of stack roots, these -flags allow the collector to perform arbitrary transformations of the -LLVM IR: +As a fallback, LLVM will automatically initialize each root to ``null`` +upon entry to the function. Support for this mode in code generation is +largely a legacy detail to keep old collector implementations working. -.. code-block:: c++ - - class MyGC : public GCStrategy { - public: - MyGC() { - CustomRoots = true; - CustomReadBarriers = true; - CustomWriteBarriers = true; - } - }; +Custom lowering of intrinsics +------------------------------ -If any of these flags are set, LLVM suppresses its default lowering for -the corresponding intrinsics. Instead, you must provide a custom Pass -which lowers the intrinsics as desired. If you have opted in to custom +For GCs which use barriers or unusual treatment of stack roots, the +implementor is responsibly for providing a custom pass to lower the +intrinsics with the desired semantics. If you have opted in to custom lowering of a particular intrinsic your pass **must** eliminate all instances of the corresponding intrinsic in functions which opt in to your GC. The best example of such a pass is the ShadowStackGC and it's @@ -884,62 +862,14 @@ without building a custom copy of LLVM. .. _safe-points: -Generating safe points: ``NeededSafePoints`` --------------------------------------------- - -LLVM can compute four kinds of safe points: - -.. code-block:: c++ - - namespace GC { - /// PointKind - The type of a collector-safe point. - /// - enum PointKind { - Loop, //< Instr is a loop (backwards branch). - Return, //< Instr is a return instruction. - PreCall, //< Instr is a call instruction. - PostCall //< Instr is the return address of a call. - }; - } - -A collector can request any combination of the four by setting the -``NeededSafePoints`` mask: - -.. code-block:: c++ - - MyGC::MyGC() { - NeededSafePoints = 1 << GC::Loop - | 1 << GC::Return - | 1 << GC::PreCall - | 1 << GC::PostCall; - } - -It can then use the following routines to access safe points. - -.. code-block:: c++ - - for (iterator I = begin(), E = end(); I != E; ++I) { - GCFunctionInfo *MD = *I; - size_t PointCount = MD->size(); - - for (GCFunctionInfo::iterator PI = MD->begin(), - PE = MD->end(); PI != PE; ++PI) { - GC::PointKind PointKind = PI->Kind; - unsigned PointNum = PI->Num; - } - } - -Almost every collector requires ``PostCall`` safe points, since these correspond -to the moments when the function is suspended during a call to a subroutine. - -Threaded programs generally require ``Loop`` safe points to guarantee that the -application will reach a safe point within a bounded amount of time, even if it -is executing a long-running loop which contains no function calls. +Generating safe points +----------------------- -Threaded collectors may also require ``Return`` and ``PreCall`` safe points to -implement "stop the world" techniques using self-modifying code, where it is -important that the program not exit the function without reaching a safe point -(because only the topmost function has been patched). +LLVM provides support for associating stackmaps with the return address of +a call. Any loop or return safepoints required by a given collector design +can be modeled via calls to runtime routines, or potentially patchable call +sequences. Using gcroot, all call instructions are inferred to be possible +safepoints and will thus have an associated stackmap. .. _assembly: diff --git a/docs/HowToBuildWithPGO.rst b/docs/HowToBuildWithPGO.rst index ba93bc64a294ab9b2cdd1763160620336bf6070a..77867dbc1d5dad0fb06790577d15eb6e3167f10c 100644 --- a/docs/HowToBuildWithPGO.rst +++ b/docs/HowToBuildWithPGO.rst @@ -125,12 +125,12 @@ In more detailed steps: It's recommended to build the ``all`` target with your instrumented Clang, since more coverage is often better. - b. You should now have a few ``*.profdata`` files in + b. You should now have a few ``*.profraw`` files in ``path/to/stage2/profiles/``. You need to merge these using ``llvm-profdata`` (even if you only have one! The profile merge transforms profraw into actual profile data, as well). This can be done with - ``/path/to/stage1/llvm-profdata -merge - -output=/path/to/output/profdata.prof path/to/stage2/profiles/*.profdata``. + ``/path/to/stage1/llvm-profdata merge + -output=/path/to/output/profdata.prof path/to/stage2/profiles/*.profraw``. 4. Now, build your final, PGO-optimized Clang. To do this, you'll want to pass the following additional arguments to CMake. diff --git a/docs/HowToCrossCompileBuiltinsOnArm.rst b/docs/HowToCrossCompileBuiltinsOnArm.rst index 4b4d563a5a968dfd4317d785dc1cdb3c4355b143..6ad93c773b25758c428dcd25abc15d6892e66c5b 100644 --- a/docs/HowToCrossCompileBuiltinsOnArm.rst +++ b/docs/HowToCrossCompileBuiltinsOnArm.rst @@ -19,33 +19,46 @@ may need to adapt the instructions here to fit your own local situation. Prerequisites ============= -In this use case we'll be using CMake on a Debian-based Linux system, +In this use case we'll be using cmake on a Debian-based Linux system, cross-compiling from an x86_64 host to a hard-float Armv7-A target. We'll be using as many of the LLVM tools as we can, but it is possible to use GNU equivalents. * ``A build of LLVM/clang for the llvm-tools and llvm-config`` + * ``A clang executable with support for the ARM target`` + * ``compiler-rt sources`` * ``The qemu-arm user mode emulator`` * ``An arm-linux-gnueabihf sysroot`` +In this example we will be using ninja. + See https://compiler-rt.llvm.org/ for more information about the dependencies on clang and LLVM. +See https://llvm.org/docs/GettingStarted.html for information about obtaining +the source for LLVM and compiler-rt. Note that the getting started guide +places compiler-rt in the projects subdirectory, but this is not essential and +if you are using the BaremetalARM.cmake cache for v6-M, v7-M and v7-EM then +compiler-rt must be placed in the runtimes directory. + ``qemu-arm`` should be available as a package for your Linux distribution. The most complicated of the prequisites to satisfy is the arm-linux-gnueabihf -sysroot. The :doc:`HowToCrossCompileLLVM` has information about how to use the -Linux distributions multiarch support to fulfill the dependencies for building -LLVM. Alternatively, as building and testing just the compiler-rt builtins -requires fewer dependencies than LLVM, it is possible to use the Linaro -arm-linux-gnueabihf gcc installation as our sysroot. +sysroot. In theory it is possible to use the Linux distributions multiarch +support to fulfill the dependencies for building but unfortunately due to +/usr/local/include being added some host includes are selected. The easiest way +to supply a sysroot is to download the arm-linux-gnueabihf toolchain. This can +be found at: +* https://developer.arm.com/open-source/gnu-toolchain/gnu-a/downloads for gcc 8 and above +* https://releases.linaro.org/components/toolchain/binaries/ for gcc 4.9 to 7.3 Building compiler-rt builtins for Arm ===================================== We will be doing a standalone build of compiler-rt using the following cmake options. -* ``path/to/llvm/projects/compiler-rt`` +* ``path/to/compiler-rt`` +* ``-G Ninja`` * ``-DCOMPILER_RT_BUILD_BUILTINS=ON`` * ``-DCOMPILER_RT_BUILD_SANITIZERS=OFF`` * ``-DCOMPILER_RT_BUILD_XRAY=OFF`` @@ -57,22 +70,38 @@ options. * ``-DCMAKE_RANLIB=/path/to/llvm-ranlib`` * ``-DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=lld"`` * ``-DCMAKE_C_COMPILER_TARGET="arm-linux-gnueabihf"`` +* ``-DCMAKE_ASM_COMPILER_TARGET="arm-linux-gnueabihf"`` * ``-DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON`` * ``-DLLVM_CONFIG_PATH=/path/to/llvm-config`` * ``-DCMAKE_C_FLAGS="build-c-flags"`` +* ``-DCMAKE_ASM_FLAGS="build-c-flags"`` -The build-c-flags need to be sufficient to pass the C-make compiler check and -to compile compiler-rt. When using a GCC 7 Linaro arm-linux-gnueabihf -installation the following flags are needed: +The ``build-c-flags`` need to be sufficient to pass the C-make compiler check, +compile compiler-rt, and if you are running the tests, compile and link the +tests. When cross-compiling with clang we will need to pass sufficient +information to generate code for the Arm architecture we are targeting. We will +need to select the Arm target, select the Armv7-A architecture and choose +between using Arm or Thumb. +instructions. For example: * ``--target=arm-linux-gnueabihf`` -* ``--march=armv7a`` +* ``-march=armv7a`` +* ``-mthumb`` + +When using a GCC arm-linux-gnueabihf toolchain the following flags are +needed to pick up the includes and libraries: + * ``--gcc-toolchain=/path/to/dir/toolchain`` * ``--sysroot=/path/to/toolchain/arm-linux-gnueabihf/libc`` -Depending on how your sysroot is laid out, you may not need ``--gcc-toolchain``. -For example if you have added armhf as an architecture using your Linux -distributions multiarch support then you should be able to use ``--sysroot=/``. +In this example we will be adding all of the command line options to both +``CMAKE_C_FLAGS`` and ``CMAKE_ASM_FLAGS``. There are cmake flags to pass some of +these options individually which can be used to simplify the ``build-c-flags``: + +* ``-DCMAKE_C_COMPILER_TARGET="arm-linux-gnueabihf"`` +* ``-DCMAKE_ASM_COMPILER_TARGET="arm-linux-gnueabihf"`` +* ``-DCMAKE_C_COMPILER_EXTERNAL_TOOLCHAIN=/path/to/dir/toolchain`` +* ``-DCMAKE_SYSROOT=/path/to/dir/toolchain/arm-linux-gnueabihf/libc`` Once cmake has completed the builtins can be built with ``ninja builtins`` @@ -90,12 +119,72 @@ cmake that we wish to run the tests on ``qemu-arm``. The ``/path/to/armhf/sysroot`` should be the same as the one passed to ``--sysroot`` in the "build-c-flags". -The "test-c-flags" can be the same as the "build-c-flags", with the addition -of ``"-fuse-ld=lld`` if you wish to use lld to link the tests. +The "test-c-flags" need to include the target, architecture, gcc-toolchain, +sysroot and arm/thumb state. The additional cmake defines such as +``CMAKE_C_COMPILER_EXTERNAL_TOOLCHAIN`` do not apply when building the tests. If +you have put all of these in "build-c-flags" then these can be repeated. If you +wish to use lld to link the tests then add ``"-fuse-ld=lld``. Once cmake has completed the tests can be built and run using ``ninja check-builtins`` +Troubleshooting +=============== + +The cmake try compile stage fails +--------------------------------- +At an early stage cmake will attempt to compile and link a simple C program to +test if the toolchain is working. + +This stage can often fail at link time if the ``--sysroot`` and +``--gcc-toolchain`` options are not passed to the compiler. Check the +``CMAKE_C_FLAGS`` and ``CMAKE_C_COMPILER_TARGET`` flags. + +It can be useful to build a simple example outside of cmake with your toolchain +to make sure it is working. For example: ``clang --target=arm-linux-gnueabi -march=armv7a --gcc-toolchain=/path/to/gcc-toolchain --sysroot=/path/to/gcc-toolchain/arm-linux-gnueabihf/libc helloworld.c`` + +Clang uses the host header files +-------------------------------- +On debian based systems it is possible to install multiarch support for +arm-linux-gnueabi and arm-linux-gnueabihf. In many cases clang can successfully +use this multiarch support when -gcc-toolchain and --sysroot are not supplied. +Unfortunately clang adds ``/usr/local/include`` before +``/usr/include/arm-linux-gnueabihf`` leading to errors when compiling the hosts +header files. + +The multiarch support is not sufficient to build the builtins you will need to +use a separate arm-linux-gnueabihf toolchain. + +No target passed to clang +------------------------- +If clang is not given a target it will typically use the host target, this will +not understand the Arm assembly language files resulting in error messages such +as ``error: unknown directive .syntax unified``. + +You can check the clang invocation in the error message to see if there is no +``--target`` or if it is set incorrectly. The cause is usually +``CMAKE_ASM_FLAGS`` not containing ``--target`` or ``CMAKE_ASM_COMPILER_TARGET`` not being present. + +Arm architecture not given +-------------------------- +The ``--target=arm-linux-gnueabihf`` will default to arm architecture v4t which +cannot assemble the barrier instructions used in the synch_and_fetch source +files. + +The cause is usually a missing ``-march=armv7a`` from the ``CMAKE_ASM_FLAGS``. + +Compiler-rt builds but the tests fail to build +---------------------------------------------- +The flags used to build the tests are not the same as those used to build the +builtins. The c flags are provided by ``COMPILER_RT_TEST_COMPILE_CFLAGS`` and +the ``CMAKE_C_COMPILER_TARGET``, ``CMAKE_ASM_COMPILER_TARGET``, +``CMAKE_C_COMPILER_EXTERNAL_TOOLCHAIN`` and ``CMAKE_SYSROOT`` flags are not +applied. + +Make sure that ``COMPILER_RT_TEST_COMPILE_CFLAGS`` contains all the necessary +information. + + Modifications for other Targets =============================== @@ -112,6 +201,8 @@ may need extra c-flags such as ``-mfloat-abi=softfp`` for use of floating-point instructions, and ``-mfloat-abi=soft -mfpu=none`` for software floating-point emulation. +You will need to use an arm-linux-gnueabi GNU toolchain for soft-float. + AArch64 Target -------------- The instructions for Arm can be used for AArch64 by substituting AArch64 @@ -125,39 +216,24 @@ The CMAKE_C_FLAGS and COMPILER_RT_TEST_COMPILER_CFLAGS may also need: Armv6-m, Armv7-m and Armv7E-M targets ------------------------------------- -If you wish to build, but not test compiler-rt for Armv6-M, Armv7-M or Armv7E-M -then the easiest way is to use the BaremetalARM.cmake recipe in -clang/cmake/caches. - -You will need a bare metal sysroot such as that provided by the GNU ARM -Embedded toolchain. - -The libraries can be built with the cmake options: - -* ``-DBAREMETAL_ARMV6M_SYSROOT=/path/to/bare/metal/sysroot`` -* ``-DBAREMETAL_ARMV7M_SYSROOT=/path/to/bare/metal/sysroot`` -* ``-DBAREMETAL_ARMV7EM_SYSROOT=/path/to/bare/metal/sysroot`` -* ``-C /path/to/llvm/source/tools/clang/cmake/caches/BaremetalARM.cmake`` - -**Note** that for the recipe to work the compiler-rt source must be checked out -into the directory llvm/runtimes and not llvm/projects. - To build and test the libraries using a similar method to Armv7-A is possible but more difficult. The main problems are: * There isn't a ``qemu-arm`` user-mode emulator for bare-metal systems. The ``qemu-system-arm`` can be used but this is significantly more difficult to setup. -* The target to compile compiler-rt have the suffix -none-eabi. This uses the BareMetal driver in clang and by default won't find the libraries needed to pass the cmake compiler check. +* The targets to compile compiler-rt have the suffix -none-eabi. This uses the BareMetal driver in clang and by default won't find the libraries needed to pass the cmake compiler check. As the Armv6-M, Armv7-M and Armv7E-M builds of compiler-rt only use instructions that are supported on Armv7-A we can still get most of the value of running the tests using the same ``qemu-arm`` that we used for Armv7-A by building and running the test cases for Armv7-A but using the builtins compiled for -Armv6-M, Armv7-M or Armv7E-M. This will not catch instructions that are -supported on Armv7-A but not Armv6-M, Armv7-M and Armv7E-M. - -To get the cmake compile test to pass the libraries needed to successfully link -the test application will need to be manually added to ``CMAKE_CFLAGS``. -Alternatively if you are using version 3.6 or above of cmake you can use +Armv6-M, Armv7-M or Armv7E-M. This will test that the builtins can be linked +into a binary and execute the tests correctly but it will not catch if the +builtins use instructions that are supported on Armv7-A but not Armv6-M, +Armv7-M and Armv7E-M. + +To get the cmake compile test to pass you will need to pass the libraries +needed to successfully link the cmake test via ``CMAKE_CFLAGS``. It is +strongly recommended that you use version 3.6 or above of cmake so you can use ``CMAKE_TRY_COMPILE_TARGET=STATIC_LIBRARY`` to skip the link step. * ``-DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY`` @@ -169,6 +245,7 @@ Alternatively if you are using version 3.6 or above of cmake you can use * ``-DCOMPILER_RT_BUILD_PROFILE=OFF`` * ``-DCMAKE_C_COMPILER=${host_install_dir}/bin/clang`` * ``-DCMAKE_C_COMPILER_TARGET="your *-none-eabi target"`` +* ``-DCMAKE_ASM_COMPILER_TARGET="your *-none-eabi target"`` * ``-DCMAKE_AR=/path/to/llvm-ar`` * ``-DCMAKE_NM=/path/to/llvm-nm`` * ``-DCMAKE_RANLIB=/path/to/llvm-ranlib`` @@ -176,7 +253,7 @@ Alternatively if you are using version 3.6 or above of cmake you can use * ``-DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON`` * ``-DLLVM_CONFIG_PATH=/path/to/llvm-config`` * ``-DCMAKE_C_FLAGS="build-c-flags"`` -* ``-DCMAKE_ASM_FLAGS="${arm_cflags}"`` +* ``-DCMAKE_ASM_FLAGS="build-c-flags"`` * ``-DCOMPILER_RT_EMULATOR="qemu-arm -L /path/to/armv7-A/sysroot"`` * ``-DCOMPILER_RT_INCLUDE_TESTS=ON`` * ``-DCOMPILER_RT_TEST_COMPILER="/path/to/clang"`` @@ -186,16 +263,28 @@ The Armv6-M builtins will use the soft-float ABI. When compiling the tests for Armv7-A we must include ``"-mthumb -mfloat-abi=soft -mfpu=none"`` in the test-c-flags. We must use an Armv7-A soft-float abi sysroot for ``qemu-arm``. -Unfortunately at time of writing the Armv7-M and Armv7E-M builds of -compiler-rt will always include assembler files including floating point -instructions. This means that building for a cpu without a floating point unit -requires something like removing the arm_Thumb1_VFPv2_SOURCES from the -arm_Thumb1_SOURCES in builtins/CMakeLists.txt. The float-abi of the compiler-rt -library must be matched by the float abi of the Armv7-A sysroot used by -qemu-arm. - Depending on the linker used for the test cases you may encounter BuildAttribute mismatches between the M-profile objects from compiler-rt and the A-profile -objects from the test. The lld linker does not check the BuildAttributes so it -can be used to link the tests by adding -fuse-ld=lld to the +objects from the test. The lld linker does not check the profile +BuildAttribute so it can be used to link the tests by adding -fuse-ld=lld to the ``COMPILER_RT_TEST_COMPILER_CFLAGS``. + +Alternative using a cmake cache +------------------------------- +If you wish to build, but not test compiler-rt for Armv6-M, Armv7-M or Armv7E-M +the easiest way is to use the BaremetalARM.cmake recipe in clang/cmake/caches. + +You will need a bare metal sysroot such as that provided by the GNU ARM +Embedded toolchain. + +The libraries can be built with the cmake options: + +* ``-DBAREMETAL_ARMV6M_SYSROOT=/path/to/bare/metal/toolchain/arm-none-eabi`` +* ``-DBAREMETAL_ARMV7M_SYSROOT=/path/to/bare/metal/toolchain/arm-none-eabi`` +* ``-DBAREMETAL_ARMV7EM_SYSROOT=/path/to/bare/metal/toolchain/arm-none-eabi`` +* ``-C /path/to/llvm/source/tools/clang/cmake/caches/BaremetalARM.cmake`` +* ``/path/to/llvm`` + +**Note** that for the recipe to work the compiler-rt source must be checked out +into the directory llvm/runtimes. You will also need clang and lld checked out. + diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 06e092fb9fc5269b60fd6f5d69a315002d933be9..ea879a5db5b70f5ae78ef12b0ecf7b878833feea 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -1643,19 +1643,15 @@ example: ``speculative_load_hardening`` This attribute indicates that `Speculative Load Hardening `_ - should be enabled for the function body. This is a best-effort attempt to - mitigate all known speculative execution information leak vulnerabilities - that are based on the fundamental principles of modern processors' - speculative execution. These vulnerabilities are classified as "Spectre - variant #1" vulnerabilities typically. Notably, this does not attempt to - mitigate any vulnerabilities where the speculative execution and/or - prediction devices of specific processors can be *completely* undermined - (such as "Branch Target Injection", a.k.a, "Spectre variant #2"). Instead, - this is a target-independent request to harden against the completely - generic risk posed by speculative execution to incorrectly load secret data, - making it available to some micro-architectural side-channel for information - leak. For a processor without any speculative execution or predictors, this - is expected to be a no-op. + should be enabled for the function body. + + Speculative Load Hardening is a best-effort mitigation against + information leak attacks that make use of control flow + miss-speculation - specifically miss-speculation of whether a branch + is taken or not. Typically vulnerabilities enabling such attacks are + classified as "Spectre variant #1". Notably, this does not attempt to + mitigate against miss-speculation of branch target, classified as + "Spectre variant #2" vulnerabilities. When inlining, the attribute is sticky. Inlining a function that carries this attribute will cause the caller to gain the attribute. This is intended @@ -5080,6 +5076,8 @@ optimizations related to compare and branch instructions. The metadata is treated as a boolean value; if it exists, it signals that the branch or switch that it is attached to is completely unpredictable. +.. _llvm.loop: + '``llvm.loop``' ^^^^^^^^^^^^^^^ @@ -5113,6 +5111,26 @@ suggests an unroll factor to the loop unroller: !0 = !{!0, !1} !1 = !{!"llvm.loop.unroll.count", i32 4} +'``llvm.loop.disable_nonforced``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata disables all optional loop transformations unless +explicitly instructed using other transformation metdata such as +``llvm.loop.unroll.enable``. That is, no heuristic will try to determine +whether a transformation is profitable. The purpose is to avoid that the +loop is transformed to a different loop before an explicitly requested +(forced) transformation is applied. For instance, loop fusion can make +other transformations impossible. Mandatory loop canonicalizations such +as loop rotation are still applied. + +It is recommended to use this metadata in addition to any llvm.loop.* +transformation directive. Also, any loop should have at most one +directive applied to it (and a sequence of transformations built using +followup-attributes). Otherwise, which transformation will be applied +depends on implementation details such as the pass pipeline order. + +See :ref:`transformation-metadata` for details. + '``llvm.loop.vectorize``' and '``llvm.loop.interleave``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -5171,6 +5189,29 @@ vectorization of the loop. If ``llvm.loop.vectorize.width`` is set to 0 or if the loop does not have this metadata the width will be determined automatically. +'``llvm.loop.vectorize.followup_vectorized``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata defines which loop attributes the vectorized loop will +have. See :ref:`transformation-metadata` for details. + +'``llvm.loop.vectorize.followup_epilogue``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata defines which loop attributes the epilogue will have. The +epilogue is not vectorized and is executed when either the vectorized +loop is not known to preserve semantics (because e.g., it processes two +arrays that are found to alias by a runtime check) or for the last +iterations that do not fill a complete set of vector lanes. See +:ref:`Transformation Metadata ` for details. + +'``llvm.loop.vectorize.followup_all``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Attributes in the metadata will be added to both the vectorized and +epilogue loop. +See :ref:`Transformation Metadata ` for details. + '``llvm.loop.unroll``' ^^^^^^^^^^^^^^^^^^^^^^ @@ -5239,6 +5280,19 @@ For example: !0 = !{!"llvm.loop.unroll.full"} +'``llvm.loop.unroll.followup``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata defines which loop attributes the unrolled loop will have. +See :ref:`Transformation Metadata ` for details. + +'``llvm.loop.unroll.followup_remainder``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata defines which loop attributes the remainder loop after +partial/runtime unrolling will have. See +:ref:`Transformation Metadata ` for details. + '``llvm.loop.unroll_and_jam``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -5292,6 +5346,43 @@ string ``llvm.loop.unroll_and_jam.enable``. For example: !0 = !{!"llvm.loop.unroll_and_jam.enable"} +'``llvm.loop.unroll_and_jam.followup_outer``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata defines which loop attributes the outer unrolled loop will +have. See :ref:`Transformation Metadata ` for +details. + +'``llvm.loop.unroll_and_jam.followup_inner``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata defines which loop attributes the inner jammed loop will +have. See :ref:`Transformation Metadata ` for +details. + +'``llvm.loop.unroll_and_jam.followup_remainder_outer``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata defines which attributes the epilogue of the outer loop +will have. This loop is usually unrolled, meaning there is no such +loop. This attribute will be ignored in this case. See +:ref:`Transformation Metadata ` for details. + +'``llvm.loop.unroll_and_jam.followup_remainder_inner``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata defines which attributes the inner loop of the epilogue +will have. The outer epilogue will usually be unrolled, meaning there +can be multiple inner remainder loops. See +:ref:`Transformation Metadata ` for details. + +'``llvm.loop.unroll_and_jam.followup_all``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Attributes specified in the metadata is added to all +``llvm.loop.unroll_and_jam.*`` loops. See +:ref:`Transformation Metadata ` for details. + '``llvm.loop.licm_versioning.disable``' Metadata ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -5324,6 +5415,34 @@ enabled. A value of 0 disables distribution: This metadata should be used in conjunction with ``llvm.loop`` loop identification metadata. +'``llvm.loop.distribute.followup_coincident``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata defines which attributes extracted loops with no cyclic +dependencies will have (i.e. can be vectorized). See +:ref:`Transformation Metadata ` for details. + +'``llvm.loop.distribute.followup_sequential``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata defines which attributes the isolated loops with unsafe +memory dependencies will have. See +:ref:`Transformation Metadata ` for details. + +'``llvm.loop.distribute.followup_fallback``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If loop versioning is necessary, this metadata defined the attributes +the non-distributed fallback version will have. See +:ref:`Transformation Metadata ` for details. + +'``llvm.loop.distribute.followup_all``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Thes attributes in this metdata is added to all followup loops of the +loop distribution pass. See +:ref:`Transformation Metadata ` for details. + '``llvm.mem``' ^^^^^^^^^^^^^^^ @@ -6790,6 +6909,55 @@ Semantics: The '``unreachable``' instruction has no defined semantics. +.. _unaryops: + +Unary Operations +----------------- + +Unary operators require a single operand, execute an operation on +it, and produce a single value. The operand might represent multiple +data, as is the case with the :ref:`vector ` data type. The +result value has the same type as its operand. + +.. _i_fneg: + +'``fneg``' Instruction +^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + = fneg [fast-math flags]* ; yields ty:result + +Overview: +""""""""" + +The '``fneg``' instruction returns the negation of its operand. + +Arguments: +"""""""""" + +The argument to the '``fneg``' instruction must be a +:ref:`floating-point ` or :ref:`vector ` of +floating-point values. + +Semantics: +"""""""""" + +The value produced is a copy of the operand with its sign bit flipped. +This instruction can also take any number of :ref:`fast-math +flags `, which are optimization hints to enable otherwise +unsafe floating-point optimizations: + +Example: +"""""""" + +.. code-block:: text + + = fneg float %val ; yields float:result = -%var + .. _binaryops: Binary Operations @@ -6965,9 +7133,6 @@ Overview: The '``fsub``' instruction returns the difference of its two operands. -Note that the '``fsub``' instruction is used to represent the '``fneg``' -instruction present in most other intermediate representations. - Arguments: """""""""" @@ -12530,6 +12695,276 @@ Examples: %obit = extractvalue {i32, i1} %res, 1 br i1 %obit, label %overflow, label %normal +Saturation Arithmetic Intrinsics +--------------------------------- + +Saturation arithmetic is a version of arithmetic in which operations are +limited to a fixed range between a minimum and maximum value. If the result of +an operation is greater than the maximum value, the result is set (or +"clamped") to this maximum. If it is below the minimum, it is clamped to this +minimum. + + +'``llvm.sadd.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax +""""""" + +This is an overloaded intrinsic. You can use ``llvm.sadd.sat`` +on any integer bit width or vectors of integers. + +:: + + declare i16 @llvm.sadd.sat.i16(i16 %a, i16 %b) + declare i32 @llvm.sadd.sat.i32(i32 %a, i32 %b) + declare i64 @llvm.sadd.sat.i64(i64 %a, i64 %b) + declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview +""""""""" + +The '``llvm.sadd.sat``' family of intrinsic functions perform signed +saturation addition on the 2 arguments. + +Arguments +"""""""""" + +The arguments (%a and %b) and the result may be of integer types of any bit +width, but they must have the same bit width. ``%a`` and ``%b`` are the two +values that will undergo signed addition. + +Semantics: +"""""""""" + +The maximum value this operation can clamp to is the largest signed value +representable by the bit width of the arguments. The minimum value is the +smallest signed value representable by this bit width. + + +Examples +""""""""" + +.. code-block:: llvm + + %res = call i4 @llvm.sadd.sat.i4(i4 1, i4 2) ; %res = 3 + %res = call i4 @llvm.sadd.sat.i4(i4 5, i4 6) ; %res = 7 + %res = call i4 @llvm.sadd.sat.i4(i4 -4, i4 2) ; %res = -2 + %res = call i4 @llvm.sadd.sat.i4(i4 -4, i4 -5) ; %res = -8 + + +'``llvm.uadd.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax +""""""" + +This is an overloaded intrinsic. You can use ``llvm.uadd.sat`` +on any integer bit width or vectors of integers. + +:: + + declare i16 @llvm.uadd.sat.i16(i16 %a, i16 %b) + declare i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) + declare i64 @llvm.uadd.sat.i64(i64 %a, i64 %b) + declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview +""""""""" + +The '``llvm.uadd.sat``' family of intrinsic functions perform unsigned +saturation addition on the 2 arguments. + +Arguments +"""""""""" + +The arguments (%a and %b) and the result may be of integer types of any bit +width, but they must have the same bit width. ``%a`` and ``%b`` are the two +values that will undergo unsigned addition. + +Semantics: +"""""""""" + +The maximum value this operation can clamp to is the largest unsigned value +representable by the bit width of the arguments. Because this is an unsigned +operation, the result will never saturate towards zero. + + +Examples +""""""""" + +.. code-block:: llvm + + %res = call i4 @llvm.uadd.sat.i4(i4 1, i4 2) ; %res = 3 + %res = call i4 @llvm.uadd.sat.i4(i4 5, i4 6) ; %res = 11 + %res = call i4 @llvm.uadd.sat.i4(i4 8, i4 8) ; %res = 15 + + +'``llvm.ssub.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax +""""""" + +This is an overloaded intrinsic. You can use ``llvm.ssub.sat`` +on any integer bit width or vectors of integers. + +:: + + declare i16 @llvm.ssub.sat.i16(i16 %a, i16 %b) + declare i32 @llvm.ssub.sat.i32(i32 %a, i32 %b) + declare i64 @llvm.ssub.sat.i64(i64 %a, i64 %b) + declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview +""""""""" + +The '``llvm.ssub.sat``' family of intrinsic functions perform signed +saturation subtraction on the 2 arguments. + +Arguments +"""""""""" + +The arguments (%a and %b) and the result may be of integer types of any bit +width, but they must have the same bit width. ``%a`` and ``%b`` are the two +values that will undergo signed subtraction. + +Semantics: +"""""""""" + +The maximum value this operation can clamp to is the largest signed value +representable by the bit width of the arguments. The minimum value is the +smallest signed value representable by this bit width. + + +Examples +""""""""" + +.. code-block:: llvm + + %res = call i4 @llvm.ssub.sat.i4(i4 2, i4 1) ; %res = 1 + %res = call i4 @llvm.ssub.sat.i4(i4 2, i4 6) ; %res = -4 + %res = call i4 @llvm.ssub.sat.i4(i4 -4, i4 5) ; %res = -8 + %res = call i4 @llvm.ssub.sat.i4(i4 4, i4 -5) ; %res = 7 + + +'``llvm.usub.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax +""""""" + +This is an overloaded intrinsic. You can use ``llvm.usub.sat`` +on any integer bit width or vectors of integers. + +:: + + declare i16 @llvm.usub.sat.i16(i16 %a, i16 %b) + declare i32 @llvm.usub.sat.i32(i32 %a, i32 %b) + declare i64 @llvm.usub.sat.i64(i64 %a, i64 %b) + declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview +""""""""" + +The '``llvm.usub.sat``' family of intrinsic functions perform unsigned +saturation subtraction on the 2 arguments. + +Arguments +"""""""""" + +The arguments (%a and %b) and the result may be of integer types of any bit +width, but they must have the same bit width. ``%a`` and ``%b`` are the two +values that will undergo unsigned subtraction. + +Semantics: +"""""""""" + +The minimum value this operation can clamp to is 0, which is the smallest +unsigned value representable by the bit width of the unsigned arguments. +Because this is an unsigned operation, the result will never saturate towards +the largest possible value representable by this bit width. + + +Examples +""""""""" + +.. code-block:: llvm + + %res = call i4 @llvm.usub.sat.i4(i4 2, i4 1) ; %res = 1 + %res = call i4 @llvm.usub.sat.i4(i4 2, i4 6) ; %res = 0 + + +Fixed Point Arithmetic Intrinsics +--------------------------------- + +A fixed point number represents a real data type for a number that has a fixed +number of digits after a radix point (equivalent to the decimal point '.'). +The number of digits after the radix point is referred as the ``scale``. These +are useful for representing fractional values to a specific precision. The +following intrinsics perform fixed point arithmetic operations on 2 operands +of the same scale, specified as the third argument. + + +'``llvm.smul.fix.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax +""""""" + +This is an overloaded intrinsic. You can use ``llvm.smul.fix`` +on any integer bit width or vectors of integers. + +:: + + declare i16 @llvm.smul.fix.i16(i16 %a, i16 %b, i32 %scale) + declare i32 @llvm.smul.fix.i32(i32 %a, i32 %b, i32 %scale) + declare i64 @llvm.smul.fix.i64(i64 %a, i64 %b, i32 %scale) + declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale) + +Overview +""""""""" + +The '``llvm.smul.fix``' family of intrinsic functions perform signed +fixed point multiplication on 2 arguments of the same scale. + +Arguments +"""""""""" + +The arguments (%a and %b) and the result may be of integer types of any bit +width, but they must have the same bit width. ``%a`` and ``%b`` are the two +values that will undergo signed fixed point multiplication. The argument +``%scale`` represents the scale of both operands, and must be a constant +integer. + +Semantics: +"""""""""" + +This operation performs fixed point multiplication on the 2 arguments of a +specified scale. The result will also be returned in the same scale specified +in the third argument. + +If the result value cannot be precisely represented in the given scale, the +value is rounded up or down to the closest representable value. The rounding +direction is unspecified. + +It is undefined behavior if the source value does not fit within the range of +the fixed point type. + + +Examples +""""""""" + +.. code-block:: llvm + + %res = call i4 @llvm.smul.fix.i4(i4 3, i4 2, i32 0) ; %res = 6 (2 x 3 = 6) + %res = call i4 @llvm.smul.fix.i4(i4 3, i4 2, i32 1) ; %res = 3 (1.5 x 1 = 1.5) + %res = call i4 @llvm.smul.fix.i4(i4 3, i4 -2, i32 1) ; %res = -3 (1.5 x -1 = -1.5) + + ; The result in the following could be rounded up to -2 or down to -2.5 + %res = call i4 @llvm.smul.fix.i4(i4 3, i4 -3, i32 1) ; %res = -5 (or -4) (1.5 x -1.5 = -2.25) + + Specialised Arithmetic Intrinsics --------------------------------- @@ -14868,7 +15303,7 @@ Syntax: :: - declare void @llvm.trap() noreturn nounwind + declare void @llvm.trap() cold noreturn nounwind Overview: """"""""" @@ -15362,6 +15797,145 @@ if"); and this allows for "check widening" type optimizations. ``@llvm.experimental.guard`` cannot be invoked. +'``llvm.experimental.widenable.condition``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare i1 @llvm.experimental.widenable.condition() + +Overview: +""""""""" + +This intrinsic represents a "widenable condition" which is +boolean expressions with the following property: whether this +expression is `true` or `false`, the program is correct and +well-defined. + +Together with :ref:`deoptimization operand bundles `, +``@llvm.experimental.widenable.condition`` allows frontends to +express guards or checks on optimistic assumptions made during +compilation and represent them as branch instructions on special +conditions. + +While this may appear similar in semantics to `undef`, it is very +different in that an invocation produces a particular, singular +value. It is also intended to be lowered late, and remain available +for specific optimizations and transforms that can benefit from its +special properties. + +Arguments: +"""""""""" + +None. + +Semantics: +"""""""""" + +The intrinsic ``@llvm.experimental.widenable.condition()`` +returns either `true` or `false`. For each evaluation of a call +to this intrinsic, the program must be valid and correct both if +it returns `true` and if it returns `false`. This allows +transformation passes to replace evaluations of this intrinsic +with either value whenever one is beneficial. + +When used in a branch condition, it allows us to choose between +two alternative correct solutions for the same problem, like +in example below: + +.. code-block:: text + + %cond = call i1 @llvm.experimental.widenable.condition() + br i1 %cond, label %solution_1, label %solution_2 + + label %fast_path: + ; Apply memory-consuming but fast solution for a task. + + label %slow_path: + ; Cheap in memory but slow solution. + +Whether the result of intrinsic's call is `true` or `false`, +it should be correct to pick either solution. We can switch +between them by replacing the result of +``@llvm.experimental.widenable.condition`` with different +`i1` expressions. + +This is how it can be used to represent guards as widenable branches: + +.. code-block:: text + + block: + ; Unguarded instructions + call void @llvm.experimental.guard(i1 %cond, ) ["deopt"()] + ; Guarded instructions + +Can be expressed in an alternative equivalent form of explicit branch using +``@llvm.experimental.widenable.condition``: + +.. code-block:: text + + block: + ; Unguarded instructions + %widenable_condition = call i1 @llvm.experimental.widenable.condition() + %guard_condition = and i1 %cond, %widenable_condition + br i1 %guard_condition, label %guarded, label %deopt + + guarded: + ; Guarded instructions + + deopt: + call type @llvm.experimental.deoptimize() [ "deopt"() ] + +So the block `guarded` is only reachable when `%cond` is `true`, +and it should be valid to go to the block `deopt` whenever `%cond` +is `true` or `false`. + +``@llvm.experimental.widenable.condition`` will never throw, thus +it cannot be invoked. + +Guard widening: +""""""""""""""" + +When ``@llvm.experimental.widenable.condition()`` is used in +condition of a guard represented as explicit branch, it is +legal to widen the guard's condition with any additional +conditions. + +Guard widening looks like replacement of + +.. code-block:: text + + %widenable_cond = call i1 @llvm.experimental.widenable.condition() + %guard_cond = and i1 %cond, %widenable_cond + br i1 %guard_cond, label %guarded, label %deopt + +with + +.. code-block:: text + + %widenable_cond = call i1 @llvm.experimental.widenable.condition() + %new_cond = and i1 %any_other_cond, %widenable_cond + %new_guard_cond = and i1 %cond, %new_cond + br i1 %new_guard_cond, label %guarded, label %deopt + +for this branch. Here `%any_other_cond` is an arbitrarily chosen +well-defined `i1` value. By making guard widening, we may +impose stricter conditions on `guarded` block and bail to the +deopt when the new condition is not met. + +Lowering: +""""""""" + +Default lowering strategy is replacing the result of +call of ``@llvm.experimental.widenable.condition`` with +constant `true`. However it is always correct to replace +it with any other `i1` value. Any pass can +freely do it if it can benefit from non-default lowering. + + '``llvm.load.relative``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -15697,3 +16271,263 @@ lowered to a call to the symbol ``__llvm_memset_element_unordered_atomic_*``. Wh is replaced with an actual element size. The optimizer is allowed to inline the memory assignment when it's profitable to do so. + +Objective-C ARC Runtime Intrinsics +---------------------------------- + +LLVM provides intrinsics that lower to Objective-C ARC runtime entry points. +LLVM is aware of the semantics of these functions, and optimizes based on that +knowledge. You can read more about the details of Objective-C ARC `here +`_. + +'``llvm.objc.autorelease``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.autorelease(i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_autorelease `_. + +'``llvm.objc.autoreleasePoolPop``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare void @llvm.objc.autoreleasePoolPop(i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_autoreleasePoolPop `_. + +'``llvm.objc.autoreleasePoolPush``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.autoreleasePoolPush() + +Lowering: +""""""""" + +Lowers to a call to `objc_autoreleasePoolPush `_. + +'``llvm.objc.autoreleaseReturnValue``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.autoreleaseReturnValue(i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_autoreleaseReturnValue `_. + +'``llvm.objc.copyWeak``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare void @llvm.objc.copyWeak(i8**, i8**) + +Lowering: +""""""""" + +Lowers to a call to `objc_copyWeak `_. + +'``llvm.objc.destroyWeak``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare void @llvm.objc.destroyWeak(i8**) + +Lowering: +""""""""" + +Lowers to a call to `objc_destroyWeak `_. + +'``llvm.objc.initWeak``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.initWeak(i8**, i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_initWeak `_. + +'``llvm.objc.loadWeak``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.loadWeak(i8**) + +Lowering: +""""""""" + +Lowers to a call to `objc_loadWeak `_. + +'``llvm.objc.loadWeakRetained``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.loadWeakRetained(i8**) + +Lowering: +""""""""" + +Lowers to a call to `objc_loadWeakRetained `_. + +'``llvm.objc.moveWeak``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare void @llvm.objc.moveWeak(i8**, i8**) + +Lowering: +""""""""" + +Lowers to a call to `objc_moveWeak `_. + +'``llvm.objc.release``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare void @llvm.objc.release(i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_release `_. + +'``llvm.objc.retain``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.retain(i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_retain `_. + +'``llvm.objc.retainAutorelease``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.retainAutorelease(i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_retainAutorelease `_. + +'``llvm.objc.retainAutoreleaseReturnValue``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.retainAutoreleaseReturnValue(i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_retainAutoreleaseReturnValue `_. + +'``llvm.objc.retainAutoreleasedReturnValue``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_retainAutoreleasedReturnValue `_. + +'``llvm.objc.retainBlock``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.retainBlock(i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_retainBlock `_. + +'``llvm.objc.storeStrong``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare void @llvm.objc.storeStrong(i8**, i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_storeStrong `_. + +'``llvm.objc.storeWeak``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +:: + + declare i8* @llvm.objc.storeWeak(i8**, i8*) + +Lowering: +""""""""" + +Lowers to a call to `objc_storeWeak `_. diff --git a/docs/MIRLangRef.rst b/docs/MIRLangRef.rst index 9a7e18c091c5ae9cd41ad8fa11d91e1cb9c3f735..ca6c0107fa66e6d1f89ea60de6514264c268ced4 100644 --- a/docs/MIRLangRef.rst +++ b/docs/MIRLangRef.rst @@ -60,6 +60,11 @@ runs just before the pass that we are trying to test: ``llc -stop-after=machine-cp bug-trigger.ll > test.mir`` +If the same pass is run multiple times, a run index can be included +after the name with a comma. + + ``llc -stop-after=dead-mi-elimination,1 bug-trigger.ll > test.mir`` + After generating the input MIR file, you'll have to add a run line that uses the ``-run-pass`` option to it. In order to test the post register allocation pseudo instruction expansion pass on X86-64, a run line like the one shown diff --git a/docs/Passes.rst b/docs/Passes.rst index 9ab214984d2d91a1dde3007d64b21976903c4e63..81bd8acfc2dad49d4299528ab8a29a4638da0084 100644 --- a/docs/Passes.rst +++ b/docs/Passes.rst @@ -355,6 +355,15 @@ between different iterations. ``ScalarEvolution`` has a more complete understanding of pointer arithmetic than ``BasicAliasAnalysis``' collection of ad-hoc analyses. +``-stack-safety``: Stack Safety Analysis +------------------------------------------------ + +The ``StackSafety`` analysis can be used to determine if stack allocated +variables can be considered safe from memory access bugs. + +This analysis' primary purpose is to be used by sanitizers to avoid unnecessary +instrumentation of safe variables. + ``-targetdata``: Target Data Layout ----------------------------------- @@ -1215,3 +1224,8 @@ Displays the post dominator tree using the GraphViz tool. Displays the post dominator tree using the GraphViz tool, but omitting function bodies. +``-transform-warning``: Report missed forced transformations +------------------------------------------------------------ + +Emits warnings about not yet applied forced transformations (e.g. from +``#pragma omp simd``). diff --git a/docs/Proposals/TestSuite.rst b/docs/Proposals/TestSuite.rst index 8c7531783d44bfe44a8d619d888701a7536e8e65..85f3642a2e922c63fb84ce49f93402bb3e130485 100644 --- a/docs/Proposals/TestSuite.rst +++ b/docs/Proposals/TestSuite.rst @@ -259,6 +259,10 @@ https://github.com/darktable-org/rawspeed Its test dataset is 756 MB in size, which is too large to be included into the test-suite repository. +C++ Performance Benchmarks +-------------------------- +https://gitlab.com/chriscox/CppPerformanceBenchmarks + Generic Algorithms ================== diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 0da765a302776ea1915b4352f4f3fe341f93b0d4..a3500ba3f743f502929ab726c5c7b36792af1aeb 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -40,7 +40,8 @@ Non-comprehensive list of changes in this release functionality, or simply have a lot to talk about), see the `NOTE` below for adding a new subsection. -* Note.. +* The **llvm-cov** tool can now export lcov trace files using the + `-format=lcov` option of the `export` command. .. NOTE If you would like to document a larger change, then you can add a diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst index 4cccfb40bd4d1eb69be1f7cdf07a455d0b5bddfc..dab300ac7b76078843ecf5fbe22c6b0821a9cfe8 100644 --- a/docs/SourceLevelDebugging.rst +++ b/docs/SourceLevelDebugging.rst @@ -1661,4 +1661,4 @@ are correctly pointing to the ``[[C]]`` and ``[[D]]`` variables. for a ``DILocation`` to have a specific line number, and someone later adds an instruction before the one we check the test will fail. In the cases this can't be avoided (say, if a test wouldn't be precise enough), moving the - test to it's own file is preferred. + test to its own file is preferred. diff --git a/docs/StackMaps.rst b/docs/StackMaps.rst index 99c5e5fbe4de13bf3df4c4bc02af60f84ef360f0..2ceb408097a718dee820ca6574d5d3fcb28bcfb5 100644 --- a/docs/StackMaps.rst +++ b/docs/StackMaps.rst @@ -427,8 +427,11 @@ this section, it invokes the callback and passes the section name. The JIT can record the in-memory address of the section at this time and later parse it to recover the stack map data. -On Darwin, the stack map section name is "__llvm_stackmaps". The -segment name is "__LLVM_STACKMAPS". +For MachO (e.g. on Darwin), the stack map section name is +"__llvm_stackmaps". The segment name is "__LLVM_STACKMAPS". + +For ELF (e.g. on Linux), the stack map section name is +".llvm_stackmaps". The segment name is "__LLVM_STACKMAPS". Stack Map Usage =============== diff --git a/docs/StackSafetyAnalysis.rst b/docs/StackSafetyAnalysis.rst new file mode 100644 index 0000000000000000000000000000000000000000..8573b7a2bb97fc5178c97ac019e222a7595a36ce --- /dev/null +++ b/docs/StackSafetyAnalysis.rst @@ -0,0 +1,56 @@ +================================== +Stack Safety Analysis +================================== + + +Introduction +============ + +The Stack Safety Analysis determines if stack allocated variables can be +considered 'safe' from memory access bugs. + +The primary purpose of the analysis is to be used by sanitizers to avoid +unnecessary instrumentation of 'safe' variables. SafeStack is going to be the +first user. + +'safe' variables can be defined as variables that can not be used out-of-scope +(e.g. use-after-return) or accessed out of bounds. In the future it can be +extended to track other variable properties. E.g. we plan to extend +implementation with a check to make sure that variable is always initialized +before every read to optimize use-of-uninitialized-memory checks. + +How it works +============ + +The analysis is implemented in two stages: + +The intra-procedural, or 'local', stage performs a depth-first search inside +functions to collect all uses of each alloca, including loads/stores and uses as +arguments functions. After this stage we know which parts of the alloca are used +by functions itself but we don't know what happens after it is passed as +an argument to another function. + +The inter-procedural, or 'global', stage, resolves what happens to allocas after +they are passed as function arguments. This stage performs a depth-first search +on function calls inside a single module and propagates allocas usage through +functions calls. + +When used with ThinLTO, the global stage performs a whole program analysis over +the Module Summary Index. + +Testing +======= + +The analysis is covered with lit tests. + +We expect that users can tolerate false classification of variables as +'unsafe' when in-fact it's 'safe'. This may lead to inefficient code. However, we +can't accept false 'safe' classification which may cause sanitizers to miss actual +bugs in instrumented code. To avoid that we want additional validation tool. + +AddressSanitizer may help with this validation. We can instrument all variables +as usual but additionally store stack-safe information in the +``ASanStackVariableDescription``. Then if AddressSanitizer detects a bug on +a 'safe' variable we can produce an additional report to let the user know that +probably Stack Safety Analysis failed and we should check for a bug in the +compiler. diff --git a/docs/Statepoints.rst b/docs/Statepoints.rst index 6ed4e46b73bcb760a820b02c744dd640ca3f60c4..3832a43b287ef98eea1ac29aa5aace60517f8b77 100644 --- a/docs/Statepoints.rst +++ b/docs/Statepoints.rst @@ -26,8 +26,8 @@ historical interest at this point with one exception - its implementation of shadow stacks has been used successfully by a number of language frontends and is still supported. -Overview -======== +Overview & Core Concepts +======================== To collect dead objects, garbage collectors must be able to identify any references to objects contained within executing code, and, @@ -93,7 +93,10 @@ the collector must be able to: This document describes the mechanism by which an LLVM based compiler can provide this information to a language runtime/collector, and -ensure that all pointers can be read and updated if desired. +ensure that all pointers can be read and updated if desired. + +Abstract Machine Model +^^^^^^^^^^^^^^^^^^^^^^^ At a high level, LLVM has been extended to support compiling to an abstract machine which extends the actual target with a non-integral pointer type @@ -103,10 +106,16 @@ integer representation. This semantic quirk allows the runtime to pick a integer mapping for each point in the program allowing relocations of objects without visible effects. -Warning: Non-Integral Pointer Types are a newly added concept in LLVM IR. -It's possible that we've missed disabling some of the optimizations which -assume an integral value for pointers. If you find such a case, please -file a bug or share a patch. +This high level abstract machine model is used for most of the optimizer. As +a result, transform passes do not need to be extended to look through explicit +relocation sequence. Before starting code generation, we switch +representations to an explicit form. The exact location chosen for lowering +is an implementation detail. + +Note that most of the value of the abstract machine model comes for collectors +which need to model potentially relocatable objects. For a compiler which +supports only a non-relocating collector, you may wish to consider starting +with the fully explicit form. Warning: There is one currently known semantic hole in the definition of non-integral pointers which has not been addressed upstream. To work around @@ -116,10 +125,13 @@ not safe to speculate a load if doing causes a non-integral pointer value to be loaded as any other type or vice versa. In practice, this restriction is well isolated to isSafeToSpeculate in ValueTracking.cpp. -This high level abstract machine model is used for most of the LLVM optimizer. -Before starting code generation, we switch representations to an explicit form. -In theory, a frontend could directly generate this low level explicit form, but -doing so is likely to inhibit optimization. +Explicit Representation +^^^^^^^^^^^^^^^^^^^^^^^ + +A frontend could directly generate this low level explicit form, but +doing so may inhibit optimization. Instead, it is recommended that +compilers with relocating collectors target the abstract machine model just +described. The heart of the explicit approach is to construct (or rewrite) the IR in a manner where the possible updates performed by the garbage collector are @@ -242,6 +254,56 @@ following command. opt -rewrite-statepoints-for-gc test/Transforms/RewriteStatepointsForGC/basics.ll -S | llc -debug-only=stackmaps +Simplifications for Non-Relocating GCs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some of the complexity in the previous example is unnecessary for a +non-relocating collector. While a non-relocating collector still needs the +information about which location contain live references, it doesn't need to +represent explicit relocations. As such, the previously described explicit +lowering can be simplified to remove all of the ``gc.relocate`` intrinsic +calls and leave uses in terms of the original reference value. + +Here's the explicit lowering for the previous example for a non-relocating +collector: + +.. code-block:: llvm + + define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) + gc "statepoint-example" { + call token (i64, i32, void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %obj) + ret i8 addrspace(1)* %obj + } + +Recording On Stack Regions +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In addition to the explicit relocation form previously described, the +statepoint infrastructure also allows the listing of allocas within the gc +pointer list. Allocas can be listed with or without additional explicit gc +pointer values and relocations. + +An alloca in the gc region of the statepoint operand list will cause the +address of the stack region to be listed in the stackmap for the statepoint. + +This mechanism can be used to describe explicit spill slots if desired. It +then becomes the generator's responsibility to ensure that values are +spill/filled to/from the alloca as needed on either side of the safepoint. +Note that there is no way to indicate a corresponding base pointer for such +an explicitly specified spill slot, so usage is restricted to values for +which the associated collector can derive the object base from the pointer +itself. + +This mechanism can be used to describe on stack objects containing +references provided that the collector can map from the location on the +stack to a heap map describing the internal layout of the references the +collector needs to process. + +WARNING: At the moment, this alternate form is not well exercised. It is +recommended to use this with caution and expect to have to fix a few bugs. +In particular, the RewriteStatepointsForGC utility pass does not do +anything for allocas today. + Base & Derived Pointers ^^^^^^^^^^^^^^^^^^^^^^^ @@ -590,8 +652,15 @@ Stack Map Format ================ Locations for each pointer value which may need read and/or updated by -the runtime or collector are provided via the :ref:`Stack Map format -` specified in the PatchPoint documentation. +the runtime or collector are provided in a separate section of the +generated object file as specified in the PatchPoint documentation. +This special section is encoded per the +:ref:`Stack Map format `. + +The general expectation is that a JIT compiler will parse and discard this +format; it is not particularly memory efficient. If you need an alternate +format (e.g. for an ahead of time compiler), see discussion under +:ref: `open work items ` below. Each statepoint generates the following Locations: @@ -831,35 +900,73 @@ Supported Architectures ======================= Support for statepoint generation requires some code for each backend. -Today, only X86_64 is supported. - -Problem Areas and Active Work -============================= - -#. Support for languages which allow unmanaged pointers to garbage collected - objects (i.e. pass a pointer to an object to a C routine) via pinning. - -#. Support for garbage collected objects allocated on the stack. Specifically, - allocas are always assumed to be in address space 0 and we need a - cast/promotion operator to let rewriting identify them. - -#. The current statepoint lowering is known to be somewhat poor. In the very - long term, we'd like to integrate statepoints with the register allocator; - in the near term this is unlikely to happen. We've found the quality of - lowering to be relatively unimportant as hot-statepoints are almost always - inliner bugs. - -#. Concerns have been raised that the statepoint representation results in a - large amount of IR being produced for some examples and that this - contributes to higher than expected memory usage and compile times. There's - no immediate plans to make changes due to this, but alternate models may be - explored in the future. - -#. Relocations along exceptional paths are currently broken in ToT. In - particular, there is current no way to represent a rethrow on a path which - also has relocations. See `this llvm-dev discussion - `_ for more - detail. +Today, only X86_64 is supported. + +.. _OpenWork: + +Limitations and Half Baked Ideas +================================ + +Mixing References and Raw Pointers +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Support for languages which allow unmanaged pointers to garbage collected +objects (i.e. pass a pointer to an object to a C routine) in the abstract +machine model. At the moment, the best idea on how to approach this +involves an intrinsic or opaque function which hides the connection between +the reference value and the raw pointer. The problem is that having a +ptrtoint or inttoptr cast (which is common for such use cases) breaks the +rules used for inferring base pointers for arbitrary references when +lowering out of the abstract model to the explicit physical model. Note +that a frontend which lowers directly to the physical model doesn't have +any problems here. + +Objects on the Stack +^^^^^^^^^^^^^^^^^^^^ + +As noted above, the explicit lowering supports objects allocated on the +stack provided the collector can find a heap map given the stack address. + +The missing pieces are a) integration with rewriting (RS4GC) from the +abstract machine model and b) support for optionally decomposing on stack +objects so as not to require heap maps for them. The later is required +for ease of integration with some collectors. + +Lowering Quality and Representation Overhead +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The current statepoint lowering is known to be somewhat poor. In the very +long term, we'd like to integrate statepoints with the register allocator; +in the near term this is unlikely to happen. We've found the quality of +lowering to be relatively unimportant as hot-statepoints are almost always +inliner bugs. + +Concerns have been raised that the statepoint representation results in a +large amount of IR being produced for some examples and that this +contributes to higher than expected memory usage and compile times. There's +no immediate plans to make changes due to this, but alternate models may be +explored in the future. + +Relocations Along Exceptional Edges +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Relocations along exceptional paths are currently broken in ToT. In +particular, there is current no way to represent a rethrow on a path which +also has relocations. See `this llvm-dev discussion +`_ for more +detail. + +Support for alternate stackmap formats +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For some use cases, it is +desirable to directly encode a final memory efficient stackmap format for +use by the runtime. This is particularly relevant for ahead of time +compilers which wish to directly link object files without the need for +post processing of each individual object file. While not implemented +today for statepoints, there is precedent for a GCStrategy to be able to +select a customer GCMetataPrinter for this purpose. Patches to enable +this functionality upstream are welcome. Bugs and Enhancements ===================== diff --git a/docs/TableGen/LangRef.rst b/docs/TableGen/LangRef.rst index 439d646034ada188c4baa085e0204c1d935bb8db..2efee12ec9d5b991814cde6bafba0be76eaa96c5 100644 --- a/docs/TableGen/LangRef.rst +++ b/docs/TableGen/LangRef.rst @@ -33,7 +33,7 @@ Lexical Analysis ================ TableGen supports BCPL (``// ...``) and nestable C-style (``/* ... */``) -comments. +comments. TableGen also provides simple `Preprocessing Support`_. The following is a listing of the basic punctuation tokens:: @@ -448,3 +448,50 @@ applied at the end of parsing the base classes of a record. BaseMultiClassList: `MultiClassID` ("," `MultiClassID`)* MultiClassID: `TokIdentifier` MultiClassObject: `Def` | `Defm` | `Let` | `Foreach` + +Preprocessing Support +===================== + +TableGen's embedded preprocessor is only intended for conditional compilation. +It supports the following directives: + +.. productionlist:: + LineBegin: ^ + LineEnd: "\n" | "\r" | EOF + WhiteSpace: " " | "\t" + CStyleComment: "/*" (.* - "*/") "*/" + BCPLComment: "//" (.* - `LineEnd`) `LineEnd` + WhiteSpaceOrCStyleComment: `WhiteSpace` | `CStyleComment` + WhiteSpaceOrAnyComment: `WhiteSpace` | `CStyleComment` | `BCPLComment` + MacroName: `ualpha` (`ualpha` | "0"..."9")* + PrepDefine: `LineBegin` (`WhiteSpaceOrCStyleComment`)* + : "#define" (`WhiteSpace`)+ `MacroName` + : (`WhiteSpaceOrAnyComment`)* `LineEnd` + PrepIfdef: `LineBegin` (`WhiteSpaceOrCStyleComment`)* + : "#ifdef" (`WhiteSpace`)+ `MacroName` + : (`WhiteSpaceOrAnyComment`)* `LineEnd` + PrepElse: `LineBegin` (`WhiteSpaceOrCStyleComment`)* + : "#else" (`WhiteSpaceOrAnyComment`)* `LineEnd` + PrepEndif: `LineBegin` (`WhiteSpaceOrCStyleComment`)* + : "#endif" (`WhiteSpaceOrAnyComment`)* `LineEnd` + PrepRegContentException: `PredIfdef` | `PredElse` | `PredEndif` | EOF + PrepRegion: .* - `PrepRegContentException` + :| `PrepIfDef` + : (`PrepRegion`)* + : [`PrepElse`] + : (`PrepRegion`)* + : `PrepEndif` + +:token:`PrepRegion` may occur anywhere in a TD file, as long as it matches +the grammar specification. + +:token:`PrepDefine` allows defining a :token:`MacroName` so that any following +:token:`PrepIfdef` - :token:`PrepElse` preprocessing region part and +:token:`PrepIfdef` - :token:`PrepEndif` preprocessing region +are enabled for TableGen tokens parsing. + +A preprocessing region, starting (i.e. having its :token:`PrepIfdef`) in a file, +must end (i.e. have its :token:`PrepEndif`) in the same file. + +A :token:`MacroName` may be defined externally by using ``{ -D }`` +option of TableGen. diff --git a/docs/TransformMetadata.rst b/docs/TransformMetadata.rst new file mode 100644 index 0000000000000000000000000000000000000000..68649424b717c0bdd5f72ee71d2b7329d5ea8213 --- /dev/null +++ b/docs/TransformMetadata.rst @@ -0,0 +1,441 @@ +.. _transformation-metadata: + +============================ +Code Transformation Metadata +============================ + +.. contents:: + :local: + +Overview +======== + +LLVM transformation passes can be controlled by attaching metadata to +the code to transform. By default, transformation passes use heuristics +to determine whether or not to perform transformations, and when doing +so, other details of how the transformations are applied (e.g., which +vectorization factor to select). +Unless the optimizer is otherwise directed, transformations are applied +conservatively. This conservatism generally allows the optimizer to +avoid unprofitable transformations, but in practice, this results in the +optimizer not applying transformations that would be highly profitable. + +Frontends can give additional hints to LLVM passes on which +transformations they should apply. This can be additional knowledge that +cannot be derived from the emitted IR, or directives passed from the +user/programmer. OpenMP pragmas are an example of the latter. + +If any such metadata is dropped from the program, the code's semantics +must not change. + +Metadata on Loops +================= + +Attributes can be attached to loops as described in :ref:`llvm.loop`. +Attributes can describe properties of the loop, disable transformations, +force specific transformations and set transformation options. + +Because metadata nodes are immutable (with the exception of +``MDNode::replaceOperandWith`` which is dangerous to use on uniqued +metadata), in order to add or remove a loop attributes, a new ``MDNode`` +must be created and assigned as the new ``llvm.loop`` metadata. Any +connection between the old ``MDNode`` and the loop is lost. The +``llvm.loop`` node is also used as LoopID (``Loop::getLoopID()``), i.e. +the loop effectively gets a new identifier. For instance, +``llvm.mem.parallel_loop_access`` references the LoopID. Therefore, if +the parallel access property is to be preserved after adding/removing +loop attributes, any ``llvm.mem.parallel_loop_access`` reference must be +updated to the new LoopID. + +Transformation Metadata Structure +================================= + +Some attributes describe code transformations (unrolling, vectorizing, +loop distribution, etc.). They can either be a hint to the optimizer +that a transformation might be beneficial, instruction to use a specific +option, , or convey a specific request from the user (such as +``#pragma clang loop`` or ``#pragma omp simd``). + +If a transformation is forced but cannot be carried-out for any reason, +an optimization-missed warning must be emitted. Semantic information +such as a transformation being safe (e.g. +``llvm.mem.parallel_loop_access``) can be unused by the optimizer +without generating a warning. + +Unless explicitly disabled, any optimization pass may heuristically +determine whether a transformation is beneficial and apply it. If +metadata for another transformation was specified, applying a different +transformation before it might be inadvertent due to being applied on a +different loop or the loop not existing anymore. To avoid having to +explicitly disable an unknown number of passes, the attribute +``llvm.loop.disable_nonforced`` disables all optional, high-level, +restructuring transformations. + +The following example avoids the loop being altered before being +vectorized, for instance being unrolled. + +.. code-block:: llvm + + br i1 %exitcond, label %for.exit, label %for.header, !llvm.loop !0 + ... + !0 = distinct !{!0, !1, !2} + !1 = !{!"llvm.loop.vectorize.enable", i1 true} + !2 = !{!"llvm.loop.disable_nonforced"} + +After a transformation is applied, follow-up attributes are set on the +transformed and/or new loop(s). This allows additional attributes +including followup-transformations to be specified. Specifying multiple +transformations in the same metadata node is possible for compatibility +reasons, but their execution order is undefined. For instance, when +``llvm.loop.vectorize.enable`` and ``llvm.loop.unroll.enable`` are +specified at the same time, unrolling may occur either before or after +vectorization. + +As an example, the following instructs a loop to be vectorized and only +then unrolled. + +.. code-block:: llvm + + !0 = distinct !{!0, !1, !2, !3} + !1 = !{!"llvm.loop.vectorize.enable", i1 true} + !2 = !{!"llvm.loop.disable_nonforced"} + !3 = !{!"llvm.loop.vectorize.followup_vectorized", !{"llvm.loop.unroll.enable"}} + +If, and only if, no followup is specified, the pass may add attributes itself. +For instance, the vectorizer adds a ``llvm.loop.isvectorized`` attribute and +all attributes from the original loop excluding its loop vectorizer +attributes. To avoid this, an empty followup attribute can be used, e.g. + +.. code-block:: llvm + + !3 = !{!"llvm.loop.vectorize.followup_vectorized"} + +The followup attributes of a transformation that cannot be applied will +never be added to a loop and are therefore effectively ignored. This means +that any followup-transformation in such attributes requires that its +prior transformations are applied before the followup-transformation. +The user should receive a warning about the first transformation in the +transformation chain that could not be applied if it a forced +transformation. All following transformations are skipped. + +Pass-Specific Transformation Metadata +===================================== + +Transformation options are specific to each transformation. In the +following, we present the model for each LLVM loop optimization pass and +the metadata to influence them. + +Loop Vectorization and Interleaving +----------------------------------- + +Loop vectorization and interleaving is interpreted as a single +transformation. It is interpreted as forced if +``!{"llvm.loop.vectorize.enable", i1 true}`` is set. + +Assuming the pre-vectorization loop is + +.. code-block:: c + + for (int i = 0; i < n; i+=1) // original loop + Stmt(i); + +then the code after vectorization will be approximately (assuming an +SIMD width of 4): + +.. code-block:: c + + int i = 0; + if (rtc) { + for (; i + 3 < n; i+=4) // vectorized/interleaved loop + Stmt(i:i+3); + } + for (; i < n; i+=1) // epilogue loop + Stmt(i); + +where ``rtc`` is a generated runtime check. + +``llvm.loop.vectorize.followup_vectorized`` will set the attributes for +the vectorized loop. If not specified, ``llvm.loop.isvectorized`` is +combined with the original loop's attributes to avoid it being +vectorized multiple times. + +``llvm.loop.vectorize.followup_epilogue`` will set the attributes for +the remainder loop. If not specified, it will have the original loop's +attributes combined with ``llvm.loop.isvectorized`` and +``llvm.loop.unroll.runtime.disable`` (unless the original loop already +has unroll metadata). + +The attributes specified by ``llvm.loop.vectorize.followup_all`` are +added to both loops. + +When using a follow-up attribute, it replaces any automatically deduced +attributes for the generated loop in question. Therefore it is +recommended to add ``llvm.loop.isvectorized`` to +``llvm.loop.vectorize.followup_all`` which avoids that the loop +vectorizer tries to optimize the loops again. + +Loop Unrolling +-------------- + +Unrolling is interpreted as forced any ``!{!"llvm.loop.unroll.enable"}`` +metadata or option (``llvm.loop.unroll.count``, ``llvm.loop.unroll.full``) +is present. Unrolling can be full unrolling, partial unrolling of a loop +with constant trip count or runtime unrolling of a loop with a trip +count unknown at compile-time. + +If the loop has been unrolled fully, there is no followup-loop. For +partial/runtime unrolling, the original loop of + +.. code-block:: c + + for (int i = 0; i < n; i+=1) // original loop + Stmt(i); + +is transformed into (using an unroll factor of 4): + +.. code-block:: c + + int i = 0; + for (; i + 3 < n; i+=4) // unrolled loop + Stmt(i); + Stmt(i+1); + Stmt(i+2); + Stmt(i+3); + } + for (; i < n; i+=1) // remainder loop + Stmt(i); + +``llvm.loop.unroll.followup_unrolled`` will set the loop attributes of +the unrolled loop. If not specified, the attributes of the original loop +without the ``llvm.loop.unroll.*`` attributes are copied and +``llvm.loop.unroll.disable`` added to it. + +``llvm.loop.unroll.followup_remainder`` defines the attributes of the +remainder loop. If not specified the remainder loop will have no +attributes. The remainder loop might not be present due to being fully +unrolled in which case this attribute has no effect. + +Attributes defined in ``llvm.loop.unroll.followup_all`` are added to the +unrolled and remainder loops. + +To avoid that the partially unrolled loop is unrolled again, it is +recommended to add ``llvm.loop.unroll.disable`` to +``llvm.loop.unroll.followup_all``. If no follow-up attribute specified +for a generated loop, it is added automatically. + +Unroll-And-Jam +-------------- + +Unroll-and-jam uses the following transformation model (here with an +unroll factor if 2). Currently, it does not support a fallback version +when the transformation is unsafe. + +.. code-block:: c + + for (int i = 0; i < n; i+=1) { // original outer loop + Fore(i); + for (int j = 0; j < m; j+=1) // original inner loop + SubLoop(i, j); + Aft(i); + } + +.. code-block:: c + + int i = 0; + for (; i + 1 < n; i+=2) { // unrolled outer loop + Fore(i); + Fore(i+1); + for (int j = 0; j < m; j+=1) { // unrolled inner loop + SubLoop(i, j); + SubLoop(i+1, j); + } + Aft(i); + Aft(i+1); + } + for (; i < n; i+=1) { // remainder outer loop + Fore(i); + for (int j = 0; j < m; j+=1) // remainder inner loop + SubLoop(i, j); + Aft(i); + } + +``llvm.loop.unroll_and_jam.followup_outer`` will set the loop attributes +of the unrolled outer loop. If not specified, the attributes of the +original outer loop without the ``llvm.loop.unroll.*`` attributes are +copied and ``llvm.loop.unroll.disable`` added to it. + +``llvm.loop.unroll_and_jam.followup_inner`` will set the loop attributes +of the unrolled inner loop. If not specified, the attributes of the +original inner loop are used unchanged. + +``llvm.loop.unroll_and_jam.followup_remainder_outer`` sets the loop +attributes of the outer remainder loop. If not specified it will not +have any attributes. The remainder loop might not be present due to +being fully unrolled. + +``llvm.loop.unroll_and_jam.followup_remainder_inner`` sets the loop +attributes of the inner remainder loop. If not specified it will have +the attributes of the original inner loop. It the outer remainder loop +is unrolled, the inner remainder loop might be present multiple times. + +Attributes defined in ``llvm.loop.unroll_and_jam.followup_all`` are +added to all of the aforementioned output loops. + +To avoid that the unrolled loop is unrolled again, it is +recommended to add ``llvm.loop.unroll.disable`` to +``llvm.loop.unroll_and_jam.followup_all``. It suppresses unroll-and-jam +as well as an additional inner loop unrolling. If no follow-up +attribute specified for a generated loop, it is added automatically. + +Loop Distribution +----------------- + +The LoopDistribution pass tries to separate vectorizable parts of a loop +from the non-vectorizable part (which otherwise would make the entire +loop non-vectorizable). Conceptually, it transforms a loop such as + +.. code-block:: c + + for (int i = 1; i < n; i+=1) { // original loop + A[i] = i; + B[i] = 2 + B[i]; + C[i] = 3 + C[i - 1]; + } + +into the following code: + +.. code-block:: c + + if (rtc) { + for (int i = 1; i < n; i+=1) // coincident loop + A[i] = i; + for (int i = 1; i < n; i+=1) // coincident loop + B[i] = 2 + B[i]; + for (int i = 1; i < n; i+=1) // sequential loop + C[i] = 3 + C[i - 1]; + } else { + for (int i = 1; i < n; i+=1) { // fallback loop + A[i] = i; + B[i] = 2 + B[i]; + C[i] = 3 + C[i - 1]; + } + } + +where ``rtc`` is a generated runtime check. + +``llvm.loop.distribute.followup_coincident`` sets the loop attributes of +all loops without loop-carried dependencies (i.e. vectorizable loops). +There might be more than one such loops. If not defined, the loops will +inherit the original loop's attributes. + +``llvm.loop.distribute.followup_sequential`` sets the loop attributes of the +loop with potentially unsafe dependencies. There should be at most one +such loop. If not defined, the loop will inherit the original loop's +attributes. + +``llvm.loop.distribute.followup_fallback`` defines the loop attributes +for the fallback loop, which is a copy of the original loop for when +loop versioning is required. If undefined, the fallback loop inherits +all attributes from the original loop. + +Attributes defined in ``llvm.loop.distribute.followup_all`` are added to +all of the aforementioned output loops. + +It is recommended to add ``llvm.loop.disable_nonforced`` to +``llvm.loop.distribute.followup_fallback``. This avoids that the +fallback version (which is likely never executed) is further optimzed +which would increase the code size. + +Versioning LICM +--------------- + +The pass hoists code out of loops that are only loop-invariant when +dynamic conditions apply. For instance, it transforms the loop + +.. code-block:: c + + for (int i = 0; i < n; i+=1) // original loop + A[i] = B[0]; + +into: + +.. code-block:: c + + if (rtc) { + auto b = B[0]; + for (int i = 0; i < n; i+=1) // versioned loop + A[i] = b; + } else { + for (int i = 0; i < n; i+=1) // unversioned loop + A[i] = B[0]; + } + +The runtime condition (``rtc``) checks that the array ``A`` and the +element `B[0]` do not alias. + +Currently, this transformation does not support followup-attributes. + +Loop Interchange +---------------- + +Currently, the ``LoopInterchange`` pass does not use any metadata. + +Ambiguous Transformation Order +============================== + +If there multiple transformations defined, the order in which they are +executed depends on the order in LLVM's pass pipeline, which is subject +to change. The default optimization pipeline (anything higher than +``-O0``) has the following order. + +When using the legacy pass manager: + + - LoopInterchange (if enabled) + - SimpleLoopUnroll/LoopFullUnroll (only performs full unrolling) + - VersioningLICM (if enabled) + - LoopDistribute + - LoopVectorizer + - LoopUnrollAndJam (if enabled) + - LoopUnroll (partial and runtime unrolling) + +When using the legacy pass manager with LTO: + + - LoopInterchange (if enabled) + - SimpleLoopUnroll/LoopFullUnroll (only performs full unrolling) + - LoopVectorizer + - LoopUnroll (partial and runtime unrolling) + +When using the new pass manager: + + - SimpleLoopUnroll/LoopFullUnroll (only performs full unrolling) + - LoopDistribute + - LoopVectorizer + - LoopUnrollAndJam (if enabled) + - LoopUnroll (partial and runtime unrolling) + +Leftover Transformations +======================== + +Forced transformations that have not been applied after the last +transformation pass should be reported to the user. The transformation +passes themselves cannot be responsible for this reporting because they +might not be in the pipeline, there might be multiple passes able to +apply a transformation (e.g. ``LoopInterchange`` and Polly) or a +transformation attribute may be 'hidden' inside another passes' followup +attribute. + +The pass ``-transform-warning`` (``WarnMissedTransformationsPass``) +emits such warnings. It should be placed after the last transformation +pass. + +The current pass pipeline has a fixed order in which transformations +passes are executed. A transformation can be in the followup of a pass +that is executed later and thus leftover. For instance, a loop nest +cannot be distributed and then interchanged with the current pass +pipeline. The loop distribution will execute, but there is no loop +interchange pass following such that any loop interchange metadata will +be ignored. The ``-transform-warning`` should emit a warning in this +case. + +Future versions of LLVM may fix this by executing transformations using +a dynamic ordering. diff --git a/docs/index.rst b/docs/index.rst index df70de095bd94761c7d8c01ce6d18dfe609765b4..698e6764bfb3ab9f67022dd8b249265bd9a65407 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -292,6 +292,7 @@ For API clients and LLVM developers. Statepoints MergeFunctions TypeMetadata + TransformMetadata FaultMaps MIRLangRef Coroutines @@ -302,6 +303,7 @@ For API clients and LLVM developers. PDB/index CFIVerify SpeculativeLoadHardening + StackSafetyAnalysis :doc:`WritingAnLLVMPass` Information on how to write LLVM transformations and analyses. @@ -438,6 +440,10 @@ For API clients and LLVM developers. :doc:`SpeculativeLoadHardening` A description of the Speculative Load Hardening mitigation for Spectre v1. +:doc:`StackSafetyAnalysis` + This document describes the design of the stack safety analysis of local + variables. + Development Process Documentation ================================= diff --git a/docs/llvm-objdump.1 b/docs/llvm-objdump.1 new file mode 100644 index 0000000000000000000000000000000000000000..72d1b93f6bb37cb7038dbc356ffbc5e18765d44e --- /dev/null +++ b/docs/llvm-objdump.1 @@ -0,0 +1,197 @@ +.\" This file is distributed under the University of Illinois Open Source +.\" License. See LICENSE.TXT for details. +.\" +.Dd December 19, 2018 +.Dt LLVM-OBJDUMP 1 +.Os +.Sh NAME +.Nm llvm-objdump +.Nd LLVM object file dumper +.Sh SYNOPSIS +.Nm llvm-objdump +.Op Ar options +.Ar objfile ... +.Sh DESCRIPTION +.Nm +prints the contents of object files and final linked images named on the +command line. +If no file name is specified, +.Nm +will attempt to read from +.Pa a.out . +If +.Pa - +is used as a file name, +.Nm +will process a file on its standard input stream. +.Nm +accepts many of the same command line arguments as GNU objdump. +.Sh OPTIONS +.Ss General Options +.Bl -tag -width indent +.It Fl -aarch64-neon-syntax Ns = Ns Ar value +Choose style of NEON code to emit from AArch64 backend. +.Ar value +may be one of: +.Bl -tag -width indent +.It generic +Generic NEON assembly +.It apple +Apple-style NEON assembly +.El +.It Fl -arch Ns = Ns Ar value +Choose architecture(s) from a Mach-O file to dump +.It Fl -arch-name Ns = Ns ar arch +Target arch to disassemble for. +See +.Fl -version +for available targets. +.It Fl -bind +Display mach-o binding info. +.It Fl -color +Use colored syntax highlighting. +Default autodetect. +.It Fl -disassemble +Display assembler mnemonics for machine instructions. +.It Fl -disassemble-all +Display assembler mnemonics for the machine instruction in all sections. +.It Fl -dsym Ns = Ns Ar file +Use +.Ar file +for debug info. +.It Fl -dwarf Ns = Ns Ar sections +Dump of dwarf debug sections. +.Bl -tag -width indent +.It frames +.Dv .debug_frame +.El +.It Fl -exports-trie +Display mach-o exported symbols. +.It Fl -fault-map-section +Display contents of faultmap section. +.It Fl -filter-print-funcs Ns = Ns Ar functions +Only print IR for functions whose name match +.Ar functions +for all print-[before|after][-all] options. +.It Fl -full-leading-addr +Print full leading address. +.It Fl g +Print line information from debug info if available. +.It Fl h , -headers , -section-headers +Display summaries of the headers for each section. +.It Fl -help +Display available options. +Use +.Fl -help-hidden +for more. +.It Fl -lazy-bind +Display mach-o lazy binding info. +.It Fl -line-numbers +Display source line numbers with disassembly. +Implies disassemble object. +.It Fl -macho +Use MachO specific object file parser. +.It Fl -mattr Ns = Ns Ar attribute ... +Target specific attributes. +.It Fl -mcpu Ns = Ns Ar CPU +Target a specific cpu type. +Use +.Fl mcpu Ns = Ns help +for details. +.It Fl -no-leading-addr +Print no leading address. +.It Fl -no-leading-headers +Print no leading headers. +.It Fl -no-show-raw-insn +When disassembling instructions, do not print the instruction bytes. +.It Fl -print-imm-hex +Use hex format for immediate values. +.It Fl -private-header +Display only the first format specific file header. +.It Fl -private-headers +Display format specific file headers. +.It Fl r +Display the relocation entries in the file. +.It Fl -raw-clang-ast +Dump the raw binary contents of the clang AST section. +.It Fl -rebase +Display mach-o rebasing info. +.It Fl -reverse-iterate +Reverse iterate. +.It Fl s +Display the content of each section. +.It Fl -section Ns = Ns Ar section +Operate on the specified sections only. +With +.Fl -macho +dump segment,section. +.It Fl -source +Display source inline with disassembly. +Implies disassmble object. +.It Fl -start-address Ns = Ns Ar address +Disassemble beginning at +.Ar address . +.It Fl -stop-address Ns = Ns Ar address +Stop disassembly at +.Ar address . +.It Fl t +Display the symbol table. +.It Fl -triple Ns = Ns Ar triple +Target triple to disassemble for. +See +.Fl -version +for available targets. +.It Fl -unwind-info +Display unwind information. +.It Fl -version +Display the version of this program. +.It Fl -weak-bind +Display mach-o weak binding info. +.It Fl -x86-asm-syntax Ns = Ns Ar syntax +Choose style of code to emit from X86 backend. +.Bl -tag -width indent +.It att +Emit AT&T-style assembly. +.It intel +Emit Intel-style assembly. +.El +.El +.Ss Mach-O Options +There are a number of options specific to the Mach-O format. +These are used in combination with the +.Fl -macho +option. +.Bl -tag -width indent +.It Fl -archive-headers +Print archive headers for Mach-O archives. +.It Fl -archive-member-offsets +Print the offset to each archive member for Mach-O archives. +Requires +.Fl -macho +and +.Fl -archive-headers . +.It Fl -data-in-code +Print the data in code table for Mach-O objects. +.It Fl -dis-symname Ns = Ns Ar symbol +Disassemble just +.Ar symbol 's +instructions. +.It Fl -dylib-id +Print the shared library's id for the dylib Mach-O file. +.It Fl -dylibs-used +Print the shared libraries used for linked Mach-O files. +.It Fl -indirect-symbols +Print indirect symbol table for Mach-O objects. +.It Fl -info-plist +Print the info plist section as strings for Mach-O objects. +.It Fl -link-opt-hints +Print the linker optimization hints for Mach-O objects. +.It Fl -no-symbolic-operands +do not symbolic operands when disassembling. +.It Fl -non-verbose +Print the info for Mach-O objects in non-verbose or numeric form. +.It Fl -objc-meta-data +Print the Objective-C runtime meta data for Mach-O files. +.It Fl -universal-headers +Print Mach-O universal headers. +.El diff --git a/docs/tutorial/BuildingAJIT2.rst b/docs/tutorial/BuildingAJIT2.rst index 15c9c3586bc37c8f0c0e8e659b0add5b10b9816a..7d25ccaba0aa1ff24f2df272f8f854d864935ab5 100644 --- a/docs/tutorial/BuildingAJIT2.rst +++ b/docs/tutorial/BuildingAJIT2.rst @@ -12,10 +12,11 @@ we welcome any feedback. Chapter 2 Introduction ====================== -**Warning: This text is currently out of date due to ORC API updates.** +**Warning: This tutorial is currently being updated to account for ORC API +changes. Only Chapters 1 and 2 are up-to-date.** -**The example code has been updated and can be used. The text will be updated -once the API churn dies down.** +**Example code from Chapters 3 to 5 will compile and run, but has not been +updated** Welcome to Chapter 2 of the "Building an ORC-based JIT in LLVM" tutorial. In `Chapter 1 `_ of this series we examined a basic JIT @@ -42,67 +43,49 @@ added to it. In this Chapter we will make optimization a phase of our JIT instead. For now this will provide us a motivation to learn more about ORC layers, but in the long term making optimization part of our JIT will yield an important benefit: When we begin lazily compiling code (i.e. deferring -compilation of each function until the first time it's run), having +compilation of each function until the first time it's run) having optimization managed by our JIT will allow us to optimize lazily too, rather than having to do all our optimization up-front. To add optimization support to our JIT we will take the KaleidoscopeJIT from Chapter 1 and compose an ORC *IRTransformLayer* on top. We will look at how the IRTransformLayer works in more detail below, but the interface is simple: the -constructor for this layer takes a reference to the layer below (as all layers -do) plus an *IR optimization function* that it will apply to each Module that -is added via addModule: +constructor for this layer takes a reference to the execution session and the +layer below (as all layers do) plus an *IR optimization function* that it will +apply to each Module that is added via addModule: .. code-block:: c++ class KaleidoscopeJIT { private: - std::unique_ptr TM; - const DataLayout DL; - RTDyldObjectLinkingLayer<> ObjectLayer; - IRCompileLayer CompileLayer; + ExecutionSession ES; + RTDyldObjectLinkingLayer ObjectLayer; + IRCompileLayer CompileLayer; + IRTransformLayer TransformLayer; - using OptimizeFunction = - std::function(std::shared_ptr)>; - - IRTransformLayer OptimizeLayer; + DataLayout DL; + MangleAndInterner Mangle; + ThreadSafeContext Ctx; public: - using ModuleHandle = decltype(OptimizeLayer)::ModuleHandleT; - - KaleidoscopeJIT() - : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), - ObjectLayer([]() { return std::make_shared(); }), - CompileLayer(ObjectLayer, SimpleCompiler(*TM)), - OptimizeLayer(CompileLayer, - [this](std::unique_ptr M) { - return optimizeModule(std::move(M)); - }) { - llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr); + + KaleidoscopeJIT(JITTargetMachineBuilder JTMB, DataLayout DL) + : ObjectLayer(ES, + []() { return llvm::make_unique(); }), + CompileLayer(ES, ObjectLayer, ConcurrentIRCompiler(std::move(JTMB))), + TransformLayer(ES, CompileLayer, optimizeModule), + DL(std::move(DL)), Mangle(ES, this->DL), + Ctx(llvm::make_unique()) { + ES.getMainJITDylib().setGenerator( + cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL))); } Our extended KaleidoscopeJIT class starts out the same as it did in Chapter 1, -but after the CompileLayer we introduce a typedef for our optimization function. -In this case we use a std::function (a handy wrapper for "function-like" things) -from a single unique_ptr input to a std::unique_ptr output. With -our optimization function typedef in place we can declare our OptimizeLayer, -which sits on top of our CompileLayer. - -To initialize our OptimizeLayer we pass it a reference to the CompileLayer -below (standard practice for layers), and we initialize the OptimizeFunction -using a lambda that calls out to an "optimizeModule" function that we will -define below. - -.. code-block:: c++ - - // ... - auto Resolver = createLambdaResolver( - [&](const std::string &Name) { - if (auto Sym = OptimizeLayer.findSymbol(Name, false)) - return Sym; - return JITSymbol(nullptr); - }, - // ... +but after the CompileLayer we introduce a new member, TransformLayer, which sits +on top of our CompileLayer. We initialize our OptimizeLayer with a reference to +the ExecutionSession and output layer (standard practice for layers), along with +a *transform function*. For our transform function we supply our classes +optimizeModule static method. .. code-block:: c++ @@ -111,26 +94,13 @@ define below. std::move(Resolver))); // ... -.. code-block:: c++ - - // ... - return OptimizeLayer.findSymbol(MangledNameStream.str(), true); - // ... - -.. code-block:: c++ - - // ... - cantFail(OptimizeLayer.removeModule(H)); - // ... - -Next we need to replace references to 'CompileLayer' with references to -OptimizeLayer in our key methods: addModule, findSymbol, and removeModule. In -addModule we need to be careful to replace both references: the findSymbol call -inside our resolver, and the call through to addModule. +Next we need to update our addModule method to replace the call to +``CompileLayer::add`` with a call to ``OptimizeLayer::add`` instead. .. code-block:: c++ - std::shared_ptr optimizeModule(std::shared_ptr M) { + static Expected + optimizeModule(ThreadSafeModule M, const MaterializationResponsibility &R) { // Create a function pass manager. auto FPM = llvm::make_unique(M.get()); @@ -150,12 +120,18 @@ inside our resolver, and the call through to addModule. } At the bottom of our JIT we add a private method to do the actual optimization: -*optimizeModule*. This function sets up a FunctionPassManager, adds some passes -to it, runs it over every function in the module, and then returns the mutated -module. The specific optimizations are the same ones used in -`Chapter 4 `_ of the "Implementing a language with LLVM" -tutorial series. Readers may visit that chapter for a more in-depth -discussion of these, and of IR optimization in general. +*optimizeModule*. This function takes the module to be transformed as input (as +a ThreadSafeModule) along with a reference to a reference to a new class: +``MaterializationResponsibility``. The MaterializationResponsibility argument +can be used to query JIT state for the module being transformed, such as the set +of definitions in the module that JIT'd code is actively trying to call/access. +For now we will ignore this argument and use a standard optimization +pipeline. To do this we set up a FunctionPassManager, add some passes to it, run +it over every function in the module, and then return the mutated module. The +specific optimizations are the same ones used in `Chapter 4 `_ +of the "Implementing a language with LLVM" tutorial series. Readers may visit +that chapter for a more in-depth discussion of these, and of IR optimization in +general. And that's it in terms of changes to KaleidoscopeJIT: When a module is added via addModule the OptimizeLayer will call our optimizeModule function before passing @@ -163,148 +139,122 @@ the transformed module on to the CompileLayer below. Of course, we could have called optimizeModule directly in our addModule function and not gone to the bother of using the IRTransformLayer, but doing so gives us another opportunity to see how layers compose. It also provides a neat entry point to the *layer* -concept itself, because IRTransformLayer turns out to be one of the simplest -implementations of the layer concept that can be devised: +concept itself, because IRTransformLayer is one of the simplest layers that +can be implemented. .. code-block:: c++ - template - class IRTransformLayer { + // From IRTransformLayer.h: + class IRTransformLayer : public IRLayer { public: - using ModuleHandleT = typename BaseLayerT::ModuleHandleT; + using TransformFunction = std::function( + ThreadSafeModule, const MaterializationResponsibility &R)>; - IRTransformLayer(BaseLayerT &BaseLayer, - TransformFtor Transform = TransformFtor()) - : BaseLayer(BaseLayer), Transform(std::move(Transform)) {} + IRTransformLayer(ExecutionSession &ES, IRLayer &BaseLayer, + TransformFunction Transform = identityTransform); - Expected - addModule(std::shared_ptr M, - std::shared_ptr Resolver) { - return BaseLayer.addModule(Transform(std::move(M)), std::move(Resolver)); + void setTransform(TransformFunction Transform) { + this->Transform = std::move(Transform); } - void removeModule(ModuleHandleT H) { BaseLayer.removeModule(H); } - - JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) { - return BaseLayer.findSymbol(Name, ExportedSymbolsOnly); + static ThreadSafeModule + identityTransform(ThreadSafeModule TSM, + const MaterializationResponsibility &R) { + return TSM; } - JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name, - bool ExportedSymbolsOnly) { - return BaseLayer.findSymbolIn(H, Name, ExportedSymbolsOnly); - } + void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; - void emitAndFinalize(ModuleHandleT H) { - BaseLayer.emitAndFinalize(H); - } + private: + IRLayer &BaseLayer; + TransformFunction Transform; + }; - TransformFtor& getTransform() { return Transform; } + // From IRTransfomrLayer.cpp: - const TransformFtor& getTransform() const { return Transform; } + IRTransformLayer::IRTransformLayer(ExecutionSession &ES, + IRLayer &BaseLayer, + TransformFunction Transform) + : IRLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {} - private: - BaseLayerT &BaseLayer; - TransformFtor Transform; - }; + void IRTransformLayer::emit(MaterializationResponsibility R, + ThreadSafeModule TSM) { + assert(TSM.getModule() && "Module must not be null"); + + if (auto TransformedTSM = Transform(std::move(TSM), R)) + BaseLayer.emit(std::move(R), std::move(*TransformedTSM)); + else { + R.failMaterialization(); + getExecutionSession().reportError(TransformedTSM.takeError()); + } + } This is the whole definition of IRTransformLayer, from -``llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h``, stripped of its -comments. It is a template class with two template arguments: ``BaesLayerT`` and -``TransformFtor`` that provide the type of the base layer and the type of the -"transform functor" (in our case a std::function) respectively. This class is -concerned with two very simple jobs: (1) Running every IR Module that is added -with addModule through the transform functor, and (2) conforming to the ORC -layer interface. The interface consists of one typedef and five methods: - -+------------------+-----------------------------------------------------------+ -| Interface | Description | -+==================+===========================================================+ -| | Provides a handle that can be used to identify a module | -| ModuleHandleT | set when calling findSymbolIn, removeModule, or | -| | emitAndFinalize. | -+------------------+-----------------------------------------------------------+ -| | Takes a given set of Modules and makes them "available | -| | for execution". This means that symbols in those modules | -| | should be searchable via findSymbol and findSymbolIn, and | -| | the address of the symbols should be read/writable (for | -| | data symbols), or executable (for function symbols) after | -| | JITSymbol::getAddress() is called. Note: This means that | -| addModule | addModule doesn't have to compile (or do any other | -| | work) up-front. It *can*, like IRCompileLayer, act | -| | eagerly, but it can also simply record the module and | -| | take no further action until somebody calls | -| | JITSymbol::getAddress(). In IRTransformLayer's case | -| | addModule eagerly applies the transform functor to | -| | each module in the set, then passes the resulting set | -| | of mutated modules down to the layer below. | -+------------------+-----------------------------------------------------------+ -| | Removes a set of modules from the JIT. Code or data | -| removeModule | defined in these modules will no longer be available, and | -| | the memory holding the JIT'd definitions will be freed. | -+------------------+-----------------------------------------------------------+ -| | Searches for the named symbol in all modules that have | -| | previously been added via addModule (and not yet | -| findSymbol | removed by a call to removeModule). In | -| | IRTransformLayer we just pass the query on to the layer | -| | below. In our REPL this is our default way to search for | -| | function definitions. | -+------------------+-----------------------------------------------------------+ -| | Searches for the named symbol in the module set indicated | -| | by the given ModuleHandleT. This is just an optimized | -| | search, better for lookup-speed when you know exactly | -| | a symbol definition should be found. In IRTransformLayer | -| findSymbolIn | we just pass this query on to the layer below. In our | -| | REPL we use this method to search for functions | -| | representing top-level expressions, since we know exactly | -| | where we'll find them: in the top-level expression module | -| | we just added. | -+------------------+-----------------------------------------------------------+ -| | Forces all of the actions required to make the code and | -| | data in a module set (represented by a ModuleHandleT) | -| | accessible. Behaves as if some symbol in the set had been | -| | searched for and JITSymbol::getSymbolAddress called. This | -| emitAndFinalize | is rarely needed, but can be useful when dealing with | -| | layers that usually behave lazily if the user wants to | -| | trigger early compilation (for example, to use idle CPU | -| | time to eagerly compile code in the background). | -+------------------+-----------------------------------------------------------+ - -This interface attempts to capture the natural operations of a JIT (with some -wrinkles like emitAndFinalize for performance), similar to the basic JIT API -operations we identified in Chapter 1. Conforming to the layer concept allows -classes to compose neatly by implementing their behaviors in terms of the these -same operations, carried out on the layer below. For example, an eager layer -(like IRTransformLayer) can implement addModule by running each module in the -set through its transform up-front and immediately passing the result to the -layer below. A lazy layer, by contrast, could implement addModule by -squirreling away the modules doing no other up-front work, but applying the -transform (and calling addModule on the layer below) when the client calls -findSymbol instead. The JIT'd program behavior will be the same either way, but -these choices will have different performance characteristics: Doing work -eagerly means the JIT takes longer up-front, but proceeds smoothly once this is -done. Deferring work allows the JIT to get up-and-running quickly, but will -force the JIT to pause and wait whenever some code or data is needed that hasn't -already been processed. - -Our current REPL is eager: Each function definition is optimized and compiled as -soon as it's typed in. If we were to make the transform layer lazy (but not -change things otherwise) we could defer optimization until the first time we -reference a function in a top-level expression (see if you can figure out why, -then check out the answer below [1]_). In the next chapter, however we'll -introduce fully lazy compilation, in which function's aren't compiled until -they're first called at run-time. At this point the trade-offs get much more +``llvm/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h`` and +``llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp``. This class is concerned +with two very simple jobs: (1) Running every IR Module that is emitted via this +layer through the transform function object, and (2) implementing the ORC +``IRLayer`` interface (which itself conforms to the general ORC Layer concept, +more on that below). Most of the class is straightforward: a typedef for the +transform function, a constructor to initialize the members, a setter for the +transform function value, and a default no-op transform. The most important +method is ``emit`` as this is half of our IRLayer interface. The emit method +applies our transform to each module that it is called on and, if the transform +succeeds, passes the transformed module to the base layer. If the transform +fails, our emit function calls +``MaterializationResponsibility::failMaterialization`` (this JIT clients who +may be waiting on other threads know that the code they were waiting for has +failed to compile) and logs the error with the execution session before bailing +out. + +The other half of the IRLayer interface we inherit unmodified from the IRLayer +class: + +.. code-block:: c++ + + Error IRLayer::add(JITDylib &JD, ThreadSafeModule TSM, VModuleKey K) { + return JD.define(llvm::make_unique( + *this, std::move(K), std::move(TSM))); + } + +This code, from ``llvm/lib/ExecutionEngine/Orc/Layer.cpp``, adds a +ThreadSafeModule to a given JITDylib by wrapping it up in a +``MaterializationUnit`` (in this case a ``BasicIRLayerMaterializationUnit``). +Most layers that derived from IRLayer can rely on this default implementation +of the ``add`` method. + +These two operations, ``add`` and ``emit``, together constitute the layer +concept: A layer is a way to wrap a portion of a compiler pipeline (in this case +the "opt" phase of an LLVM compiler) whose API is is opaque to ORC in an +interface that allows ORC to invoke it when needed. The add method takes an +module in some input program representation (in this case an LLVM IR module) and +stores it in the target JITDylib, arranging for it to be passed back to the +Layer's emit method when any symbol defined by that module is requested. Layers +can compose neatly by calling the 'emit' method of a base layer to complete +their work. For example, in this tutorial our IRTransformLayer calls through to +our IRCompileLayer to compile the transformed IR, and our IRCompileLayer in turn +calls our ObjectLayer to link the object file produced by our compiler. + + +So far we have learned how to optimize and compile our LLVM IR, but we have not +focused on when compilation happens. Our current REPL is eager: Each function +definition is optimized and compiled as soon as it is referenced by any other +code, regardless of whether it is ever called at runtime. In the next chapter we +will introduce fully lazy compilation, in which functions are not compiled until +they are first called at run-time. At this point the trade-offs get much more interesting: the lazier we are, the quicker we can start executing the first -function, but the more often we'll have to pause to compile newly encountered -functions. If we only code-gen lazily, but optimize eagerly, we'll have a slow -startup (which everything is optimized) but relatively short pauses as each -function just passes through code-gen. If we both optimize and code-gen lazily -we can start executing the first function more quickly, but we'll have longer -pauses as each function has to be both optimized and code-gen'd when it's first -executed. Things become even more interesting if we consider interproceedural -optimizations like inlining, which must be performed eagerly. These are -complex trade-offs, and there is no one-size-fits all solution to them, but by -providing composable layers we leave the decisions to the person implementing -the JIT, and make it easy for them to experiment with different configurations. +function, but the more often we will have to pause to compile newly encountered +functions. If we only code-gen lazily, but optimize eagerly, we will have a +longer startup time (as everything is optimized) but relatively short pauses as +each function just passes through code-gen. If we both optimize and code-gen +lazily we can start executing the first function more quickly, but we will have +longer pauses as each function has to be both optimized and code-gen'd when it +is first executed. Things become even more interesting if we consider +interproceedural optimizations like inlining, which must be performed eagerly. +These are complex trade-offs, and there is no one-size-fits all solution to +them, but by providing composable layers we leave the decisions to the person +implementing the JIT, and make it easy for them to experiment with different +configurations. `Next: Adding Per-function Lazy Compilation `_ @@ -325,10 +275,3 @@ Here is the code: .. literalinclude:: ../../examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h :language: c++ - -.. [1] When we add our top-level expression to the JIT, any calls to functions - that we defined earlier will appear to the RTDyldObjectLinkingLayer as - external symbols. The RTDyldObjectLinkingLayer will call the SymbolResolver - that we defined in addModule, which in turn calls findSymbol on the - OptimizeLayer, at which point even a lazy transform layer will have to - do its work. diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp b/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp index 5a66b367c27368a8d383cb9ad8685e4b4cdccee8..8cbd5d6f2f7737fb2f1b109e973b31e6f7292129 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp @@ -679,9 +679,8 @@ static std::unique_ptr ParseDefinition() { static std::unique_ptr ParseTopLevelExpr(unsigned ExprCount) { if (auto E = ParseExpression()) { // Make an anonymous proto. - auto Proto = llvm::make_unique(("__anon_expr" + - Twine(ExprCount)).str(), - std::vector()); + auto Proto = llvm::make_unique + (("__anon_expr" + Twine(ExprCount)).str(), std::vector()); return llvm::make_unique(std::move(Proto), std::move(E)); } return nullptr; @@ -1157,7 +1156,7 @@ static void HandleTopLevelExpression() { // Get the anonymous expression's JITSymbol. auto Sym = - ExitOnErr(TheJIT->lookup(("__anon_expr" + Twine(ExprCount)).str())); + ExitOnErr(TheJIT->lookup(("__anon_expr" + Twine(ExprCount)).str())); auto *FP = (double (*)())(intptr_t)Sym.getAddress(); assert(FP && "Failed to codegen function"); diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h index 7c803b138c0666e9406bccad50f6aeb653c07ef1..4dc08470dffabf70188014569572c051b59623e5 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h @@ -14,29 +14,23 @@ #ifndef LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H #define LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H -#include "llvm/ADT/STLExtras.h" -#include "llvm/ExecutionEngine/ExecutionEngine.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h" -#include "llvm/ExecutionEngine/Orc/LambdaResolver.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" -#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Mangler.h" -#include "llvm/Support/DynamicLibrary.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" -#include #include -#include -#include namespace llvm { namespace orc { @@ -44,69 +38,57 @@ namespace orc { class KaleidoscopeJIT { private: ExecutionSession ES; - std::shared_ptr Resolver; - std::unique_ptr TM; - const DataLayout DL; - LegacyRTDyldObjectLinkingLayer ObjectLayer; - LegacyIRCompileLayer CompileLayer; + RTDyldObjectLinkingLayer ObjectLayer; + IRCompileLayer CompileLayer; + IRTransformLayer OptimizeLayer; - using OptimizeFunction = - std::function(std::unique_ptr)>; - - LegacyIRTransformLayer OptimizeLayer; + DataLayout DL; + MangleAndInterner Mangle; + ThreadSafeContext Ctx; public: - KaleidoscopeJIT() - : Resolver(createLegacyLookupResolver( - ES, - [this](const std::string &Name) -> JITSymbol { - if (auto Sym = OptimizeLayer.findSymbol(Name, false)) - return Sym; - else if (auto Err = Sym.takeError()) - return std::move(Err); - if (auto SymAddr = - RTDyldMemoryManager::getSymbolAddressInProcess(Name)) - return JITSymbol(SymAddr, JITSymbolFlags::Exported); - return nullptr; - }, - [](Error Err) { cantFail(std::move(Err), "lookupFlags failed"); })), - TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), - ObjectLayer(ES, - [this](VModuleKey) { - return LegacyRTDyldObjectLinkingLayer::Resources{ - std::make_shared(), Resolver}; - }), - CompileLayer(ObjectLayer, SimpleCompiler(*TM)), - OptimizeLayer(CompileLayer, [this](std::unique_ptr M) { - return optimizeModule(std::move(M)); - }) { - llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr); + KaleidoscopeJIT(JITTargetMachineBuilder JTMB, DataLayout DL) + : ObjectLayer(ES, + []() { return llvm::make_unique(); }), + CompileLayer(ES, ObjectLayer, ConcurrentIRCompiler(std::move(JTMB))), + OptimizeLayer(ES, CompileLayer, optimizeModule), + DL(std::move(DL)), Mangle(ES, this->DL), + Ctx(llvm::make_unique()) { + ES.getMainJITDylib().setGenerator( + cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL))); } - TargetMachine &getTargetMachine() { return *TM; } + const DataLayout &getDataLayout() const { return DL; } + + LLVMContext &getContext() { return *Ctx.getContext(); } + + static Expected> Create() { + auto JTMB = JITTargetMachineBuilder::detectHost(); + + if (!JTMB) + return JTMB.takeError(); + + auto DL = JTMB->getDefaultDataLayoutForTarget(); + if (!DL) + return DL.takeError(); - VModuleKey addModule(std::unique_ptr M) { - // Add the module to the JIT with a new VModuleKey. - auto K = ES.allocateVModule(); - cantFail(OptimizeLayer.addModule(K, std::move(M))); - return K; + return llvm::make_unique(std::move(*JTMB), std::move(*DL)); } - JITSymbol findSymbol(const std::string Name) { - std::string MangledName; - raw_string_ostream MangledNameStream(MangledName); - Mangler::getNameWithPrefix(MangledNameStream, Name, DL); - return OptimizeLayer.findSymbol(MangledNameStream.str(), true); + Error addModule(std::unique_ptr M) { + return OptimizeLayer.add(ES.getMainJITDylib(), + ThreadSafeModule(std::move(M), Ctx)); } - void removeModule(VModuleKey K) { - cantFail(OptimizeLayer.removeModule(K)); + Expected lookup(StringRef Name) { + return ES.lookup({&ES.getMainJITDylib()}, Mangle(Name.str())); } private: - std::unique_ptr optimizeModule(std::unique_ptr M) { + static Expected + optimizeModule(ThreadSafeModule TSM, const MaterializationResponsibility &R) { // Create a function pass manager. - auto FPM = llvm::make_unique(M.get()); + auto FPM = llvm::make_unique(TSM.getModule()); // Add some optimizations. FPM->add(createInstructionCombiningPass()); @@ -117,10 +99,10 @@ private: // Run the optimizations over all functions in the module being added to // the JIT. - for (auto &F : *M) + for (auto &F : *TSM.getModule()) FPM->run(F); - return M; + return TSM; } }; diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp b/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp index 2471344c6d65fdd3aaba7211e50a94dc892f94fa..e7c3624aa04d575ced97dfaa0233eba6c6d6301b 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp @@ -676,11 +676,11 @@ static std::unique_ptr ParseDefinition() { } /// toplevelexpr ::= expression -static std::unique_ptr ParseTopLevelExpr() { +static std::unique_ptr ParseTopLevelExpr(unsigned ExprCount) { if (auto E = ParseExpression()) { // Make an anonymous proto. - auto Proto = llvm::make_unique("__anon_expr", - std::vector()); + auto Proto = llvm::make_unique( + ("__anon_expr" + Twine(ExprCount)).str(), std::vector()); return llvm::make_unique(std::move(Proto), std::move(E)); } return nullptr; @@ -696,12 +696,13 @@ static std::unique_ptr ParseExtern() { // Code Generation //===----------------------------------------------------------------------===// -static LLVMContext TheContext; -static IRBuilder<> Builder(TheContext); +static std::unique_ptr TheJIT; +static LLVMContext *TheContext; +static std::unique_ptr> Builder; static std::unique_ptr TheModule; static std::map NamedValues; -static std::unique_ptr TheJIT; static std::map> FunctionProtos; +static ExitOnError ExitOnErr; Value *LogErrorV(const char *Str) { LogError(Str); @@ -729,11 +730,11 @@ static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction, const std::string &VarName) { IRBuilder<> TmpB(&TheFunction->getEntryBlock(), TheFunction->getEntryBlock().begin()); - return TmpB.CreateAlloca(Type::getDoubleTy(TheContext), nullptr, VarName); + return TmpB.CreateAlloca(Type::getDoubleTy(*TheContext), nullptr, VarName); } Value *NumberExprAST::codegen() { - return ConstantFP::get(TheContext, APFloat(Val)); + return ConstantFP::get(*TheContext, APFloat(Val)); } Value *VariableExprAST::codegen() { @@ -743,7 +744,7 @@ Value *VariableExprAST::codegen() { return LogErrorV("Unknown variable name"); // Load the value. - return Builder.CreateLoad(V, Name.c_str()); + return Builder->CreateLoad(V, Name.c_str()); } Value *UnaryExprAST::codegen() { @@ -755,7 +756,7 @@ Value *UnaryExprAST::codegen() { if (!F) return LogErrorV("Unknown unary operator"); - return Builder.CreateCall(F, OperandV, "unop"); + return Builder->CreateCall(F, OperandV, "unop"); } Value *BinaryExprAST::codegen() { @@ -778,7 +779,7 @@ Value *BinaryExprAST::codegen() { if (!Variable) return LogErrorV("Unknown variable name"); - Builder.CreateStore(Val, Variable); + Builder->CreateStore(Val, Variable); return Val; } @@ -789,15 +790,15 @@ Value *BinaryExprAST::codegen() { switch (Op) { case '+': - return Builder.CreateFAdd(L, R, "addtmp"); + return Builder->CreateFAdd(L, R, "addtmp"); case '-': - return Builder.CreateFSub(L, R, "subtmp"); + return Builder->CreateFSub(L, R, "subtmp"); case '*': - return Builder.CreateFMul(L, R, "multmp"); + return Builder->CreateFMul(L, R, "multmp"); case '<': - L = Builder.CreateFCmpULT(L, R, "cmptmp"); + L = Builder->CreateFCmpULT(L, R, "cmptmp"); // Convert bool 0/1 to double 0.0 or 1.0 - return Builder.CreateUIToFP(L, Type::getDoubleTy(TheContext), "booltmp"); + return Builder->CreateUIToFP(L, Type::getDoubleTy(*TheContext), "booltmp"); default: break; } @@ -808,7 +809,7 @@ Value *BinaryExprAST::codegen() { assert(F && "binary operator not found!"); Value *Ops[] = {L, R}; - return Builder.CreateCall(F, Ops, "binop"); + return Builder->CreateCall(F, Ops, "binop"); } Value *CallExprAST::codegen() { @@ -828,7 +829,7 @@ Value *CallExprAST::codegen() { return nullptr; } - return Builder.CreateCall(CalleeF, ArgsV, "calltmp"); + return Builder->CreateCall(CalleeF, ArgsV, "calltmp"); } Value *IfExprAST::codegen() { @@ -837,46 +838,46 @@ Value *IfExprAST::codegen() { return nullptr; // Convert condition to a bool by comparing equal to 0.0. - CondV = Builder.CreateFCmpONE( - CondV, ConstantFP::get(TheContext, APFloat(0.0)), "ifcond"); + CondV = Builder->CreateFCmpONE( + CondV, ConstantFP::get(*TheContext, APFloat(0.0)), "ifcond"); - Function *TheFunction = Builder.GetInsertBlock()->getParent(); + Function *TheFunction = Builder->GetInsertBlock()->getParent(); // Create blocks for the then and else cases. Insert the 'then' block at the // end of the function. - BasicBlock *ThenBB = BasicBlock::Create(TheContext, "then", TheFunction); - BasicBlock *ElseBB = BasicBlock::Create(TheContext, "else"); - BasicBlock *MergeBB = BasicBlock::Create(TheContext, "ifcont"); + BasicBlock *ThenBB = BasicBlock::Create(*TheContext, "then", TheFunction); + BasicBlock *ElseBB = BasicBlock::Create(*TheContext, "else"); + BasicBlock *MergeBB = BasicBlock::Create(*TheContext, "ifcont"); - Builder.CreateCondBr(CondV, ThenBB, ElseBB); + Builder->CreateCondBr(CondV, ThenBB, ElseBB); // Emit then value. - Builder.SetInsertPoint(ThenBB); + Builder->SetInsertPoint(ThenBB); Value *ThenV = Then->codegen(); if (!ThenV) return nullptr; - Builder.CreateBr(MergeBB); + Builder->CreateBr(MergeBB); // Codegen of 'Then' can change the current block, update ThenBB for the PHI. - ThenBB = Builder.GetInsertBlock(); + ThenBB = Builder->GetInsertBlock(); // Emit else block. TheFunction->getBasicBlockList().push_back(ElseBB); - Builder.SetInsertPoint(ElseBB); + Builder->SetInsertPoint(ElseBB); Value *ElseV = Else->codegen(); if (!ElseV) return nullptr; - Builder.CreateBr(MergeBB); + Builder->CreateBr(MergeBB); // Codegen of 'Else' can change the current block, update ElseBB for the PHI. - ElseBB = Builder.GetInsertBlock(); + ElseBB = Builder->GetInsertBlock(); // Emit merge block. TheFunction->getBasicBlockList().push_back(MergeBB); - Builder.SetInsertPoint(MergeBB); - PHINode *PN = Builder.CreatePHI(Type::getDoubleTy(TheContext), 2, "iftmp"); + Builder->SetInsertPoint(MergeBB); + PHINode *PN = Builder->CreatePHI(Type::getDoubleTy(*TheContext), 2, "iftmp"); PN->addIncoming(ThenV, ThenBB); PN->addIncoming(ElseV, ElseBB); @@ -903,7 +904,7 @@ Value *IfExprAST::codegen() { // br endcond, loop, endloop // outloop: Value *ForExprAST::codegen() { - Function *TheFunction = Builder.GetInsertBlock()->getParent(); + Function *TheFunction = Builder->GetInsertBlock()->getParent(); // Create an alloca for the variable in the entry block. AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName); @@ -914,17 +915,17 @@ Value *ForExprAST::codegen() { return nullptr; // Store the value into the alloca. - Builder.CreateStore(StartVal, Alloca); + Builder->CreateStore(StartVal, Alloca); // Make the new basic block for the loop header, inserting after current // block. - BasicBlock *LoopBB = BasicBlock::Create(TheContext, "loop", TheFunction); + BasicBlock *LoopBB = BasicBlock::Create(*TheContext, "loop", TheFunction); // Insert an explicit fall through from the current block to the LoopBB. - Builder.CreateBr(LoopBB); + Builder->CreateBr(LoopBB); // Start insertion in LoopBB. - Builder.SetInsertPoint(LoopBB); + Builder->SetInsertPoint(LoopBB); // Within the loop, the variable is defined equal to the PHI node. If it // shadows an existing variable, we have to restore it, so save it now. @@ -945,7 +946,7 @@ Value *ForExprAST::codegen() { return nullptr; } else { // If not specified, use 1.0. - StepVal = ConstantFP::get(TheContext, APFloat(1.0)); + StepVal = ConstantFP::get(*TheContext, APFloat(1.0)); } // Compute the end condition. @@ -955,23 +956,23 @@ Value *ForExprAST::codegen() { // Reload, increment, and restore the alloca. This handles the case where // the body of the loop mutates the variable. - Value *CurVar = Builder.CreateLoad(Alloca, VarName.c_str()); - Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar"); - Builder.CreateStore(NextVar, Alloca); + Value *CurVar = Builder->CreateLoad(Alloca, VarName.c_str()); + Value *NextVar = Builder->CreateFAdd(CurVar, StepVal, "nextvar"); + Builder->CreateStore(NextVar, Alloca); // Convert condition to a bool by comparing equal to 0.0. - EndCond = Builder.CreateFCmpONE( - EndCond, ConstantFP::get(TheContext, APFloat(0.0)), "loopcond"); + EndCond = Builder->CreateFCmpONE( + EndCond, ConstantFP::get(*TheContext, APFloat(0.0)), "loopcond"); // Create the "after loop" block and insert it. BasicBlock *AfterBB = - BasicBlock::Create(TheContext, "afterloop", TheFunction); + BasicBlock::Create(*TheContext, "afterloop", TheFunction); // Insert the conditional branch into the end of LoopEndBB. - Builder.CreateCondBr(EndCond, LoopBB, AfterBB); + Builder->CreateCondBr(EndCond, LoopBB, AfterBB); // Any new code will be inserted in AfterBB. - Builder.SetInsertPoint(AfterBB); + Builder->SetInsertPoint(AfterBB); // Restore the unshadowed variable. if (OldVal) @@ -980,13 +981,13 @@ Value *ForExprAST::codegen() { NamedValues.erase(VarName); // for expr always returns 0.0. - return Constant::getNullValue(Type::getDoubleTy(TheContext)); + return Constant::getNullValue(Type::getDoubleTy(*TheContext)); } Value *VarExprAST::codegen() { std::vector OldBindings; - Function *TheFunction = Builder.GetInsertBlock()->getParent(); + Function *TheFunction = Builder->GetInsertBlock()->getParent(); // Register all variables and emit their initializer. for (unsigned i = 0, e = VarNames.size(); i != e; ++i) { @@ -1004,11 +1005,11 @@ Value *VarExprAST::codegen() { if (!InitVal) return nullptr; } else { // If not specified, use 0.0. - InitVal = ConstantFP::get(TheContext, APFloat(0.0)); + InitVal = ConstantFP::get(*TheContext, APFloat(0.0)); } AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName); - Builder.CreateStore(InitVal, Alloca); + Builder->CreateStore(InitVal, Alloca); // Remember the old variable binding so that we can restore the binding when // we unrecurse. @@ -1033,9 +1034,9 @@ Value *VarExprAST::codegen() { Function *PrototypeAST::codegen() { // Make the function type: double(double,double) etc. - std::vector Doubles(Args.size(), Type::getDoubleTy(TheContext)); + std::vector Doubles(Args.size(), Type::getDoubleTy(*TheContext)); FunctionType *FT = - FunctionType::get(Type::getDoubleTy(TheContext), Doubles, false); + FunctionType::get(Type::getDoubleTy(*TheContext), Doubles, false); Function *F = Function::Create(FT, Function::ExternalLinkage, Name, TheModule.get()); @@ -1062,8 +1063,8 @@ Function *FunctionAST::codegen() { BinopPrecedence[P.getOperatorName()] = P.getBinaryPrecedence(); // Create a new basic block to start insertion into. - BasicBlock *BB = BasicBlock::Create(TheContext, "entry", TheFunction); - Builder.SetInsertPoint(BB); + BasicBlock *BB = BasicBlock::Create(*TheContext, "entry", TheFunction); + Builder->SetInsertPoint(BB); // Record the function arguments in the NamedValues map. NamedValues.clear(); @@ -1072,7 +1073,7 @@ Function *FunctionAST::codegen() { AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, Arg.getName()); // Store the initial value into the alloca. - Builder.CreateStore(&Arg, Alloca); + Builder->CreateStore(&Arg, Alloca); // Add arguments to variable symbol table. NamedValues[Arg.getName()] = Alloca; @@ -1080,7 +1081,7 @@ Function *FunctionAST::codegen() { if (Value *RetVal = Body->codegen()) { // Finish off the function. - Builder.CreateRet(RetVal); + Builder->CreateRet(RetVal); // Validate the generated code, checking for consistency. verifyFunction(*TheFunction); @@ -1102,8 +1103,11 @@ Function *FunctionAST::codegen() { static void InitializeModule() { // Open a new module. - TheModule = llvm::make_unique("my cool jit", TheContext); - TheModule->setDataLayout(TheJIT->getTargetMachine().createDataLayout()); + TheModule = llvm::make_unique("my cool jit", *TheContext); + TheModule->setDataLayout(TheJIT->getDataLayout()); + + // Create a new builder for the module. + Builder = llvm::make_unique>(*TheContext); } static void HandleDefinition() { @@ -1112,7 +1116,7 @@ static void HandleDefinition() { fprintf(stderr, "Read function definition:"); FnIR->print(errs()); fprintf(stderr, "\n"); - TheJIT->addModule(std::move(TheModule)); + ExitOnErr(TheJIT->addModule(std::move(TheModule))); InitializeModule(); } } else { @@ -1136,25 +1140,27 @@ static void HandleExtern() { } static void HandleTopLevelExpression() { + static unsigned ExprCount = 0; + + // Update ExprCount. This number will be added to anonymous expressions to + // prevent them from clashing. + ++ExprCount; + // Evaluate a top-level expression into an anonymous function. - if (auto FnAST = ParseTopLevelExpr()) { + if (auto FnAST = ParseTopLevelExpr(ExprCount)) { if (FnAST->codegen()) { // JIT the module containing the anonymous expression, keeping a handle so // we can free it later. - auto H = TheJIT->addModule(std::move(TheModule)); + ExitOnErr(TheJIT->addModule(std::move(TheModule))); InitializeModule(); - // Search the JIT for the __anon_expr symbol. - auto ExprSymbol = TheJIT->findSymbol("__anon_expr"); - assert(ExprSymbol && "Function not found"); + // Get the anonymous expression's JITSymbol. + auto Sym = + ExitOnErr(TheJIT->lookup(("__anon_expr" + Twine(ExprCount)).str())); - // Get the symbol's address and cast it to the right type (takes no - // arguments, returns a double) so we can call it as a native function. - double (*FP)() = (double (*)())(intptr_t)cantFail(ExprSymbol.getAddress()); + auto *FP = (double (*)())(intptr_t)Sym.getAddress(); + assert(FP && "Failed to codegen function"); fprintf(stderr, "Evaluated to %f\n", FP()); - - // Delete the anonymous expression module from the JIT. - TheJIT->removeModule(H); } } else { // Skip token for error recovery. @@ -1222,7 +1228,8 @@ int main() { fprintf(stderr, "ready> "); getNextToken(); - TheJIT = llvm::make_unique(); + TheJIT = ExitOnErr(KaleidoscopeJIT::Create()); + TheContext = &TheJIT->getContext(); InitializeModule(); diff --git a/examples/Kaleidoscope/Chapter9/toy.cpp b/examples/Kaleidoscope/Chapter9/toy.cpp index 7a12b5a80cc848ca970b827fede7223875b8226c..19bac1a8bc7021c89241b983ead61ea467c7ac9d 100644 --- a/examples/Kaleidoscope/Chapter9/toy.cpp +++ b/examples/Kaleidoscope/Chapter9/toy.cpp @@ -1245,9 +1245,8 @@ Function *FunctionAST::codegen() { unsigned ScopeLine = LineNo; DISubprogram *SP = DBuilder->createFunction( FContext, P.getName(), StringRef(), Unit, LineNo, - CreateFunctionType(TheFunction->arg_size(), Unit), - false /* internal linkage */, true /* definition */, ScopeLine, - DINode::FlagPrototyped, false); + CreateFunctionType(TheFunction->arg_size(), Unit), ScopeLine, + DINode::FlagPrototyped, DISubprogram::SPFlagDefinition); TheFunction->setSubprogram(SP); // Push the current scope. diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h index c093c0906ce3b32ddf9f19bb7126dcaaba2050ff..0784275ace8cb393283c4b3f515662787aa5575d 100644 --- a/include/llvm-c/Core.h +++ b/include/llvm-c/Core.h @@ -54,6 +54,8 @@ extern "C" { * @{ */ +/// External users depend on the following values being stable. It is not safe +/// to reorder them. typedef enum { /* Terminator Instructions */ LLVMRet = 1, @@ -64,6 +66,9 @@ typedef enum { /* removed 6 due to API changes */ LLVMUnreachable = 7, + /* Standard Unary Operators */ + LLVMFNeg = 66, + /* Standard Binary Operators */ LLVMAdd = 8, LLVMFAdd = 9, diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h index 52ed183c78aed1fa2f12eaac1b355d25d13b52aa..c6fa5ad674f6898e79ab7aaf604ef6ddf3d8f416 100644 --- a/include/llvm/ADT/APFloat.h +++ b/include/llvm/ADT/APFloat.h @@ -870,13 +870,13 @@ public: /// Factory for NaN values. /// /// \param Negative - True iff the NaN generated should be negative. - /// \param type - The unspecified fill bits for creating the NaN, 0 by + /// \param payload - The unspecified fill bits for creating the NaN, 0 by /// default. The value is truncated as necessary. static APFloat getNaN(const fltSemantics &Sem, bool Negative = false, - unsigned type = 0) { - if (type) { - APInt fill(64, type); - return getQNaN(Sem, Negative, &fill); + uint64_t payload = 0) { + if (payload) { + APInt intPayload(64, payload); + return getQNaN(Sem, Negative, &intPayload); } else { return getQNaN(Sem, Negative, nullptr); } diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h index 1befe7a0273a86399396d68474bc7626a05c6264..6e106ff8bf5dca7786acc1069e6b2739b18067e9 100644 --- a/include/llvm/ADT/APInt.h +++ b/include/llvm/ADT/APInt.h @@ -1105,6 +1105,12 @@ public: APInt sshl_ov(const APInt &Amt, bool &Overflow) const; APInt ushl_ov(const APInt &Amt, bool &Overflow) const; + // Operations that saturate + APInt sadd_sat(const APInt &RHS) const; + APInt uadd_sat(const APInt &RHS) const; + APInt ssub_sat(const APInt &RHS) const; + APInt usub_sat(const APInt &RHS) const; + /// Array-indexing support. /// /// \returns the bit value at bitPosition @@ -1395,7 +1401,7 @@ public: /// /// Set the given bit to 1 whose position is given as "bitPosition". void setBit(unsigned BitPosition) { - assert(BitPosition <= BitWidth && "BitPosition out of range"); + assert(BitPosition < BitWidth && "BitPosition out of range"); WordType Mask = maskBit(BitPosition); if (isSingleWord()) U.VAL |= Mask; @@ -1454,7 +1460,7 @@ public: /// /// Set the given bit to 0 whose position is given as "bitPosition". void clearBit(unsigned BitPosition) { - assert(BitPosition <= BitWidth && "BitPosition out of range"); + assert(BitPosition < BitWidth && "BitPosition out of range"); WordType Mask = ~maskBit(BitPosition); if (isSingleWord()) U.VAL &= Mask; diff --git a/include/llvm/ADT/Optional.h b/include/llvm/ADT/Optional.h index 353e5d0ec9df20c7ddaffd821f10ffefa45b08c7..76937d632ae13a6f5325deb1c4473578a024d242 100644 --- a/include/llvm/ADT/Optional.h +++ b/include/llvm/ADT/Optional.h @@ -29,7 +29,7 @@ namespace llvm { namespace optional_detail { /// Storage for any type. -template struct OptionalStorage { +template ::value> struct OptionalStorage { AlignedCharArrayUnion storage; bool hasVal = false; @@ -108,28 +108,10 @@ template struct OptionalStorage { } }; -#if !defined(__GNUC__) || defined(__clang__) // GCC up to GCC7 miscompiles this. -/// Storage for trivially copyable types only. -template struct OptionalStorage { - AlignedCharArrayUnion storage; - bool hasVal = false; - - OptionalStorage() = default; - - OptionalStorage(const T &y) : hasVal(true) { new (storage.buffer) T(y); } - OptionalStorage &operator=(const T &y) { - *reinterpret_cast(storage.buffer) = y; - hasVal = true; - return *this; - } - - void reset() { hasVal = false; } -}; -#endif } // namespace optional_detail template class Optional { - optional_detail::OptionalStorage::value> Storage; + optional_detail::OptionalStorage Storage; public: using value_type = T; diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h index 4a93ee55e76dcae13dd5b1c1aea7f27efc8fd0fe..8685f0e4aada7f1372afb56d54aed2e0da56cc41 100644 --- a/include/llvm/ADT/STLExtras.h +++ b/include/llvm/ADT/STLExtras.h @@ -71,6 +71,10 @@ template struct conjunction : std::conditional, B1>::type {}; +template struct make_const_ptr { + using type = + typename std::add_pointer::type>::type; +}; //===----------------------------------------------------------------------===// // Extra additions to //===----------------------------------------------------------------------===// @@ -508,9 +512,11 @@ make_early_inc_range(RangeT &&Range) { EarlyIncIteratorT(std::end(std::forward(Range)))); } -// forward declarations required by zip_shortest/zip_first +// forward declarations required by zip_shortest/zip_first/zip_longest template bool all_of(R &&range, UnaryPredicate P); +template +bool any_of(R &&range, UnaryPredicate P); template struct index_sequence; @@ -661,6 +667,132 @@ detail::zippy zip_first(T &&t, U &&u, std::forward(t), std::forward(u), std::forward(args)...); } +namespace detail { +template +static Iter next_or_end(const Iter &I, const Iter &End) { + if (I == End) + return End; + return std::next(I); +} + +template +static auto deref_or_none(const Iter &I, const Iter &End) + -> llvm::Optional::type>::type> { + if (I == End) + return None; + return *I; +} + +template struct ZipLongestItemType { + using type = + llvm::Optional())>::type>::type>; +}; + +template struct ZipLongestTupleType { + using type = std::tuple::type...>; +}; + +template +class zip_longest_iterator + : public iterator_facade_base< + zip_longest_iterator, + typename std::common_type< + std::forward_iterator_tag, + typename std::iterator_traits::iterator_category...>::type, + typename ZipLongestTupleType::type, + typename std::iterator_traits>::type>::difference_type, + typename ZipLongestTupleType::type *, + typename ZipLongestTupleType::type> { +public: + using value_type = typename ZipLongestTupleType::type; + +private: + std::tuple iterators; + std::tuple end_iterators; + + template + bool test(const zip_longest_iterator &other, + index_sequence) const { + return llvm::any_of( + std::initializer_list{std::get(this->iterators) != + std::get(other.iterators)...}, + identity{}); + } + + template value_type deref(index_sequence) const { + return value_type( + deref_or_none(std::get(iterators), std::get(end_iterators))...); + } + + template + decltype(iterators) tup_inc(index_sequence) const { + return std::tuple( + next_or_end(std::get(iterators), std::get(end_iterators))...); + } + +public: + zip_longest_iterator(std::pair... ts) + : iterators(std::forward(ts.first)...), + end_iterators(std::forward(ts.second)...) {} + + value_type operator*() { return deref(index_sequence_for{}); } + + value_type operator*() const { return deref(index_sequence_for{}); } + + zip_longest_iterator &operator++() { + iterators = tup_inc(index_sequence_for{}); + return *this; + } + + bool operator==(const zip_longest_iterator &other) const { + return !test(other, index_sequence_for{}); + } +}; + +template class zip_longest_range { +public: + using iterator = + zip_longest_iterator()))...>; + using iterator_category = typename iterator::iterator_category; + using value_type = typename iterator::value_type; + using difference_type = typename iterator::difference_type; + using pointer = typename iterator::pointer; + using reference = typename iterator::reference; + +private: + std::tuple ts; + + template iterator begin_impl(index_sequence) const { + return iterator(std::make_pair(adl_begin(std::get(ts)), + adl_end(std::get(ts)))...); + } + + template iterator end_impl(index_sequence) const { + return iterator(std::make_pair(adl_end(std::get(ts)), + adl_end(std::get(ts)))...); + } + +public: + zip_longest_range(Args &&... ts_) : ts(std::forward(ts_)...) {} + + iterator begin() const { return begin_impl(index_sequence_for{}); } + iterator end() const { return end_impl(index_sequence_for{}); } +}; +} // namespace detail + +/// Iterate over two or more iterators at the same time. Iteration continues +/// until all iterators reach the end. The llvm::Optional only contains a value +/// if the iterator has not reached the end. +template +detail::zip_longest_range zip_longest(T &&t, U &&u, + Args &&... args) { + return detail::zip_longest_range( + std::forward(t), std::forward(u), std::forward(args)...); +} + /// Iterator wrapper that concatenates sequences together. /// /// This can concatenate different iterators, even with different types, into @@ -1405,6 +1537,40 @@ auto apply_tuple(F &&f, Tuple &&t) -> decltype(detail::apply_tuple_impl( Indices{}); } +/// Return true if the sequence [Begin, End) has exactly N items. Runs in O(N) +/// time. Not meant for use with random-access iterators. +template +bool hasNItems( + IterTy &&Begin, IterTy &&End, unsigned N, + typename std::enable_if< + !std::is_same< + typename std::iterator_traits::type>::iterator_category, + std::random_access_iterator_tag>::value, + void>::type * = nullptr) { + for (; N; --N, ++Begin) + if (Begin == End) + return false; // Too few. + return Begin == End; +} + +/// Return true if the sequence [Begin, End) has N or more items. Runs in O(N) +/// time. Not meant for use with random-access iterators. +template +bool hasNItemsOrMore( + IterTy &&Begin, IterTy &&End, unsigned N, + typename std::enable_if< + !std::is_same< + typename std::iterator_traits::type>::iterator_category, + std::random_access_iterator_tag>::value, + void>::type * = nullptr) { + for (; N; --N, ++Begin) + if (Begin == End) + return false; // Too few. + return true; +} + } // end namespace llvm #endif // LLVM_ADT_STLEXTRAS_H diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h index f86bebd14cc5bc0d9f952552a6d175a08601d224..0a73dbd6067182a66495d57233a62e700685e181 100644 --- a/include/llvm/ADT/SmallBitVector.h +++ b/include/llvm/ADT/SmallBitVector.h @@ -92,10 +92,6 @@ public: }; private: - bool isSmall() const { - return X & uintptr_t(1); - } - BitVector *getPointer() const { assert(!isSmall()); return reinterpret_cast(X); @@ -186,6 +182,8 @@ public: return make_range(set_bits_begin(), set_bits_end()); } + bool isSmall() const { return X & uintptr_t(1); } + /// Tests whether there are no bits in this bitvector. bool empty() const { return isSmall() ? getSmallSize() == 0 : getPointer()->empty(); @@ -242,7 +240,7 @@ public: uintptr_t Bits = getSmallBits(); if (Bits == 0) return -1; - return NumBaseBits - countLeadingZeros(Bits); + return NumBaseBits - countLeadingZeros(Bits) - 1; } return getPointer()->find_last(); } @@ -265,7 +263,9 @@ public: return -1; uintptr_t Bits = getSmallBits(); - return NumBaseBits - countLeadingOnes(Bits); + // Set unused bits. + Bits |= ~uintptr_t(0) << getSmallSize(); + return NumBaseBits - countLeadingOnes(Bits) - 1; } return getPointer()->find_last_unset(); } @@ -487,10 +487,17 @@ public: bool operator==(const SmallBitVector &RHS) const { if (size() != RHS.size()) return false; - if (isSmall()) + if (isSmall() && RHS.isSmall()) return getSmallBits() == RHS.getSmallBits(); - else + else if (!isSmall() && !RHS.isSmall()) return *getPointer() == *RHS.getPointer(); + else { + for (size_t i = 0, e = size(); i != e; ++i) { + if ((*this)[i] != RHS[i]) + return false; + } + return true; + } } bool operator!=(const SmallBitVector &RHS) const { @@ -498,16 +505,19 @@ public: } // Intersection, union, disjoint union. + // FIXME BitVector::operator&= does not resize the LHS but this does SmallBitVector &operator&=(const SmallBitVector &RHS) { resize(std::max(size(), RHS.size())); - if (isSmall()) + if (isSmall() && RHS.isSmall()) setSmallBits(getSmallBits() & RHS.getSmallBits()); - else if (!RHS.isSmall()) + else if (!isSmall() && !RHS.isSmall()) getPointer()->operator&=(*RHS.getPointer()); else { - SmallBitVector Copy = RHS; - Copy.resize(size()); - getPointer()->operator&=(*Copy.getPointer()); + size_t i, e; + for (i = 0, e = std::min(size(), RHS.size()); i != e; ++i) + (*this)[i] = test(i) && RHS.test(i); + for (e = size(); i != e; ++i) + reset(i); } return *this; } @@ -547,28 +557,26 @@ public: SmallBitVector &operator|=(const SmallBitVector &RHS) { resize(std::max(size(), RHS.size())); - if (isSmall()) + if (isSmall() && RHS.isSmall()) setSmallBits(getSmallBits() | RHS.getSmallBits()); - else if (!RHS.isSmall()) + else if (!isSmall() && !RHS.isSmall()) getPointer()->operator|=(*RHS.getPointer()); else { - SmallBitVector Copy = RHS; - Copy.resize(size()); - getPointer()->operator|=(*Copy.getPointer()); + for (size_t i = 0, e = RHS.size(); i != e; ++i) + (*this)[i] = test(i) || RHS.test(i); } return *this; } SmallBitVector &operator^=(const SmallBitVector &RHS) { resize(std::max(size(), RHS.size())); - if (isSmall()) + if (isSmall() && RHS.isSmall()) setSmallBits(getSmallBits() ^ RHS.getSmallBits()); - else if (!RHS.isSmall()) + else if (!isSmall() && !RHS.isSmall()) getPointer()->operator^=(*RHS.getPointer()); else { - SmallBitVector Copy = RHS; - Copy.resize(size()); - getPointer()->operator^=(*Copy.getPointer()); + for (size_t i = 0, e = RHS.size(); i != e; ++i) + (*this)[i] = test(i) != RHS.test(i); } return *this; } diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h index e4ddd12afdf4c96906aad4ecbf9c11823b344c9e..0636abbb1fbfdc8d2158b0438b747e7cb8585bd9 100644 --- a/include/llvm/ADT/SmallVector.h +++ b/include/llvm/ADT/SmallVector.h @@ -182,7 +182,7 @@ public: /// SmallVectorTemplateBase - This is where we put method /// implementations that are designed to work with non-POD-like T's. -template +template ::value> class SmallVectorTemplateBase : public SmallVectorTemplateCommon { protected: SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} @@ -320,8 +320,8 @@ public: /// This class consists of common code factored out of the SmallVector class to /// reduce code duplication based on the SmallVector 'N' template parameter. template -class SmallVectorImpl : public SmallVectorTemplateBase::value> { - using SuperClass = SmallVectorTemplateBase::value>; +class SmallVectorImpl : public SmallVectorTemplateBase { + using SuperClass = SmallVectorTemplateBase; public: using iterator = typename SuperClass::iterator; diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index fe78f78aaab2d70ff44527c982f758f2c38e762a..04b64e2fa60b00303ef7639a1a43dcd9eeeb0fc3 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -186,7 +186,8 @@ public: Contiki, AMDPAL, // AMD PAL Runtime HermitCore, // HermitCore Unikernel/Multikernel - LastOSType = HermitCore + Hurd, // GNU/Hurd + LastOSType = Hurd }; enum EnvironmentType { UnknownEnvironment, @@ -582,9 +583,15 @@ public: return getOS() == Triple::KFreeBSD; } + /// Tests whether the OS is Hurd. + bool isOSHurd() const { + return getOS() == Triple::Hurd; + } + /// Tests whether the OS uses glibc. bool isOSGlibc() const { - return (getOS() == Triple::Linux || getOS() == Triple::KFreeBSD) && + return (getOS() == Triple::Linux || getOS() == Triple::KFreeBSD || + getOS() == Triple::Hurd) && !isAndroid(); } diff --git a/include/llvm/ADT/iterator.h b/include/llvm/ADT/iterator.h index 7f7ed69a005447c19177a6a599c5f577743a4f75..40e490cf7864ea69abb23662c35d3e9946531b93 100644 --- a/include/llvm/ADT/iterator.h +++ b/include/llvm/ADT/iterator.h @@ -309,8 +309,10 @@ make_pointee_range(RangeT &&Range) { template ())> class pointer_iterator - : public iterator_adaptor_base, - WrappedIteratorT, T> { + : public iterator_adaptor_base< + pointer_iterator, WrappedIteratorT, + typename std::iterator_traits::iterator_category, + T> { mutable T Ptr; public: diff --git a/include/llvm/Analysis/CGSCCPassManager.h b/include/llvm/Analysis/CGSCCPassManager.h index f150064a910a05e035ebd9848b026993c7e81c76..61b99f6c3e6b8cb5115f492dd93029d35e8734fe 100644 --- a/include/llvm/Analysis/CGSCCPassManager.h +++ b/include/llvm/Analysis/CGSCCPassManager.h @@ -441,7 +441,10 @@ public: PreservedAnalyses PassPA = Pass.run(*C, CGAM, CG, UR); - PI.runAfterPass(Pass, *C); + if (UR.InvalidatedSCCs.count(C)) + PI.runAfterPassInvalidated(Pass); + else + PI.runAfterPass(Pass, *C); // Update the SCC and RefSCC if necessary. C = UR.UpdatedC ? UR.UpdatedC : C; @@ -762,7 +765,10 @@ public: PreservedAnalyses PassPA = Pass.run(*C, AM, CG, UR); - PI.runAfterPass(Pass, *C); + if (UR.InvalidatedSCCs.count(C)) + PI.runAfterPassInvalidated(Pass); + else + PI.runAfterPass(Pass, *C); // If the SCC structure has changed, bail immediately and let the outer // CGSCC layer handle any iteration to reflect the refined structure. diff --git a/include/llvm/Analysis/CaptureTracking.h b/include/llvm/Analysis/CaptureTracking.h index 7a869a51233a035bbb8d51f4d327d2e7d652a957..aaaaff9ae252c6ac9a93a3274c4cfc8ac6647f77 100644 --- a/include/llvm/Analysis/CaptureTracking.h +++ b/include/llvm/Analysis/CaptureTracking.h @@ -22,6 +22,14 @@ namespace llvm { class DominatorTree; class OrderedBasicBlock; + /// The default value for MaxUsesToExplore argument. It's relatively small to + /// keep the cost of analysis reasonable for clients like BasicAliasAnalysis, + /// where the results can't be cached. + /// TODO: we should probably introduce a caching CaptureTracking analysis and + /// use it where possible. The caching version can use much higher limit or + /// don't have this cap at all. + unsigned constexpr DefaultMaxUsesToExplore = 20; + /// PointerMayBeCaptured - Return true if this pointer value may be captured /// by the enclosing function (which is required to exist). This routine can /// be expensive, so consider caching the results. The boolean ReturnCaptures @@ -29,9 +37,12 @@ namespace llvm { /// counts as capturing it or not. The boolean StoreCaptures specified /// whether storing the value (or part of it) into memory anywhere /// automatically counts as capturing it or not. + /// MaxUsesToExplore specifies how many uses should the analysis explore for + /// one value before giving up due too "too many uses". bool PointerMayBeCaptured(const Value *V, bool ReturnCaptures, - bool StoreCaptures); + bool StoreCaptures, + unsigned MaxUsesToExplore = DefaultMaxUsesToExplore); /// PointerMayBeCapturedBefore - Return true if this pointer value may be /// captured by the enclosing function (which is required to exist). If a @@ -44,10 +55,13 @@ namespace llvm { /// or not. Captures by the provided instruction are considered if the /// final parameter is true. An ordered basic block in \p OBB could be used /// to speed up capture-tracker queries. + /// MaxUsesToExplore specifies how many uses should the analysis explore for + /// one value before giving up due too "too many uses". bool PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures, bool StoreCaptures, const Instruction *I, const DominatorTree *DT, bool IncludeI = false, - OrderedBasicBlock *OBB = nullptr); + OrderedBasicBlock *OBB = nullptr, + unsigned MaxUsesToExplore = DefaultMaxUsesToExplore); /// This callback is used in conjunction with PointerMayBeCaptured. In /// addition to the interface here, you'll need to provide your own getters @@ -75,7 +89,10 @@ namespace llvm { /// PointerMayBeCaptured - Visit the value and the values derived from it and /// find values which appear to be capturing the pointer value. This feeds /// results into and is controlled by the CaptureTracker object. - void PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker); + /// MaxUsesToExplore specifies how many uses should the analysis explore for + /// one value before giving up due too "too many uses". + void PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, + unsigned MaxUsesToExplore = DefaultMaxUsesToExplore); } // end namespace llvm #endif diff --git a/include/llvm/Analysis/CmpInstAnalysis.h b/include/llvm/Analysis/CmpInstAnalysis.h index 3cc69d9fea29f33dd02d9c07dd6bd4f183b1d0f1..0e9c6a96b0f41095234f5877c6f4a6b6bf4b83e8 100644 --- a/include/llvm/Analysis/CmpInstAnalysis.h +++ b/include/llvm/Analysis/CmpInstAnalysis.h @@ -46,19 +46,18 @@ namespace llvm { /// unsigned getICmpCode(const ICmpInst *ICI, bool InvertPred = false); - /// This is the complement of getICmpCode, which turns an opcode and two - /// operands into either a constant true or false, or the predicate for a new - /// ICmp instruction. The sign is passed in to determine which kind of - /// predicate to use in the new icmp instruction. + /// This is the complement of getICmpCode. It turns a predicate code into + /// either a constant true or false or the predicate for a new ICmp. + /// The sign is passed in to determine which kind of predicate to use in the + /// new ICmp instruction. /// Non-NULL return value will be a true or false constant. - /// NULL return means a new ICmp is needed. The predicate for which is output - /// in NewICmpPred. - Value *getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, - CmpInst::Predicate &NewICmpPred); + /// NULL return means a new ICmp is needed. The predicate is output in Pred. + Constant *getPredForICmpCode(unsigned Code, bool Sign, Type *OpTy, + CmpInst::Predicate &Pred); /// Return true if both predicates match sign or if at least one of them is an /// equality comparison (which is signless). - bool PredicatesFoldable(CmpInst::Predicate p1, CmpInst::Predicate p2); + bool predicatesFoldable(CmpInst::Predicate P1, CmpInst::Predicate P2); /// Decompose an icmp into the form ((X & Mask) pred 0) if possible. The /// returned predicate is either == or !=. Returns false if decomposition diff --git a/include/llvm/Analysis/DemandedBits.h b/include/llvm/Analysis/DemandedBits.h index d4384609762d4a7356800dcaadc4b40fe2882219..f751d2edcfffb24e21237eb40d33304a2243d9f7 100644 --- a/include/llvm/Analysis/DemandedBits.h +++ b/include/llvm/Analysis/DemandedBits.h @@ -44,6 +44,14 @@ public: F(F), AC(AC), DT(DT) {} /// Return the bits demanded from instruction I. + /// + /// For vector instructions individual vector elements are not distinguished: + /// A bit is demanded if it is demanded for any of the vector elements. The + /// size of the return value corresponds to the type size in bits of the + /// scalar type. + /// + /// Instructions that do not have integer or vector of integer type are + /// accepted, but will always produce a mask with all bits set. APInt getDemandedBits(Instruction *I); /// Return true if, during analysis, I could not be reached. diff --git a/include/llvm/Analysis/DivergenceAnalysis.h b/include/llvm/Analysis/DivergenceAnalysis.h index 9fadf52288bc87eb38a043a8b71a3abaebd9835d..d834862db09573261cfd49609ed94b6b7ddf5d59 100644 --- a/include/llvm/Analysis/DivergenceAnalysis.h +++ b/include/llvm/Analysis/DivergenceAnalysis.h @@ -173,6 +173,33 @@ private: std::vector Worklist; }; +/// \brief Divergence analysis frontend for GPU kernels. +class GPUDivergenceAnalysis { + SyncDependenceAnalysis SDA; + DivergenceAnalysis DA; + +public: + /// Runs the divergence analysis on @F, a GPU kernel + GPUDivergenceAnalysis(Function &F, const DominatorTree &DT, + const PostDominatorTree &PDT, const LoopInfo &LI, + const TargetTransformInfo &TTI); + + /// Whether any divergence was detected. + bool hasDivergence() const { return DA.hasDetectedDivergence(); } + + /// The GPU kernel this analysis result is for + const Function &getFunction() const { return DA.getFunction(); } + + /// Whether \p V is divergent. + bool isDivergent(const Value &V) const; + + /// Whether \p V is uniform/non-divergent + bool isUniform(const Value &V) const { return !isDivergent(V); } + + /// Print all divergent values in the kernel. + void print(raw_ostream &OS, const Module *) const; +}; + } // namespace llvm #endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H diff --git a/include/llvm/Analysis/IVDescriptors.h b/include/llvm/Analysis/IVDescriptors.h index d1d7e5ef0227e40a62fbb6c18b9e23189f385595..64b4ae23cc59de8e7d0790995f02105c0091cb45 100644 --- a/include/llvm/Analysis/IVDescriptors.h +++ b/include/llvm/Analysis/IVDescriptors.h @@ -140,7 +140,8 @@ public: /// Returns true if instruction I has multiple uses in Insts static bool hasMultipleUsesOf(Instruction *I, - SmallPtrSetImpl &Insts); + SmallPtrSetImpl &Insts, + unsigned MaxNumUses); /// Returns true if all uses of the instruction I is within the Set. static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl &Set); @@ -150,6 +151,10 @@ public: /// or max(X, Y). static InstDesc isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev); + /// Returns a struct describing if the instruction is a + /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. + static InstDesc isConditionalRdxPattern(RecurrenceKind Kind, Instruction *I); + /// Returns identity corresponding to the RecurrenceKind. static Constant *getRecurrenceIdentity(RecurrenceKind K, Type *Tp); diff --git a/include/llvm/Analysis/InstructionPrecedenceTracking.h b/include/llvm/Analysis/InstructionPrecedenceTracking.h index 5178a33d2e84a324462002d66ccb1a40c860606e..b754557ecc41d36780319f743ebbbdf183f8d809 100644 --- a/include/llvm/Analysis/InstructionPrecedenceTracking.h +++ b/include/llvm/Analysis/InstructionPrecedenceTracking.h @@ -114,6 +114,31 @@ public: virtual bool isSpecialInstruction(const Instruction *Insn) const; }; +class MemoryWriteTracking : public InstructionPrecedenceTracking { +public: + MemoryWriteTracking(DominatorTree *DT) : InstructionPrecedenceTracking(DT) {} + + /// Returns the topmost instruction that may write memory from the given + /// basic block. Returns nullptr if there is no such instructions in the block. + const Instruction *getFirstMemoryWrite(const BasicBlock *BB) { + return getFirstSpecialInstruction(BB); + } + + /// Returns true if at least one instruction from the given basic block may + /// write memory. + bool mayWriteToMemory(const BasicBlock *BB) { + return hasSpecialInstructions(BB); + } + + /// Returns true if the first memory writing instruction of Insn's block + /// exists and dominates Insn. + bool isDominatedByMemoryWriteFromSameBlock(const Instruction *Insn) { + return isPreceededBySpecialInstruction(Insn); + } + + virtual bool isSpecialInstruction(const Instruction *Insn) const; +}; + } // llvm #endif // LLVM_ANALYSIS_INSTRUCTIONPRECEDENCETRACKING_H diff --git a/include/llvm/Analysis/IteratedDominanceFrontier.h b/include/llvm/Analysis/IteratedDominanceFrontier.h index fdebb1bc3963beb4ee6281a4cba5a58d481b753e..3083db75b81c1c1eb15a6f3b357d6e29dbb16c12 100644 --- a/include/llvm/Analysis/IteratedDominanceFrontier.h +++ b/include/llvm/Analysis/IteratedDominanceFrontier.h @@ -6,7 +6,7 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// +/// \file /// Compute iterated dominance frontiers using a linear time algorithm. /// /// The algorithm used here is based on: diff --git a/include/llvm/Analysis/LegacyDivergenceAnalysis.h b/include/llvm/Analysis/LegacyDivergenceAnalysis.h index 173ca28a7f4873329feea6458a1982de9b1c61b7..fc426ad7fb645f0088eb415b625e07bd521776a0 100644 --- a/include/llvm/Analysis/LegacyDivergenceAnalysis.h +++ b/include/llvm/Analysis/LegacyDivergenceAnalysis.h @@ -19,9 +19,11 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" +#include "llvm/Analysis/DivergenceAnalysis.h" namespace llvm { class Value; +class GPUDivergenceAnalysis; class LegacyDivergenceAnalysis : public FunctionPass { public: static char ID; @@ -41,7 +43,7 @@ public: // // Even if this function returns false, V may still be divergent when used // in a different basic block. - bool isDivergent(const Value *V) const { return DivergentValues.count(V); } + bool isDivergent(const Value *V) const; // Returns true if V is uniform/non-divergent. // @@ -53,6 +55,12 @@ public: void removeValue(const Value *V) { DivergentValues.erase(V); } private: + // Whether analysis should be performed by GPUDivergenceAnalysis. + bool shouldUseGPUDivergenceAnalysis(const Function &F) const; + + // (optional) handle to new DivergenceAnalysis + std::unique_ptr gpuDA; + // Stores all divergent values. DenseSet DivergentValues; }; diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h index c59c86c499404536c640efa718ac01f21fdb9400..b5a964de9eb0b61b6cf723c0941e57d74de169f6 100644 --- a/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/include/llvm/Analysis/LoopAccessAnalysis.h @@ -97,6 +97,17 @@ public: /// Set of potential dependent memory accesses. typedef EquivalenceClasses DepCandidates; + /// Type to keep track of the status of the dependence check. The order of + /// the elements is important and has to be from most permissive to least + /// permissive. + enum class VectorizationSafetyStatus { + // Can vectorize safely without RT checks. All dependences are known to be + // safe. + Safe, + // Cannot vectorize due to unsafe or unknown dependencies. + Unsafe, + }; + /// Dependece between memory access instructions. struct Dependence { /// The type of the dependence. @@ -146,7 +157,7 @@ public: Instruction *getDestination(const LoopAccessInfo &LAI) const; /// Dependence types that don't prevent vectorization. - static bool isSafeForVectorization(DepType Type); + static VectorizationSafetyStatus isSafeForVectorization(DepType Type); /// Lexically forward dependence. bool isForward() const; @@ -164,8 +175,8 @@ public: MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L) : PSE(PSE), InnermostLoop(L), AccessIdx(0), MaxSafeRegisterWidth(-1U), - ShouldRetryWithRuntimeCheck(false), SafeForVectorization(true), - RecordDependences(true) {} + ShouldRetryWithRuntimeCheck(false), + Status(VectorizationSafetyStatus::Safe), RecordDependences(true) {} /// Register the location (instructions are given increasing numbers) /// of a write access. @@ -193,7 +204,9 @@ public: /// No memory dependence was encountered that would inhibit /// vectorization. - bool isSafeForVectorization() const { return SafeForVectorization; } + bool isSafeForVectorization() const { + return Status == VectorizationSafetyStatus::Safe; + } /// The maximum number of bytes of a vector register we can vectorize /// the accesses safely with. @@ -269,9 +282,9 @@ private: /// vectorize this loop with runtime checks. bool ShouldRetryWithRuntimeCheck; - /// No memory dependence was encountered that would inhibit - /// vectorization. - bool SafeForVectorization; + /// Result of the dependence checks, indicating whether the checked + /// dependences are safe for vectorization or not. + VectorizationSafetyStatus Status; //// True if Dependences reflects the dependences in the //// loop. If false we exceeded MaxDependences and @@ -304,6 +317,10 @@ private: /// \return false if we shouldn't vectorize at all or avoid larger /// vectorization factors by limiting MaxSafeDepDistBytes. bool couldPreventStoreLoadForward(uint64_t Distance, uint64_t TypeByteSize); + + /// Updates the current safety status with \p S. We can go from Safe to + /// to Unsafe. + void mergeInStatus(VectorizationSafetyStatus S); }; /// Holds information about the memory runtime legality checks to verify @@ -564,10 +581,10 @@ public: /// Print the information about the memory accesses in the loop. void print(raw_ostream &OS, unsigned Depth = 0) const; - /// If the loop has multiple stores to an invariant address, then - /// return true, else return false. - bool hasMultipleStoresToLoopInvariantAddress() const { - return HasMultipleStoresToLoopInvariantAddress; + /// If the loop has memory dependence involving an invariant address, i.e. two + /// stores or a store and a load, then return true, else return false. + bool hasDependenceInvolvingLoopInvariantAddress() const { + return HasDependenceInvolvingLoopInvariantAddress; } /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts @@ -620,8 +637,8 @@ private: /// Cache the result of analyzeLoop. bool CanVecMem; - /// Indicator that there are multiple stores to a uniform address. - bool HasMultipleStoresToLoopInvariantAddress; + /// Indicator that there are non vectorizable stores to a uniform address. + bool HasDependenceInvolvingLoopInvariantAddress; /// The diagnostics report generated for the analysis. E.g. why we /// couldn't analyze the loop. diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h index 05c28d139889af4b3a1e3a88a11c580b55343640..b973447ca8221fb3d6d86f6b42701995062a8414 100644 --- a/include/llvm/Analysis/MustExecute.h +++ b/include/llvm/Analysis/MustExecute.h @@ -126,6 +126,8 @@ class ICFLoopSafetyInfo: public LoopSafetyInfo { // may throw. // Contains information about implicit control flow in this loop's blocks. mutable ImplicitControlFlowTracking ICF; + // Contains information about instruction that may possibly write memory. + mutable MemoryWriteTracking MW; public: virtual bool blockMayThrow(const BasicBlock *BB) const; @@ -138,6 +140,16 @@ public: const DominatorTree *DT, const Loop *CurLoop) const; + /// Returns true if we could not execute a memory-modifying instruction before + /// we enter \p BB under assumption that \p CurLoop is entered. + bool doesNotWriteMemoryBefore(const BasicBlock *BB, const Loop *CurLoop) + const; + + /// Returns true if we could not execute a memory-modifying instruction before + /// we execute \p I under assumption that \p CurLoop is entered. + bool doesNotWriteMemoryBefore(const Instruction &I, const Loop *CurLoop) + const; + /// Inform the safety info that we are planning to insert a new instruction /// into the basic block \p BB. It will make all cache updates to keep it /// correct after this insertion. @@ -148,7 +160,7 @@ public: /// this removal. void removeInstruction(const Instruction *Inst); - ICFLoopSafetyInfo(DominatorTree *DT) : LoopSafetyInfo(), ICF(DT) {}; + ICFLoopSafetyInfo(DominatorTree *DT) : LoopSafetyInfo(), ICF(DT), MW(DT) {}; virtual ~ICFLoopSafetyInfo() {}; }; diff --git a/include/llvm/Analysis/ObjCARCAnalysisUtils.h b/include/llvm/Analysis/ObjCARCAnalysisUtils.h index 07beb0bb60a32dfb1322c7aad090c41bcb8731ce..1f497fab35da8b8fe4bb9ded547d2dbcc93bc7f1 100644 --- a/include/llvm/Analysis/ObjCARCAnalysisUtils.h +++ b/include/llvm/Analysis/ObjCARCAnalysisUtils.h @@ -51,25 +51,25 @@ extern bool EnableARCOpts; /// on. inline bool ModuleHasARC(const Module &M) { return - M.getNamedValue("objc_retain") || - M.getNamedValue("objc_release") || - M.getNamedValue("objc_autorelease") || - M.getNamedValue("objc_retainAutoreleasedReturnValue") || - M.getNamedValue("objc_unsafeClaimAutoreleasedReturnValue") || - M.getNamedValue("objc_retainBlock") || - M.getNamedValue("objc_autoreleaseReturnValue") || - M.getNamedValue("objc_autoreleasePoolPush") || - M.getNamedValue("objc_loadWeakRetained") || - M.getNamedValue("objc_loadWeak") || - M.getNamedValue("objc_destroyWeak") || - M.getNamedValue("objc_storeWeak") || - M.getNamedValue("objc_initWeak") || - M.getNamedValue("objc_moveWeak") || - M.getNamedValue("objc_copyWeak") || - M.getNamedValue("objc_retainedObject") || - M.getNamedValue("objc_unretainedObject") || - M.getNamedValue("objc_unretainedPointer") || - M.getNamedValue("clang.arc.use"); + M.getNamedValue("llvm.objc.retain") || + M.getNamedValue("llvm.objc.release") || + M.getNamedValue("llvm.objc.autorelease") || + M.getNamedValue("llvm.objc.retainAutoreleasedReturnValue") || + M.getNamedValue("llvm.objc.unsafeClaimAutoreleasedReturnValue") || + M.getNamedValue("llvm.objc.retainBlock") || + M.getNamedValue("llvm.objc.autoreleaseReturnValue") || + M.getNamedValue("llvm.objc.autoreleasePoolPush") || + M.getNamedValue("llvm.objc.loadWeakRetained") || + M.getNamedValue("llvm.objc.loadWeak") || + M.getNamedValue("llvm.objc.destroyWeak") || + M.getNamedValue("llvm.objc.storeWeak") || + M.getNamedValue("llvm.objc.initWeak") || + M.getNamedValue("llvm.objc.moveWeak") || + M.getNamedValue("llvm.objc.copyWeak") || + M.getNamedValue("llvm.objc.retainedObject") || + M.getNamedValue("llvm.objc.unretainedObject") || + M.getNamedValue("llvm.objc.unretainedPointer") || + M.getNamedValue("llvm.objc.clang.arc.use"); } /// This is a wrapper around getUnderlyingObject which also knows how to diff --git a/include/llvm/Analysis/ObjCARCInstKind.h b/include/llvm/Analysis/ObjCARCInstKind.h index 0b92d8b4835601aaf798fbc4075d0174eb84a785..018ea1f851bebaf35c5ef36f6d61e00ce1a225aa 100644 --- a/include/llvm/Analysis/ObjCARCInstKind.h +++ b/include/llvm/Analysis/ObjCARCInstKind.h @@ -11,6 +11,7 @@ #define LLVM_ANALYSIS_OBJCARCINSTKIND_H #include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Instructions.h" namespace llvm { @@ -48,7 +49,7 @@ enum class ARCInstKind { CopyWeak, ///< objc_copyWeak (derived) DestroyWeak, ///< objc_destroyWeak (derived) StoreStrong, ///< objc_storeStrong (derived) - IntrinsicUser, ///< clang.arc.use + IntrinsicUser, ///< llvm.objc.clang.arc.use CallOrUser, ///< could call objc_release and/or "use" pointers Call, ///< could call objc_release User, ///< could "use" a pointer diff --git a/include/llvm/Analysis/ProfileSummaryInfo.h b/include/llvm/Analysis/ProfileSummaryInfo.h index 58b67e74ba5130074615e02d903167d4cc59613b..3aef4be72d71f49e0d7d0b07abcd874dcfed2132 100644 --- a/include/llvm/Analysis/ProfileSummaryInfo.h +++ b/include/llvm/Analysis/ProfileSummaryInfo.h @@ -98,14 +98,14 @@ public: bool isFunctionEntryCold(const Function *F); /// Returns true if \p F contains only cold code. bool isFunctionColdInCallGraph(const Function *F, BlockFrequencyInfo &BFI); - /// Returns true if \p F is a hot function. + /// Returns true if count \p C is considered hot. bool isHotCount(uint64_t C); /// Returns true if count \p C is considered cold. bool isColdCount(uint64_t C); - /// Returns true if BasicBlock \p B is considered hot. - bool isHotBB(const BasicBlock *B, BlockFrequencyInfo *BFI); - /// Returns true if BasicBlock \p B is considered cold. - bool isColdBB(const BasicBlock *B, BlockFrequencyInfo *BFI); + /// Returns true if BasicBlock \p BB is considered hot. + bool isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI); + /// Returns true if BasicBlock \p BB is considered cold. + bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI); /// Returns true if CallSite \p CS is considered hot. bool isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI); /// Returns true if Callsite \p CS is considered cold. @@ -134,9 +134,8 @@ public: static char ID; ProfileSummaryInfoWrapperPass(); - ProfileSummaryInfo *getPSI() { - return &*PSI; - } + ProfileSummaryInfo &getPSI() { return *PSI; } + const ProfileSummaryInfo &getPSI() const { return *PSI; } bool doInitialization(Module &M) override; bool doFinalization(Module &M) override; diff --git a/include/llvm/Analysis/StackSafetyAnalysis.h b/include/llvm/Analysis/StackSafetyAnalysis.h new file mode 100644 index 0000000000000000000000000000000000000000..8a151650a34c07e6922466aeab7154ac9054bed1 --- /dev/null +++ b/include/llvm/Analysis/StackSafetyAnalysis.h @@ -0,0 +1,120 @@ +//===- StackSafetyAnalysis.h - Stack memory safety analysis -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Stack Safety Analysis detects allocas and arguments with safe access. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_STACKSAFETYANALYSIS_H +#define LLVM_ANALYSIS_STACKSAFETYANALYSIS_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" + +namespace llvm { + +/// Interface to access stack safety analysis results for single function. +class StackSafetyInfo { +public: + struct FunctionInfo; + +private: + std::unique_ptr Info; + +public: + StackSafetyInfo(); + StackSafetyInfo(FunctionInfo &&Info); + StackSafetyInfo(StackSafetyInfo &&); + StackSafetyInfo &operator=(StackSafetyInfo &&); + ~StackSafetyInfo(); + + // TODO: Add useful for client methods. + void print(raw_ostream &O) const; +}; + +/// StackSafetyInfo wrapper for the new pass manager. +class StackSafetyAnalysis : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + using Result = StackSafetyInfo; + StackSafetyInfo run(Function &F, FunctionAnalysisManager &AM); +}; + +/// Printer pass for the \c StackSafetyAnalysis results. +class StackSafetyPrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit StackSafetyPrinterPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// StackSafetyInfo wrapper for the legacy pass manager +class StackSafetyInfoWrapperPass : public FunctionPass { + StackSafetyInfo SSI; + +public: + static char ID; + StackSafetyInfoWrapperPass(); + + const StackSafetyInfo &getResult() const { return SSI; } + + void print(raw_ostream &O, const Module *M) const override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + bool runOnFunction(Function &F) override; +}; + +using StackSafetyGlobalInfo = std::map; + +/// This pass performs the global (interprocedural) stack safety analysis (new +/// pass manager). +class StackSafetyGlobalAnalysis + : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + using Result = StackSafetyGlobalInfo; + Result run(Module &M, ModuleAnalysisManager &AM); +}; + +/// Printer pass for the \c StackSafetyGlobalAnalysis results. +class StackSafetyGlobalPrinterPass + : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit StackSafetyGlobalPrinterPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +/// This pass performs the global (interprocedural) stack safety analysis +/// (legacy pass manager). +class StackSafetyGlobalInfoWrapperPass : public ModulePass { + StackSafetyGlobalInfo SSI; + +public: + static char ID; + + StackSafetyGlobalInfoWrapperPass(); + + const StackSafetyGlobalInfo &getResult() const { return SSI; } + + void print(raw_ostream &O, const Module *M) const override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + bool runOnModule(Module &M) override; +}; + +} // end namespace llvm + +#endif // LLVM_ANALYSIS_STACKSAFETYANALYSIS_H diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index eb0e0270157fd05f3eaf4b36f3aeb6e86b3fcd9d..6ddea8bbdce09ffe142ab7cecea912aa3d499b90 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -744,9 +744,9 @@ public: /// and the number of execution units in the CPU. unsigned getMaxInterleaveFactor(unsigned VF) const; - /// Collect properties of V used in cost analyzis, e.g. OP_PowerOf2. - OperandValueKind getOperandInfo(Value *V, - OperandValueProperties &OpProps) const; + /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2. + static OperandValueKind getOperandInfo(Value *V, + OperandValueProperties &OpProps); /// This is an approximation of reciprocal throughput of a math/logic op. /// A higher cost indicates less expected throughput. diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index a92ba8e374f09282558cf7b59ca95f94ff2aaae4..7c2e9a65ac27dcdf653e2818b6455186604c76b6 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -610,6 +610,12 @@ class Value; Optional isImpliedCondition(const Value *LHS, const Value *RHS, const DataLayout &DL, bool LHSIsTrue = true, unsigned Depth = 0); + + /// Return the boolean condition value in the context of the given instruction + /// if it is known based on dominating conditions. + Optional isImpliedByDomCondition(const Value *Cond, + const Instruction *ContextI, + const DataLayout &DL); } // end namespace llvm #endif // LLVM_ANALYSIS_VALUETRACKING_H diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h index 797260f439a04078787fad427f874067d3e96980..5d44a52d4c53c64e1ab57bb09b10b2b8200b9cdc 100644 --- a/include/llvm/Analysis/VectorUtils.h +++ b/include/llvm/Analysis/VectorUtils.h @@ -24,7 +24,7 @@ namespace llvm { template class ArrayRef; class DemandedBits; class GetElementPtrInst; -class InterleaveGroup; +template class InterleaveGroup; class Loop; class ScalarEvolution; class TargetTransformInfo; @@ -138,7 +138,7 @@ Instruction *propagateMetadata(Instruction *I, ArrayRef VL); /// create[*]Mask() utilities which create a shuffle mask (mask that /// consists of indices). Constant *createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF, - const InterleaveGroup &Group); + const InterleaveGroup &Group); /// Create a mask with replicated elements. /// @@ -233,9 +233,12 @@ Value *concatenateVectors(IRBuilder<> &Builder, ArrayRef Vecs); /// /// Note: the interleaved load group could have gaps (missing members), but /// the interleaved store group doesn't allow gaps. -class InterleaveGroup { +template class InterleaveGroup { public: - InterleaveGroup(Instruction *Instr, int Stride, unsigned Align) + InterleaveGroup(unsigned Factor, bool Reverse, unsigned Align) + : Factor(Factor), Reverse(Reverse), Align(Align), InsertPos(nullptr) {} + + InterleaveGroup(InstTy *Instr, int Stride, unsigned Align) : Align(Align), InsertPos(Instr) { assert(Align && "The alignment should be non-zero"); @@ -256,7 +259,7 @@ public: /// negative if it is the new leader. /// /// \returns false if the instruction doesn't belong to the group. - bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) { + bool insertMember(InstTy *Instr, int Index, unsigned NewAlign) { assert(NewAlign && "The new member's alignment should be non-zero"); int Key = Index + SmallestKey; @@ -288,7 +291,7 @@ public: /// Get the member with the given index \p Index /// /// \returns nullptr if contains no such member. - Instruction *getMember(unsigned Index) const { + InstTy *getMember(unsigned Index) const { int Key = SmallestKey + Index; auto Member = Members.find(Key); if (Member == Members.end()) @@ -299,16 +302,17 @@ public: /// Get the index for the given member. Unlike the key in the member /// map, the index starts from 0. - unsigned getIndex(Instruction *Instr) const { - for (auto I : Members) + unsigned getIndex(const InstTy *Instr) const { + for (auto I : Members) { if (I.second == Instr) return I.first - SmallestKey; + } llvm_unreachable("InterleaveGroup contains no such member"); } - Instruction *getInsertPos() const { return InsertPos; } - void setInsertPos(Instruction *Inst) { InsertPos = Inst; } + InstTy *getInsertPos() const { return InsertPos; } + void setInsertPos(InstTy *Inst) { InsertPos = Inst; } /// Add metadata (e.g. alias info) from the instructions in this group to \p /// NewInst. @@ -316,12 +320,7 @@ public: /// FIXME: this function currently does not add noalias metadata a'la /// addNewMedata. To do that we need to compute the intersection of the /// noalias info from all members. - void addMetadata(Instruction *NewInst) const { - SmallVector VL; - std::transform(Members.begin(), Members.end(), std::back_inserter(VL), - [](std::pair p) { return p.second; }); - propagateMetadata(NewInst, VL); - } + void addMetadata(InstTy *NewInst) const; /// Returns true if this Group requires a scalar iteration to handle gaps. bool requiresScalarEpilogue() const { @@ -344,7 +343,7 @@ private: unsigned Factor; // Interleave Factor. bool Reverse; unsigned Align; - DenseMap Members; + DenseMap Members; int SmallestKey = 0; int LargestKey = 0; @@ -359,7 +358,7 @@ private: // store i32 %even // %odd = add i32 // Def of %odd // store i32 %odd // Insert Position - Instruction *InsertPos; + InstTy *InsertPos; }; /// Drive the analysis of interleaved memory accesses in the loop. @@ -390,7 +389,7 @@ public: /// formation for predicated accesses, we may be able to relax this limitation /// in the future once we handle more complicated blocks. void reset() { - SmallPtrSet DelSet; + SmallPtrSet *, 4> DelSet; // Avoid releasing a pointer twice. for (auto &I : InterleaveGroupMap) DelSet.insert(I.second); @@ -409,11 +408,16 @@ public: /// Get the interleave group that \p Instr belongs to. /// /// \returns nullptr if doesn't have such group. - InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { - auto Group = InterleaveGroupMap.find(Instr); - if (Group == InterleaveGroupMap.end()) - return nullptr; - return Group->second; + InterleaveGroup * + getInterleaveGroup(const Instruction *Instr) const { + if (InterleaveGroupMap.count(Instr)) + return InterleaveGroupMap.find(Instr)->second; + return nullptr; + } + + iterator_range *>> + getInterleaveGroups() { + return make_range(InterleaveGroups.begin(), InterleaveGroups.end()); } /// Returns true if an interleaved group that may access memory @@ -443,7 +447,9 @@ private: bool RequiresScalarEpilogue = false; /// Holds the relationships between the members and the interleave group. - DenseMap InterleaveGroupMap; + DenseMap *> InterleaveGroupMap; + + SmallPtrSet *, 4> InterleaveGroups; /// Holds dependences among the memory accesses in the loop. It maps a source /// access to a set of dependent sink accesses. @@ -476,19 +482,23 @@ private: /// stride \p Stride and alignment \p Align. /// /// \returns the newly created interleave group. - InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride, - unsigned Align) { - assert(!isInterleaved(Instr) && "Already in an interleaved access group"); - InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align); + InterleaveGroup * + createInterleaveGroup(Instruction *Instr, int Stride, unsigned Align) { + assert(!InterleaveGroupMap.count(Instr) && + "Already in an interleaved access group"); + InterleaveGroupMap[Instr] = + new InterleaveGroup(Instr, Stride, Align); + InterleaveGroups.insert(InterleaveGroupMap[Instr]); return InterleaveGroupMap[Instr]; } /// Release the group and remove all the relationships. - void releaseGroup(InterleaveGroup *Group) { + void releaseGroup(InterleaveGroup *Group) { for (unsigned i = 0; i < Group->getFactor(); i++) if (Instruction *Member = Group->getMember(i)) InterleaveGroupMap.erase(Member); + InterleaveGroups.erase(Group); delete Group; } diff --git a/include/llvm/BinaryFormat/AMDGPUMetadataVerifier.h b/include/llvm/BinaryFormat/AMDGPUMetadataVerifier.h new file mode 100644 index 0000000000000000000000000000000000000000..de44f41720edb3369350ac1d95fc1a4700971e5e --- /dev/null +++ b/include/llvm/BinaryFormat/AMDGPUMetadataVerifier.h @@ -0,0 +1,70 @@ +//===- AMDGPUMetadataVerifier.h - MsgPack Types -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This is a verifier for AMDGPU HSA metadata, which can verify both +/// well-typed metadata and untyped metadata. When verifying in the non-strict +/// mode, untyped metadata is coerced into the correct type if possible. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_BINARYFORMAT_AMDGPUMETADATAVERIFIER_H +#define LLVM_BINARYFORMAT_AMDGPUMETADATAVERIFIER_H + +#include "llvm/BinaryFormat/MsgPackTypes.h" + +namespace llvm { +namespace AMDGPU { +namespace HSAMD { +namespace V3 { + +/// Verifier for AMDGPU HSA metadata. +/// +/// Operates in two modes: +/// +/// In strict mode, metadata must already be well-typed. +/// +/// In non-strict mode, metadata is coerced into expected types when possible. +class MetadataVerifier { + bool Strict; + + bool verifyScalar(msgpack::Node &Node, msgpack::ScalarNode::ScalarKind SKind, + function_ref verifyValue = {}); + bool verifyInteger(msgpack::Node &Node); + bool verifyArray(msgpack::Node &Node, + function_ref verifyNode, + Optional Size = None); + bool verifyEntry(msgpack::MapNode &MapNode, StringRef Key, bool Required, + function_ref verifyNode); + bool + verifyScalarEntry(msgpack::MapNode &MapNode, StringRef Key, bool Required, + msgpack::ScalarNode::ScalarKind SKind, + function_ref verifyValue = {}); + bool verifyIntegerEntry(msgpack::MapNode &MapNode, StringRef Key, + bool Required); + bool verifyKernelArgs(msgpack::Node &Node); + bool verifyKernel(msgpack::Node &Node); + +public: + /// Construct a MetadataVerifier, specifying whether it will operate in \p + /// Strict mode. + MetadataVerifier(bool Strict) : Strict(Strict) {} + + /// Verify given HSA metadata. + /// + /// \returns True when successful, false when metadata is invalid. + bool verify(msgpack::Node &HSAMetadataRoot); +}; + +} // end namespace V3 +} // end namespace HSAMD +} // end namespace AMDGPU +} // end namespace llvm + +#endif // LLVM_BINARYFORMAT_AMDGPUMETADATAVERIFIER_H diff --git a/include/llvm/BinaryFormat/Dwarf.def b/include/llvm/BinaryFormat/Dwarf.def index 512cc64926db5a6e0692895a6af1660c0e9d6a8f..cb9f7f5145d1fbc48ea71945e5a083dc1f18b01c 100644 --- a/include/llvm/BinaryFormat/Dwarf.def +++ b/include/llvm/BinaryFormat/Dwarf.def @@ -18,7 +18,8 @@ defined HANDLE_DW_VIRTUALITY || defined HANDLE_DW_DEFAULTED || \ defined HANDLE_DW_CC || defined HANDLE_DW_LNS || defined HANDLE_DW_LNE || \ defined HANDLE_DW_LNCT || defined HANDLE_DW_MACRO || \ - defined HANDLE_DW_RLE || defined HANDLE_DW_CFA || \ + defined HANDLE_DW_RLE || \ + (defined HANDLE_DW_CFA && defined HANDLE_DW_CFA_PRED) || \ defined HANDLE_DW_APPLE_PROPERTY || defined HANDLE_DW_UT || \ defined HANDLE_DWARF_SECTION || defined HANDLE_DW_IDX || \ defined HANDLE_DW_END) @@ -85,6 +86,10 @@ #define HANDLE_DW_CFA(ID, NAME) #endif +#ifndef HANDLE_DW_CFA_PRED +#define HANDLE_DW_CFA_PRED(ID, NAME, PRED) +#endif + #ifndef HANDLE_DW_APPLE_PROPERTY #define HANDLE_DW_APPLE_PROPERTY(ID, NAME) #endif @@ -831,9 +836,10 @@ HANDLE_DW_CFA(0x14, val_offset) HANDLE_DW_CFA(0x15, val_offset_sf) HANDLE_DW_CFA(0x16, val_expression) // Vendor extensions: -HANDLE_DW_CFA(0x1d, MIPS_advance_loc8) -HANDLE_DW_CFA(0x2d, GNU_window_save) -HANDLE_DW_CFA(0x2e, GNU_args_size) +HANDLE_DW_CFA_PRED(0x1d, MIPS_advance_loc8, SELECT_MIPS64) +HANDLE_DW_CFA_PRED(0x2d, GNU_window_save, SELECT_SPARC) +HANDLE_DW_CFA_PRED(0x2d, AARCH64_negate_ra_state, SELECT_AARCH64) +HANDLE_DW_CFA_PRED(0x2e, GNU_args_size, SELECT_X86) // Apple Objective-C Property Attributes. // Keep this list in sync with clang's DeclSpec.h ObjCPropertyAttributeKind! @@ -916,6 +922,7 @@ HANDLE_DW_IDX(0x05, type_hash) #undef HANDLE_DW_MACRO #undef HANDLE_DW_RLE #undef HANDLE_DW_CFA +#undef HANDLE_DW_CFA_PRED #undef HANDLE_DW_APPLE_PROPERTY #undef HANDLE_DW_UT #undef HANDLE_DWARF_SECTION diff --git a/include/llvm/BinaryFormat/Dwarf.h b/include/llvm/BinaryFormat/Dwarf.h index 330e31cffae678d3c7dea13a4729e6b9b3ae7edf..c25b6cd0966b13ef37982833fb3591b81c18ae7a 100644 --- a/include/llvm/BinaryFormat/Dwarf.h +++ b/include/llvm/BinaryFormat/Dwarf.h @@ -26,6 +26,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormatVariadicDetails.h" +#include "llvm/ADT/Triple.h" namespace llvm { class StringRef; @@ -272,6 +273,7 @@ enum RangeListEntries { /// Call frame instruction encodings. enum CallFrameInfo { #define HANDLE_DW_CFA(ID, NAME) DW_CFA_##NAME = ID, +#define HANDLE_DW_CFA_PRED(ID, NAME, ARCH) DW_CFA_##NAME = ID, #include "llvm/BinaryFormat/Dwarf.def" DW_CFA_extended = 0x00, @@ -430,7 +432,7 @@ StringRef LNStandardString(unsigned Standard); StringRef LNExtendedString(unsigned Encoding); StringRef MacinfoString(unsigned Encoding); StringRef RangeListEncodingString(unsigned Encoding); -StringRef CallFrameString(unsigned Encoding); +StringRef CallFrameString(unsigned Encoding, Triple::ArchType Arch); StringRef ApplePropertyString(unsigned); StringRef UnitTypeString(unsigned); StringRef AtomTypeString(unsigned Atom); diff --git a/include/llvm/BinaryFormat/ELF.h b/include/llvm/BinaryFormat/ELF.h index ebbf830a60e9db9318dcc2d8a580da094250e74e..ce35d127d43390e764795c22c98bed1ac073e464 100644 --- a/include/llvm/BinaryFormat/ELF.h +++ b/include/llvm/BinaryFormat/ELF.h @@ -582,6 +582,7 @@ enum { EF_HEXAGON_MACH_V60 = 0x00000060, // Hexagon V60 EF_HEXAGON_MACH_V62 = 0x00000062, // Hexagon V62 EF_HEXAGON_MACH_V65 = 0x00000065, // Hexagon V65 + EF_HEXAGON_MACH_V66 = 0x00000066, // Hexagon V66 // Highest ISA version flags EF_HEXAGON_ISA_MACH = 0x00000000, // Same as specified in bits[11:0] @@ -594,6 +595,7 @@ enum { EF_HEXAGON_ISA_V60 = 0x00000060, // Hexagon V60 ISA EF_HEXAGON_ISA_V62 = 0x00000062, // Hexagon V62 ISA EF_HEXAGON_ISA_V65 = 0x00000065, // Hexagon V65 ISA + EF_HEXAGON_ISA_V66 = 0x00000066, // Hexagon V66 ISA }; // Hexagon-specific section indexes for common small data @@ -1359,7 +1361,7 @@ enum { GNU_PROPERTY_X86_FEATURE_1_SHSTK = 1 << 1 }; -// AMDGPU specific notes. +// AMD specific notes. (Code Object V2) enum { // Note types with values between 0 and 9 (inclusive) are reserved. NT_AMD_AMDGPU_HSA_METADATA = 10, @@ -1367,6 +1369,12 @@ enum { NT_AMD_AMDGPU_PAL_METADATA = 12 }; +// AMDGPU specific notes. (Code Object V3) +enum { + // Note types with values between 0 and 31 (inclusive) are reserved. + NT_AMDGPU_METADATA = 32 +}; + enum { GNU_ABI_TAG_LINUX = 0, GNU_ABI_TAG_HURD = 1, @@ -1377,6 +1385,8 @@ enum { GNU_ABI_TAG_NACL = 6, }; +constexpr const char *ELF_NOTE_GNU = "GNU"; + // Android packed relocation group flags. enum { RELOCATION_GROUPED_BY_INFO_FLAG = 1, diff --git a/include/llvm/BinaryFormat/MsgPackTypes.h b/include/llvm/BinaryFormat/MsgPackTypes.h new file mode 100644 index 0000000000000000000000000000000000000000..f96cd4c338fd9e55edccbe0d784deefdcc0755ff --- /dev/null +++ b/include/llvm/BinaryFormat/MsgPackTypes.h @@ -0,0 +1,372 @@ +//===- MsgPackTypes.h - MsgPack Types ---------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This is a data structure for representing MessagePack "documents", with +/// methods to go to and from MessagePack. The types also specialize YAMLIO +/// traits in order to go to and from YAML. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Optional.h" +#include "llvm/BinaryFormat/MsgPackReader.h" +#include "llvm/BinaryFormat/MsgPackWriter.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/YAMLTraits.h" +#include + +#ifndef LLVM_BINARYFORMAT_MSGPACKTYPES_H +#define LLVM_BINARYFORMAT_MSGPACKTYPES_H + +namespace llvm { +namespace msgpack { + +class Node; + +/// Short-hand for a Node pointer. +using NodePtr = std::shared_ptr; + +/// Short-hand for an Optional Node pointer. +using OptNodePtr = Optional; + +/// Abstract base-class which can be any MessagePack type. +class Node { +public: + enum NodeKind { + NK_Scalar, + NK_Array, + NK_Map, + }; + +private: + virtual void anchor() = 0; + const NodeKind Kind; + + static Expected readArray(Reader &MPReader, size_t Length); + static Expected readMap(Reader &MPReader, size_t Length); + +public: + NodeKind getKind() const { return Kind; } + + /// Construct a Node. Used by derived classes to track kind information. + Node(NodeKind Kind) : Kind(Kind) {} + + virtual ~Node() = default; + + /// Read from a MessagePack reader \p MPReader, returning an error if one is + /// encountered, or None if \p MPReader is at the end of stream, or some Node + /// pointer if some type is read. + static Expected read(Reader &MPReader); + + /// Write to a MessagePack writer \p MPWriter. + virtual void write(Writer &MPWriter) = 0; +}; + +/// A MessagePack scalar. +class ScalarNode : public Node { +public: + enum ScalarKind { + SK_Int, + SK_UInt, + SK_Nil, + SK_Boolean, + SK_Float, + SK_String, + SK_Binary, + }; + +private: + void anchor() override; + + void destroy(); + + ScalarKind SKind; + + union { + int64_t IntValue; + uint64_t UIntValue; + bool BoolValue; + double FloatValue; + std::string StringValue; + }; + +public: + /// Construct an Int ScalarNode. + ScalarNode(int64_t IntValue); + /// Construct an Int ScalarNode. + ScalarNode(int32_t IntValue); + /// Construct an UInt ScalarNode. + ScalarNode(uint64_t UIntValue); + /// Construct an UInt ScalarNode. + ScalarNode(uint32_t UIntValue); + /// Construct a Nil ScalarNode. + ScalarNode(); + /// Construct a Boolean ScalarNode. + ScalarNode(bool BoolValue); + /// Construct a Float ScalarNode. + ScalarNode(double FloatValue); + /// Construct a String ScalarNode. + ScalarNode(StringRef StringValue); + /// Construct a String ScalarNode. + ScalarNode(const char *StringValue); + /// Construct a String ScalarNode. + ScalarNode(std::string &&StringValue); + /// Construct a Binary ScalarNode. + ScalarNode(MemoryBufferRef BinaryValue); + + ~ScalarNode(); + + ScalarNode &operator=(const ScalarNode &RHS) = delete; + /// A ScalarNode can only be move assigned. + ScalarNode &operator=(ScalarNode &&RHS); + + /// Change the kind of this ScalarNode, zero initializing it to the new type. + void setScalarKind(ScalarKind SKind) { + switch (SKind) { + case SK_Int: + *this = int64_t(0); + break; + case SK_UInt: + *this = uint64_t(0); + break; + case SK_Boolean: + *this = false; + break; + case SK_Float: + *this = 0.0; + break; + case SK_String: + *this = StringRef(); + break; + case SK_Binary: + *this = MemoryBufferRef("", ""); + break; + case SK_Nil: + *this = ScalarNode(); + break; + } + } + + /// Get the current kind of ScalarNode. + ScalarKind getScalarKind() { return SKind; } + + /// Get the value of an Int scalar. + /// + /// \warning Assumes getScalarKind() == SK_Int + int64_t getInt() { + assert(SKind == SK_Int); + return IntValue; + } + + /// Get the value of a UInt scalar. + /// + /// \warning Assumes getScalarKind() == SK_UInt + uint64_t getUInt() { + assert(SKind == SK_UInt); + return UIntValue; + } + + /// Get the value of an Boolean scalar. + /// + /// \warning Assumes getScalarKind() == SK_Boolean + bool getBool() { + assert(SKind == SK_Boolean); + return BoolValue; + } + + /// Get the value of an Float scalar. + /// + /// \warning Assumes getScalarKind() == SK_Float + double getFloat() { + assert(SKind == SK_Float); + return FloatValue; + } + + /// Get the value of a String scalar. + /// + /// \warning Assumes getScalarKind() == SK_String + StringRef getString() { + assert(SKind == SK_String); + return StringValue; + } + + /// Get the value of a Binary scalar. + /// + /// \warning Assumes getScalarKind() == SK_Binary + StringRef getBinary() { + assert(SKind == SK_Binary); + return StringValue; + } + + static bool classof(const Node *N) { return N->getKind() == NK_Scalar; } + + void write(Writer &MPWriter) override; + + /// Parse a YAML scalar of the current ScalarKind from \p ScalarStr. + /// + /// \returns An empty string on success, otherwise an error message. + StringRef inputYAML(StringRef ScalarStr); + + /// Output a YAML scalar of the current ScalarKind into \p OS. + void outputYAML(raw_ostream &OS) const; + + /// Determine which YAML quoting type the current value would need when + /// output. + yaml::QuotingType mustQuoteYAML(StringRef ScalarStr) const; + + /// Get the YAML tag for the current ScalarKind. + StringRef getYAMLTag() const; + + /// Flag which affects how the type handles YAML tags when reading and + /// writing. + /// + /// When false, tags are used when reading and writing. When reading, the tag + /// is used to decide the ScalarKind before parsing. When writing, the tag is + /// output along with the value. + /// + /// When true, tags are ignored when reading and writing. When reading, the + /// ScalarKind is always assumed to be String. When writing, the tag is not + /// output. + bool IgnoreTag = false; + + static const char *IntTag; + static const char *NilTag; + static const char *BooleanTag; + static const char *FloatTag; + static const char *StringTag; + static const char *BinaryTag; +}; + +class ArrayNode : public Node, public std::vector { + void anchor() override; + +public: + ArrayNode() : Node(NK_Array) {} + static bool classof(const Node *N) { return N->getKind() == NK_Array; } + + void write(Writer &MPWriter) override { + MPWriter.writeArraySize(this->size()); + for (auto &N : *this) + N->write(MPWriter); + } +}; + +class MapNode : public Node, public StringMap { + void anchor() override; + +public: + MapNode() : Node(NK_Map) {} + static bool classof(const Node *N) { return N->getKind() == NK_Map; } + + void write(Writer &MPWriter) override { + MPWriter.writeMapSize(this->size()); + for (auto &N : *this) { + MPWriter.write(N.first()); + N.second->write(MPWriter); + } + } +}; + +} // end namespace msgpack + +namespace yaml { + +template <> struct PolymorphicTraits { + static NodeKind getKind(const msgpack::NodePtr &N) { + if (isa(*N)) + return NodeKind::Scalar; + if (isa(*N)) + return NodeKind::Map; + if (isa(*N)) + return NodeKind::Sequence; + llvm_unreachable("NodeKind not supported"); + } + static msgpack::ScalarNode &getAsScalar(msgpack::NodePtr &N) { + if (!N || !isa(*N)) + N.reset(new msgpack::ScalarNode()); + return *cast(N.get()); + } + static msgpack::MapNode &getAsMap(msgpack::NodePtr &N) { + if (!N || !isa(*N)) + N.reset(new msgpack::MapNode()); + return *cast(N.get()); + } + static msgpack::ArrayNode &getAsSequence(msgpack::NodePtr &N) { + if (!N || !isa(*N)) + N.reset(new msgpack::ArrayNode()); + return *cast(N.get()); + } +}; + +template <> struct TaggedScalarTraits { + static void output(const msgpack::ScalarNode &S, void *Ctxt, + raw_ostream &ScalarOS, raw_ostream &TagOS) { + if (!S.IgnoreTag) + TagOS << S.getYAMLTag(); + S.outputYAML(ScalarOS); + } + + static StringRef input(StringRef ScalarStr, StringRef Tag, void *Ctxt, + msgpack::ScalarNode &S) { + if (Tag == msgpack::ScalarNode::IntTag) { + S.setScalarKind(msgpack::ScalarNode::SK_UInt); + if (S.inputYAML(ScalarStr) == StringRef()) + return StringRef(); + S.setScalarKind(msgpack::ScalarNode::SK_Int); + return S.inputYAML(ScalarStr); + } + + if (S.IgnoreTag || Tag == msgpack::ScalarNode::StringTag || + Tag == "tag:yaml.org,2002:str") + S.setScalarKind(msgpack::ScalarNode::SK_String); + else if (Tag == msgpack::ScalarNode::NilTag) + S.setScalarKind(msgpack::ScalarNode::SK_Nil); + else if (Tag == msgpack::ScalarNode::BooleanTag) + S.setScalarKind(msgpack::ScalarNode::SK_Boolean); + else if (Tag == msgpack::ScalarNode::FloatTag) + S.setScalarKind(msgpack::ScalarNode::SK_Float); + else if (Tag == msgpack::ScalarNode::StringTag) + S.setScalarKind(msgpack::ScalarNode::SK_String); + else if (Tag == msgpack::ScalarNode::BinaryTag) + S.setScalarKind(msgpack::ScalarNode::SK_Binary); + else + return "Unsupported messagepack tag"; + + return S.inputYAML(ScalarStr); + } + + static QuotingType mustQuote(const msgpack::ScalarNode &S, StringRef Str) { + return S.mustQuoteYAML(Str); + } +}; + +template <> struct CustomMappingTraits { + static void inputOne(IO &IO, StringRef Key, msgpack::MapNode &M) { + IO.mapRequired(Key.str().c_str(), M[Key]); + } + static void output(IO &IO, msgpack::MapNode &M) { + for (auto &N : M) + IO.mapRequired(N.getKey().str().c_str(), N.getValue()); + } +}; + +template <> struct SequenceTraits { + static size_t size(IO &IO, msgpack::ArrayNode &A) { return A.size(); } + static msgpack::NodePtr &element(IO &IO, msgpack::ArrayNode &A, + size_t Index) { + if (Index >= A.size()) + A.resize(Index + 1); + return A[Index]; + } +}; + +} // end namespace yaml +} // end namespace llvm + +#endif // LLVM_BINARYFORMAT_MSGPACKTYPES_H diff --git a/include/llvm/BinaryFormat/Wasm.h b/include/llvm/BinaryFormat/Wasm.h index 3d25c9d15e4e7ff2fe73c716fd0425efa443bc06..ca752374e157087a22688c45d5e350bb99c52920 100644 --- a/include/llvm/BinaryFormat/Wasm.h +++ b/include/llvm/BinaryFormat/Wasm.h @@ -35,6 +35,14 @@ struct WasmObjectHeader { uint32_t Version; }; +struct WasmDylinkInfo { + uint32_t MemorySize; // Memory size in bytes + uint32_t MemoryAlignment; // P2 alignment of memory + uint32_t TableSize; // Table size in elements + uint32_t TableAlignment; // P2 alignment of table + std::vector Needed; // Shared library depenedencies +}; + struct WasmExport { StringRef Name; uint8_t Kind; @@ -75,6 +83,18 @@ struct WasmGlobal { StringRef SymbolName; // from the "linking" section }; +struct WasmEventType { + // Kind of event. Currently only WASM_EVENT_ATTRIBUTE_EXCEPTION is possible. + uint32_t Attribute; + uint32_t SigIndex; +}; + +struct WasmEvent { + uint32_t Index; + WasmEventType Type; + StringRef SymbolName; // from the "linking" section +}; + struct WasmImport { StringRef Module; StringRef Field; @@ -84,6 +104,7 @@ struct WasmImport { WasmGlobalType Global; WasmTable Table; WasmLimits Memory; + WasmEventType Event; }; }; @@ -167,18 +188,20 @@ struct WasmLinkingData { }; enum : unsigned { - WASM_SEC_CUSTOM = 0, // Custom / User-defined section - WASM_SEC_TYPE = 1, // Function signature declarations - WASM_SEC_IMPORT = 2, // Import declarations - WASM_SEC_FUNCTION = 3, // Function declarations - WASM_SEC_TABLE = 4, // Indirect function table and other tables - WASM_SEC_MEMORY = 5, // Memory attributes - WASM_SEC_GLOBAL = 6, // Global declarations - WASM_SEC_EXPORT = 7, // Exports - WASM_SEC_START = 8, // Start function declaration - WASM_SEC_ELEM = 9, // Elements section - WASM_SEC_CODE = 10, // Function bodies (code) - WASM_SEC_DATA = 11 // Data segments + WASM_SEC_CUSTOM = 0, // Custom / User-defined section + WASM_SEC_TYPE = 1, // Function signature declarations + WASM_SEC_IMPORT = 2, // Import declarations + WASM_SEC_FUNCTION = 3, // Function declarations + WASM_SEC_TABLE = 4, // Indirect function table and other tables + WASM_SEC_MEMORY = 5, // Memory attributes + WASM_SEC_GLOBAL = 6, // Global declarations + WASM_SEC_EXPORT = 7, // Exports + WASM_SEC_START = 8, // Start function declaration + WASM_SEC_ELEM = 9, // Elements section + WASM_SEC_CODE = 10, // Function bodies (code) + WASM_SEC_DATA = 11, // Data segments + WASM_SEC_DATACOUNT = 12, // Data segment count + WASM_SEC_EVENT = 13 // Event declarations }; // Type immediate encodings used in various contexts. @@ -200,6 +223,7 @@ enum : unsigned { WASM_EXTERNAL_TABLE = 0x1, WASM_EXTERNAL_MEMORY = 0x2, WASM_EXTERNAL_GLOBAL = 0x3, + WASM_EXTERNAL_EVENT = 0x4, }; // Opcodes used in initializer expressions. @@ -243,6 +267,12 @@ enum WasmSymbolType : unsigned { WASM_SYMBOL_TYPE_DATA = 0x1, WASM_SYMBOL_TYPE_GLOBAL = 0x2, WASM_SYMBOL_TYPE_SECTION = 0x3, + WASM_SYMBOL_TYPE_EVENT = 0x4, +}; + +// Kinds of event attributes. +enum WasmEventAttribute : unsigned { + WASM_EVENT_ATTRIBUTE_EXCEPTION = 0x0, }; const unsigned WASM_SYMBOL_BINDING_MASK = 0x3; diff --git a/include/llvm/BinaryFormat/WasmRelocs.def b/include/llvm/BinaryFormat/WasmRelocs.def index 8ffd51e483f38bdf9cba9e289d7a0a16601f7987..b3a08e70c1d5d55b8333f4663c1bd50cc5655733 100644 --- a/include/llvm/BinaryFormat/WasmRelocs.def +++ b/include/llvm/BinaryFormat/WasmRelocs.def @@ -1,4 +1,3 @@ - #ifndef WASM_RELOC #error "WASM_RELOC must be defined" #endif @@ -13,3 +12,4 @@ WASM_RELOC(R_WEBASSEMBLY_TYPE_INDEX_LEB, 6) WASM_RELOC(R_WEBASSEMBLY_GLOBAL_INDEX_LEB, 7) WASM_RELOC(R_WEBASSEMBLY_FUNCTION_OFFSET_I32, 8) WASM_RELOC(R_WEBASSEMBLY_SECTION_OFFSET_I32, 9) +WASM_RELOC(R_WEBASSEMBLY_EVENT_INDEX_LEB, 10) diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h index 5467ecc26262a7f67dc08458e8ea714d62f21d49..f0d11e9c16894e849e6d83f21374e5eb8e13cdab 100644 --- a/include/llvm/Bitcode/LLVMBitCodes.h +++ b/include/llvm/Bitcode/LLVMBitCodes.h @@ -342,6 +342,7 @@ enum ConstantsCodes { CST_CODE_INLINEASM = 23, // INLINEASM: [sideeffect|alignstack| // asmdialect,asmstr,conststr] CST_CODE_CE_GEP_WITH_INRANGE_INDEX = 24, // [opty, flags, n x operands] + CST_CODE_CE_UNOP = 25, // CE_UNOP: [opcode, opval] }; /// CastOpcodes - These are values used in the bitcode files to encode which @@ -364,6 +365,14 @@ enum CastOpcodes { CAST_ADDRSPACECAST = 12 }; +/// UnaryOpcodes - These are values used in the bitcode files to encode which +/// unop a CST_CODE_CE_UNOP or a XXX refers to. The values of these enums +/// have no fixed relation to the LLVM IR enum values. Changing these will +/// break compatibility with old files. +enum UnaryOpcodes { + UNOP_NEG = 0 +}; + /// BinaryOpcodes - These are values used in the bitcode files to encode which /// binop a CST_CODE_CE_BINOP or a XXX refers to. The values of these enums /// have no fixed relation to the LLVM IR enum values. Changing these will @@ -524,6 +533,7 @@ enum FunctionCodes { // 53 is unused. // 54 is unused. FUNC_CODE_OPERAND_BUNDLE = 55, // OPERAND_BUNDLE: [tag#, value...] + FUNC_CODE_INST_UNOP = 56, // UNOP: [opcode, ty, opval] }; enum UseListCodes { diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h index 14bc08167825e3d7332d59f8457326b22f15446d..413901d218f961e1b70577f25925b3d77f52a1e2 100644 --- a/include/llvm/CodeGen/AsmPrinter.h +++ b/include/llvm/CodeGen/AsmPrinter.h @@ -71,6 +71,7 @@ class MCTargetOptions; class MDNode; class Module; class raw_ostream; +class StackMaps; class TargetLoweringObjectFile; class TargetMachine; @@ -137,6 +138,9 @@ private: static char ID; +protected: + /// Protected struct HandlerInfo and Handlers permit target extended + /// AsmPrinter adds their own handlers. struct HandlerInfo { AsmPrinterHandler *Handler; const char *TimerName; @@ -365,6 +369,9 @@ public: /// emit the proxies we previously omitted in EmitGlobalVariable. void emitGlobalGOTEquivs(); + /// Emit the stack maps. + void emitStackMaps(StackMaps &SM); + //===------------------------------------------------------------------===// // Overridable Hooks //===------------------------------------------------------------------===// @@ -542,7 +549,7 @@ public: /// /// \p Value - The value to emit. /// \p Size - The size of the integer (in bytes) to emit. - virtual void EmitDebugThreadLocal(const MCExpr *Value, unsigned Size) const; + virtual void EmitDebugValue(const MCExpr *Value, unsigned Size) const; //===------------------------------------------------------------------===// // Dwarf Lowering Routines @@ -652,6 +659,8 @@ private: void EmitLLVMUsedList(const ConstantArray *InitList); /// Emit llvm.ident metadata in an '.ident' directive. void EmitModuleIdents(Module &M); + /// Emit bytes for llvm.commandline metadata. + void EmitModuleCommandLines(Module &M); void EmitXXStructorList(const DataLayout &DL, const Constant *List, bool isCtor); diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h b/include/llvm/CodeGen/AsmPrinterHandler.h similarity index 92% rename from lib/CodeGen/AsmPrinter/AsmPrinterHandler.h rename to include/llvm/CodeGen/AsmPrinterHandler.h index f5ac95a20b10db2df8e6b84f0d2eecf47ca46a44..a8b13200dd4e33eba1301e9ec5e1ffb296ee59ad 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h +++ b/include/llvm/CodeGen/AsmPrinterHandler.h @@ -1,4 +1,4 @@ -//===-- lib/CodeGen/AsmPrinter/AsmPrinterHandler.h -------------*- C++ -*--===// +//===-- llvm/CodeGen/AsmPrinterHandler.h -----------------------*- C++ -*--===// // // The LLVM Compiler Infrastructure // @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H -#define LLVM_LIB_CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H +#ifndef LLVM_CODEGEN_ASMPRINTERHANDLER_H +#define LLVM_CODEGEN_ASMPRINTERHANDLER_H #include "llvm/Support/DataTypes.h" diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index 224a41bc2b7aba965904741c4567ffa942f01ea6..f105d887c397f3386eb1dfe9acd6019c17faafc4 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -118,6 +118,50 @@ private: return Cost; } + /// Estimate a cost of subvector extraction as a sequence of extract and + /// insert operations. + unsigned getExtractSubvectorOverhead(Type *Ty, int Index, Type *SubTy) { + assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() && + "Can only extract subvectors from vectors"); + int NumSubElts = SubTy->getVectorNumElements(); + assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() && + "SK_ExtractSubvector index out of range"); + + unsigned Cost = 0; + // Subvector extraction cost is equal to the cost of extracting element from + // the source type plus the cost of inserting them into the result vector + // type. + for (int i = 0; i != NumSubElts; ++i) { + Cost += static_cast(this)->getVectorInstrCost( + Instruction::ExtractElement, Ty, i + Index); + Cost += static_cast(this)->getVectorInstrCost( + Instruction::InsertElement, SubTy, i); + } + return Cost; + } + + /// Estimate a cost of subvector insertion as a sequence of extract and + /// insert operations. + unsigned getInsertSubvectorOverhead(Type *Ty, int Index, Type *SubTy) { + assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() && + "Can only insert subvectors into vectors"); + int NumSubElts = SubTy->getVectorNumElements(); + assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() && + "SK_InsertSubvector index out of range"); + + unsigned Cost = 0; + // Subvector insertion cost is equal to the cost of extracting element from + // the source type plus the cost of inserting them into the result vector + // type. + for (int i = 0; i != NumSubElts; ++i) { + Cost += static_cast(this)->getVectorInstrCost( + Instruction::ExtractElement, SubTy, i); + Cost += static_cast(this)->getVectorInstrCost( + Instruction::InsertElement, Ty, i + Index); + } + return Cost; + } + /// Local query method delegates up to T which *must* implement this! const TargetSubtargetInfo *getST() const { return static_cast(this)->getST(); @@ -579,9 +623,12 @@ public: case TTI::SK_PermuteSingleSrc: case TTI::SK_PermuteTwoSrc: return getPermuteShuffleOverhead(Tp); - default: - return 1; + case TTI::SK_ExtractSubvector: + return getExtractSubvectorOverhead(Tp, Index, SubTp); + case TTI::SK_InsertSubvector: + return getInsertSubvectorOverhead(Tp, Index, SubTp); } + llvm_unreachable("Unknown TTI::ShuffleKind"); } unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, @@ -961,6 +1008,7 @@ public: unsigned VF = 1) { unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1); assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type"); + auto *ConcreteTTI = static_cast(this); switch (IID) { default: { @@ -986,29 +1034,24 @@ public: ScalarizationCost += getOperandsScalarizationOverhead(Args, VF); } - return static_cast(this)-> - getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost); + return ConcreteTTI->getIntrinsicInstrCost(IID, RetTy, Types, FMF, + ScalarizationCost); } case Intrinsic::masked_scatter: { assert(VF == 1 && "Can't vectorize types here."); Value *Mask = Args[3]; bool VarMask = !isa(Mask); unsigned Alignment = cast(Args[2])->getZExtValue(); - return - static_cast(this)->getGatherScatterOpCost(Instruction::Store, - Args[0]->getType(), - Args[1], VarMask, - Alignment); + return ConcreteTTI->getGatherScatterOpCost( + Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment); } case Intrinsic::masked_gather: { assert(VF == 1 && "Can't vectorize types here."); Value *Mask = Args[2]; bool VarMask = !isa(Mask); unsigned Alignment = cast(Args[1])->getZExtValue(); - return - static_cast(this)->getGatherScatterOpCost(Instruction::Load, - RetTy, Args[0], VarMask, - Alignment); + return ConcreteTTI->getGatherScatterOpCost(Instruction::Load, RetTy, + Args[0], VarMask, Alignment); } case Intrinsic::experimental_vector_reduce_add: case Intrinsic::experimental_vector_reduce_mul: @@ -1024,6 +1067,45 @@ public: case Intrinsic::experimental_vector_reduce_umax: case Intrinsic::experimental_vector_reduce_umin: return getIntrinsicInstrCost(IID, RetTy, Args[0]->getType(), FMF); + case Intrinsic::fshl: + case Intrinsic::fshr: { + Value *X = Args[0]; + Value *Y = Args[1]; + Value *Z = Args[2]; + TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW; + TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX); + TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY); + TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ); + TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue; + OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2 + : TTI::OP_None; + // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) + // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + unsigned Cost = 0; + Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Or, RetTy); + Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Sub, RetTy); + Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Shl, RetTy, + OpKindX, OpKindZ, OpPropsX); + Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::LShr, RetTy, + OpKindY, OpKindZ, OpPropsY); + // Non-constant shift amounts requires a modulo. + if (OpKindZ != TTI::OK_UniformConstantValue && + OpKindZ != TTI::OK_NonUniformConstantValue) + Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::URem, RetTy, + OpKindZ, OpKindBW, OpPropsZ, + OpPropsBW); + // For non-rotates (X != Y) we must add shift-by-zero handling costs. + if (X != Y) { + Type *CondTy = Type::getInt1Ty(RetTy->getContext()); + if (RetVF > 1) + CondTy = VectorType::get(CondTy, RetVF); + Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, + CondTy, nullptr); + Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy, + CondTy, nullptr); + } + return Cost; + } } } @@ -1349,20 +1431,31 @@ public: ShuffleCost += (IsPairwise + 1) * ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); - ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty); + ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, SubTy); Ty = SubTy; ++LongVectorCount; } + + NumReduxLevels -= LongVectorCount; + // The minimal length of the vector is limited by the real length of vector // operations performed on the current platform. That's why several final // reduction operations are performed on the vectors with the same // architecture-dependent length. - ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) * + + // Non pairwise reductions need one shuffle per reduction level. Pairwise + // reductions need two shuffles on every level, but the last one. On that + // level one of the shuffles is <0, u, u, ...> which is identity. + unsigned NumShuffles = NumReduxLevels; + if (IsPairwise && NumReduxLevels >= 1) + NumShuffles += NumReduxLevels - 1; + ShuffleCost += NumShuffles * ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty); - ArithCost += (NumReduxLevels - LongVectorCount) * + ArithCost += NumReduxLevels * ConcreteTTI->getArithmeticInstrCost(Opcode, Ty); - return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true); + return ShuffleCost + ArithCost + + ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } /// Try to calculate op costs for min/max reduction operations. @@ -1393,37 +1486,45 @@ public: while (NumVecElts > MVTLen) { NumVecElts /= 2; Type *SubTy = VectorType::get(ScalarTy, NumVecElts); + CondTy = VectorType::get(ScalarCondTy, NumVecElts); + // Assume the pairwise shuffles add a cost. ShuffleCost += (IsPairwise + 1) * ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); MinMaxCost += - ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) + - ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, + ConcreteTTI->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, nullptr) + + ConcreteTTI->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy, nullptr); Ty = SubTy; - CondTy = VectorType::get(ScalarCondTy, NumVecElts); ++LongVectorCount; } + + NumReduxLevels -= LongVectorCount; + // The minimal length of the vector is limited by the real length of vector // operations performed on the current platform. That's why several final // reduction opertions are perfomed on the vectors with the same // architecture-dependent length. - ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) * + + // Non pairwise reductions need one shuffle per reduction level. Pairwise + // reductions need two shuffles on every level, but the last one. On that + // level one of the shuffles is <0, u, u, ...> which is identity. + unsigned NumShuffles = NumReduxLevels; + if (IsPairwise && NumReduxLevels >= 1) + NumShuffles += NumReduxLevels - 1; + ShuffleCost += NumShuffles * ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty); MinMaxCost += - (NumReduxLevels - LongVectorCount) * + NumReduxLevels * (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) + ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, nullptr)); - // Need 3 extractelement instructions for scalarization + an additional - // scalar select instruction. + // The last min/max should be in vector registers and we counted it above. + // So just need a single extractelement. return ShuffleCost + MinMaxCost + - 3 * getScalarizationOverhead(Ty, /*Insert=*/false, - /*Extract=*/true) + - ConcreteTTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, - ScalarCondTy, nullptr); + ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } unsigned getVectorSplitCost() { return 1; } diff --git a/include/llvm/CodeGen/GCs.h b/include/llvm/CodeGen/BuiltinGCs.h similarity index 56% rename from include/llvm/CodeGen/GCs.h rename to include/llvm/CodeGen/BuiltinGCs.h index 5207f801c84ea652f858ed3e70f3a4824346eb5d..1767922fb5ac0c7d77c7c5f3a8c4445cb5b99bb6 100644 --- a/include/llvm/CodeGen/GCs.h +++ b/include/llvm/CodeGen/BuiltinGCs.h @@ -1,4 +1,4 @@ -//===-- GCs.h - Garbage collector linkage hacks ---------------------------===// +//===-- BuiltinGCs.h - Garbage collector linkage hacks --------------------===// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This file contains hack functions to force linking in the GC components. +// This file contains hack functions to force linking in the builtin GC +// components. // //===----------------------------------------------------------------------===// @@ -15,32 +16,18 @@ #define LLVM_CODEGEN_GCS_H namespace llvm { -class GCStrategy; -class GCMetadataPrinter; /// FIXME: Collector instances are not useful on their own. These no longer /// serve any purpose except to link in the plugins. -/// Creates a CoreCLR-compatible garbage collector. -void linkCoreCLRGC(); - -/// Creates an ocaml-compatible garbage collector. -void linkOcamlGC(); +/// Ensure the definition of the builtin GCs gets linked in +void linkAllBuiltinGCs(); /// Creates an ocaml-compatible metadata printer. void linkOcamlGCPrinter(); -/// Creates an erlang-compatible garbage collector. -void linkErlangGC(); - /// Creates an erlang-compatible metadata printer. void linkErlangGCPrinter(); - -/// Creates a shadow stack garbage collector. This collector requires no code -/// generator support. -void linkShadowStackGC(); - -void linkStatepointExampleGC(); } #endif diff --git a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.h b/include/llvm/CodeGen/DbgEntityHistoryCalculator.h similarity index 92% rename from lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.h rename to include/llvm/CodeGen/DbgEntityHistoryCalculator.h index 660d2ff551bea061433ecf6af7290ca04765ea7b..befc28f084e7f5d5e8ab382095f2382d354324a0 100644 --- a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.h +++ b/include/llvm/CodeGen/DbgEntityHistoryCalculator.h @@ -1,4 +1,4 @@ -//===- llvm/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.h -----*- C++ -*-===// +//===- llvm/CodeGen/DbgEntityHistoryCalculator.h ----------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H -#define LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H +#ifndef LLVM_CODEGEN_DBGVALUEHISTORYCALCULATOR_H +#define LLVM_CODEGEN_DBGVALUEHISTORYCALCULATOR_H #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" @@ -84,4 +84,4 @@ void calculateDbgEntityHistory(const MachineFunction *MF, } // end namespace llvm -#endif // LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H +#endif // LLVM_CODEGEN_DBGVALUEHISTORYCALCULATOR_H diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/include/llvm/CodeGen/DebugHandlerBase.h similarity index 94% rename from lib/CodeGen/AsmPrinter/DebugHandlerBase.h rename to include/llvm/CodeGen/DebugHandlerBase.h index cdf8dc72b0753180cf82741cdda098ece7e3efa0..4f0d14d317f2a11d87fab58e01b18f6119312a4b 100644 --- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h +++ b/include/llvm/CodeGen/DebugHandlerBase.h @@ -1,4 +1,4 @@ -//===-- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h --------*- C++ -*--===// +//===-- llvm/CodeGen/DebugHandlerBase.h -----------------------*- C++ -*--===// // // The LLVM Compiler Infrastructure // @@ -12,12 +12,12 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGHANDLERBASE_H -#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGHANDLERBASE_H +#ifndef LLVM_CODEGEN_DEBUGHANDLERBASE_H +#define LLVM_CODEGEN_DEBUGHANDLERBASE_H -#include "AsmPrinterHandler.h" -#include "DbgEntityHistoryCalculator.h" #include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/AsmPrinterHandler.h" +#include "llvm/CodeGen/DbgEntityHistoryCalculator.h" #include "llvm/CodeGen/LexicalScopes.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/DebugInfoMetadata.h" diff --git a/include/llvm/CodeGen/FunctionLoweringInfo.h b/include/llvm/CodeGen/FunctionLoweringInfo.h index 5fe4f89f34b22c2a991fd96bece59b513ac80981..7c658515de0952ff84f72cd67f50a0ed610f7cf4 100644 --- a/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -246,6 +246,7 @@ public: return 0; unsigned &R = ValueMap[V]; assert(R == 0 && "Already initialized this value register!"); + assert(VirtReg2Value.empty()); return R = CreateRegs(V->getType()); } diff --git a/include/llvm/CodeGen/GCMetadata.h b/include/llvm/CodeGen/GCMetadata.h index ad2599fc120e54e064e5bfe21063a934459c98ca..7fb27202c1226c1d88362e4f7b4136c21616bd4c 100644 --- a/include/llvm/CodeGen/GCMetadata.h +++ b/include/llvm/CodeGen/GCMetadata.h @@ -55,12 +55,11 @@ class MCSymbol; /// GCPoint - Metadata for a collector-safe point in machine code. /// struct GCPoint { - GC::PointKind Kind; ///< The kind of the safe point. MCSymbol *Label; ///< A label. DebugLoc Loc; - GCPoint(GC::PointKind K, MCSymbol *L, DebugLoc DL) - : Kind(K), Label(L), Loc(std::move(DL)) {} + GCPoint(MCSymbol *L, DebugLoc DL) + : Label(L), Loc(std::move(DL)) {} }; /// GCRoot - Metadata for a pointer to an object managed by the garbage @@ -124,8 +123,8 @@ public: /// addSafePoint - Notes the existence of a safe point. Num is the ID of the /// label just prior to the safe point (if the code generator is using /// MachineModuleInfo). - void addSafePoint(GC::PointKind Kind, MCSymbol *Label, const DebugLoc &DL) { - SafePoints.emplace_back(Kind, Label, DL); + void addSafePoint(MCSymbol *Label, const DebugLoc &DL) { + SafePoints.emplace_back(Label, DL); } /// getFrameSize/setFrameSize - Records the function's frame size. diff --git a/include/llvm/CodeGen/GCMetadataPrinter.h b/include/llvm/CodeGen/GCMetadataPrinter.h index 1cc69a7b71af5fd01c535d37922a57e2bcb0bb80..5f1efb2ce02c2d81dc10531a6912678668927386 100644 --- a/include/llvm/CodeGen/GCMetadataPrinter.h +++ b/include/llvm/CodeGen/GCMetadataPrinter.h @@ -29,6 +29,7 @@ class GCMetadataPrinter; class GCModuleInfo; class GCStrategy; class Module; +class StackMaps; /// GCMetadataPrinterRegistry - The GC assembly printer registry uses all the /// defaults from Registry. @@ -60,6 +61,11 @@ public: /// Called after the assembly for the module is generated by /// the AsmPrinter (but before target specific hooks) virtual void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) {} + + /// Called when the stack maps are generated. Return true if + /// stack maps with a custom format are generated. Otherwise + /// returns false and the default format will be used. + virtual bool emitStackMaps(StackMaps &SM, AsmPrinter &AP) { return false; } }; } // end namespace llvm diff --git a/include/llvm/CodeGen/GCStrategy.h b/include/llvm/CodeGen/GCStrategy.h index f835bacfb5481110ea31a9f5d634f8662b7bbc30..5a60cd7cb8236dd54e1d1bf1368370748f2ad3cf 100644 --- a/include/llvm/CodeGen/GCStrategy.h +++ b/include/llvm/CodeGen/GCStrategy.h @@ -59,19 +59,6 @@ namespace llvm { class Type; -namespace GC { - -/// PointKind - Used to indicate whether the address of the call instruction -/// or the address after the call instruction is listed in the stackmap. For -/// most runtimes, PostCall safepoints are appropriate. -/// -enum PointKind { - PreCall, ///< Instr is a call instruction. - PostCall ///< Instr is the return address of a call. -}; - -} // end namespace GC - /// GCStrategy describes a garbage collector algorithm's code generation /// requirements, and provides overridable hooks for those needs which cannot /// be abstractly described. GCStrategy objects must be looked up through @@ -88,11 +75,7 @@ protected: /// if set, none of the other options can be /// anything but their default values. - unsigned NeededSafePoints = 0; ///< Bitmask of required safe points. - bool CustomReadBarriers = false; ///< Default is to insert loads. - bool CustomWriteBarriers = false; ///< Default is to insert stores. - bool CustomRoots = false; ///< Default is to pass through to backend. - bool InitRoots= true; ///< If set, roots are nulled during lowering. + bool NeededSafePoints = false; ///< if set, calls are inferred to be safepoints bool UsesMetadata = false; ///< If set, backend must emit metadata tables. public: @@ -103,16 +86,6 @@ public: /// name string specified on functions which use this strategy. const std::string &getName() const { return Name; } - /// By default, write barriers are replaced with simple store - /// instructions. If true, you must provide a custom pass to lower - /// calls to \@llvm.gcwrite. - bool customWriteBarrier() const { return CustomWriteBarriers; } - - /// By default, read barriers are replaced with simple load - /// instructions. If true, you must provide a custom pass to lower - /// calls to \@llvm.gcread. - bool customReadBarrier() const { return CustomReadBarriers; } - /// Returns true if this strategy is expecting the use of gc.statepoints, /// and false otherwise. bool useStatepoints() const { return UseStatepoints; } @@ -135,25 +108,8 @@ public: */ ///@{ - /// True if safe points of any kind are required. By default, none are - /// recorded. - bool needsSafePoints() const { return NeededSafePoints != 0; } - - /// True if the given kind of safe point is required. By default, none are - /// recorded. - bool needsSafePoint(GC::PointKind Kind) const { - return (NeededSafePoints & 1 << Kind) != 0; - } - - /// By default, roots are left for the code generator so it can generate a - /// stack map. If true, you must provide a custom pass to lower - /// calls to \@llvm.gcroot. - bool customRoots() const { return CustomRoots; } - - /// If set, gcroot intrinsics should initialize their allocas to null - /// before the first use. This is necessary for most GCs and is enabled by - /// default. - bool initializeRoots() const { return InitRoots; } + /// True if safe points need to be inferred on call sites + bool needsSafePoints() const { return NeededSafePoints; } /// If set, appropriate metadata tables must be emitted by the back-end /// (assembler, JIT, or otherwise). For statepoint, this method is diff --git a/include/llvm/CodeGen/GlobalISel/Combiner.h b/include/llvm/CodeGen/GlobalISel/Combiner.h index fa49e8c20d28bfac9b0f90caec7f7cc4ec9bbfdb..36a33deb4a642a6338fdee468191f1eca7f0737f 100644 --- a/include/llvm/CodeGen/GlobalISel/Combiner.h +++ b/include/llvm/CodeGen/GlobalISel/Combiner.h @@ -24,16 +24,6 @@ class CombinerInfo; class TargetPassConfig; class MachineFunction; -class CombinerChangeObserver { -public: - virtual ~CombinerChangeObserver() {} - - /// An instruction was erased. - virtual void erasedInstr(MachineInstr &MI) = 0; - /// An instruction was created and inseerted into the function. - virtual void createdInstr(MachineInstr &MI) = 0; -}; - class Combiner { public: Combiner(CombinerInfo &CombinerInfo, const TargetPassConfig *TPC); diff --git a/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index ee67f90a39e508c4c7495aa24d599307ed7f0f98..6e9ac01c1ee291a1dd6cff05c0b12f903d425f69 100644 --- a/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -1,4 +1,4 @@ -//== llvm/CodeGen/GlobalISel/CombinerHelper.h -------------- -*- C++ -*-==// +//===-- llvm/CodeGen/GlobalISel/CombinerHelper.h --------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -20,21 +20,27 @@ namespace llvm { -class CombinerChangeObserver; +class GISelChangeObserver; class MachineIRBuilder; class MachineRegisterInfo; class MachineInstr; +class MachineOperand; class CombinerHelper { MachineIRBuilder &Builder; MachineRegisterInfo &MRI; - CombinerChangeObserver &Observer; - - void eraseInstr(MachineInstr &MI); - void scheduleForVisit(MachineInstr &MI); + GISelChangeObserver &Observer; public: - CombinerHelper(CombinerChangeObserver &Observer, MachineIRBuilder &B); + CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B); + + /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes + void replaceRegWith(MachineRegisterInfo &MRI, unsigned FromReg, unsigned ToReg) const; + + /// Replace a single register operand with a new register and inform the + /// observer of the changes. + void replaceRegOpWith(MachineRegisterInfo &MRI, MachineOperand &FromRegOp, + unsigned ToReg) const; /// If \p MI is COPY, try to combine it. /// Returns true if MI changed. diff --git a/include/llvm/CodeGen/GlobalISel/CombinerInfo.h b/include/llvm/CodeGen/GlobalISel/CombinerInfo.h index 98c5b981462697343692c761d8f4e486bd5ff40c..d21aa3f725d92e77ad472fd91c9a1370749d1d2c 100644 --- a/include/llvm/CodeGen/GlobalISel/CombinerInfo.h +++ b/include/llvm/CodeGen/GlobalISel/CombinerInfo.h @@ -17,7 +17,7 @@ #include namespace llvm { -class CombinerChangeObserver; +class GISelChangeObserver; class LegalizerInfo; class MachineInstr; class MachineIRBuilder; @@ -43,7 +43,18 @@ public: /// illegal ops that are created. bool LegalizeIllegalOps; // TODO: Make use of this. const LegalizerInfo *LInfo; - virtual bool combine(CombinerChangeObserver &Observer, MachineInstr &MI, + + /// Attempt to combine instructions using MI as the root. + /// + /// Use Observer to report the creation, modification, and erasure of + /// instructions. GISelChangeObserver will automatically report certain + /// kinds of operations. These operations are: + /// * Instructions that are newly inserted into the MachineFunction + /// * Instructions that are erased from the MachineFunction. + /// + /// However, it is important to report instruction modification and this is + /// not automatic. + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const = 0; }; } // namespace llvm diff --git a/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h b/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h index 8d61f9a68279829f6d07b651639e71abe4186398..118c65ed39632edb9170c73d67bf7988348d9086 100644 --- a/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h +++ b/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h @@ -68,38 +68,18 @@ static Optional ConstantFoldBinOp(unsigned Opcode, const unsigned Op1, /// An MIRBuilder which does trivial constant folding of binary ops. /// Calls to buildInstr will also try to constant fold binary ops. -class ConstantFoldingMIRBuilder - : public FoldableInstructionsBuilder { +class ConstantFoldingMIRBuilder : public MachineIRBuilder { public: // Pull in base class constructors. - using FoldableInstructionsBuilder< - ConstantFoldingMIRBuilder>::FoldableInstructionsBuilder; - // Unhide buildInstr - using FoldableInstructionsBuilder::buildInstr; + using MachineIRBuilder::MachineIRBuilder; - // Implement buildBinaryOp required by FoldableInstructionsBuilder which - // tries to constant fold. - MachineInstrBuilder buildBinaryOp(unsigned Opcode, unsigned Dst, - unsigned Src0, unsigned Src1) { - validateBinaryOp(Dst, Src0, Src1); - auto MaybeCst = ConstantFoldBinOp(Opcode, Src0, Src1, getMF().getRegInfo()); - if (MaybeCst) - return buildConstant(Dst, MaybeCst->getSExtValue()); - return buildInstr(Opcode).addDef(Dst).addUse(Src0).addUse(Src1); - } - - template - MachineInstrBuilder buildInstr(unsigned Opc, DstTy &&Ty, UseArg1Ty &&Arg1, - UseArg2Ty &&Arg2) { - unsigned Dst = getDestFromArg(Ty); - return buildInstr(Opc, Dst, getRegFromArg(std::forward(Arg1)), - getRegFromArg(std::forward(Arg2))); - } + virtual ~ConstantFoldingMIRBuilder() = default; // Try to provide an overload for buildInstr for binary ops in order to // constant fold. - MachineInstrBuilder buildInstr(unsigned Opc, unsigned Dst, unsigned Src0, - unsigned Src1) { + MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef DstOps, + ArrayRef SrcOps, + Optional Flags = None) override { switch (Opc) { default: break; @@ -116,19 +96,18 @@ public: case TargetOpcode::G_SDIV: case TargetOpcode::G_UREM: case TargetOpcode::G_SREM: { - return buildBinaryOp(Opc, Dst, Src0, Src1); + assert(DstOps.size() == 1 && "Invalid dst ops"); + assert(SrcOps.size() == 2 && "Invalid src ops"); + const DstOp &Dst = DstOps[0]; + const SrcOp &Src0 = SrcOps[0]; + const SrcOp &Src1 = SrcOps[1]; + if (auto MaybeCst = + ConstantFoldBinOp(Opc, Src0.getReg(), Src1.getReg(), *getMRI())) + return buildConstant(Dst, MaybeCst->getSExtValue()); + break; } } - return buildInstr(Opc).addDef(Dst).addUse(Src0).addUse(Src1); - } - - // Fallback implementation of buildInstr. - template - MachineInstrBuilder buildInstr(unsigned Opc, DstTy &&Ty, - UseArgsTy &&... Args) { - auto MIB = buildInstr(Opc).addDef(getDestFromArg(Ty)); - addUsesFromArgs(MIB, std::forward(Args)...); - return MIB; + return MachineIRBuilder::buildInstr(Opc, DstOps, SrcOps); } }; } // namespace llvm diff --git a/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h b/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h new file mode 100644 index 0000000000000000000000000000000000000000..d21c73309a5202f9ad3b4be9afa95e039bc4940c --- /dev/null +++ b/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h @@ -0,0 +1,55 @@ +//===----- llvm/CodeGen/GlobalISel/GISelChangeObserver.h ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// This contains common code to allow clients to notify changes to machine +/// instr. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_CODEGEN_GLOBALISEL_GISELCHANGEOBSERVER_H +#define LLVM_CODEGEN_GLOBALISEL_GISELCHANGEOBSERVER_H + +#include "llvm/ADT/SmallPtrSet.h" + +namespace llvm { +class MachineInstr; +class MachineRegisterInfo; + +/// Abstract class that contains various methods for clients to notify about +/// changes. This should be the preferred way for APIs to notify changes. +/// Typically calling erasingInstr/createdInstr multiple times should not affect +/// the result. The observer would likely need to check if it was already +/// notified earlier (consider using GISelWorkList). +class GISelChangeObserver { + SmallPtrSet ChangingAllUsesOfReg; + +public: + virtual ~GISelChangeObserver() {} + + /// An instruction is about to be erased. + virtual void erasingInstr(const MachineInstr &MI) = 0; + /// An instruction was created and inserted into the function. + virtual void createdInstr(const MachineInstr &MI) = 0; + /// This instruction is about to be mutated in some way. + virtual void changingInstr(const MachineInstr &MI) = 0; + /// This instruction was mutated in some way. + virtual void changedInstr(const MachineInstr &MI) = 0; + + /// All the instructions using the given register are being changed. + /// For convenience, finishedChangingAllUsesOfReg() will report the completion + /// of the changes. The use list may change between this call and + /// finishedChangingAllUsesOfReg(). + void changingAllUsesOfReg(const MachineRegisterInfo &MRI, unsigned Reg); + /// All instructions reported as changing by changingAllUsesOfReg() have + /// finished being changed. + void finishedChangingAllUsesOfReg(); + +}; + +} // namespace llvm +#endif diff --git a/include/llvm/CodeGen/GlobalISel/GISelWorkList.h b/include/llvm/CodeGen/GlobalISel/GISelWorkList.h index 167905dc9aa1a49efc9c23fe25e11902e3783111..b32c5af537ccd6578a7e3083ce90f5900ad674c6 100644 --- a/include/llvm/CodeGen/GlobalISel/GISelWorkList.h +++ b/include/llvm/CodeGen/GlobalISel/GISelWorkList.h @@ -12,38 +12,68 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/Debug.h" namespace llvm { -class MachineInstr; +class MachineFunction; -// Worklist which mostly works similar to InstCombineWorkList, but on MachineInstrs. -// The main difference with something like a SetVector is that erasing an element doesn't -// move all elements over one place - instead just nulls out the element of the vector. -// FIXME: Does it make sense to factor out common code with the instcombinerWorkList? +// Worklist which mostly works similar to InstCombineWorkList, but on +// MachineInstrs. The main difference with something like a SetVector is that +// erasing an element doesn't move all elements over one place - instead just +// nulls out the element of the vector. +// +// This worklist operates on instructions within a particular function. This is +// important for acquiring the rights to modify/replace instructions a +// GISelChangeObserver reports as the observer doesn't have the right to make +// changes to the instructions it sees so we use our access to the +// MachineFunction to establish that it's ok to add a given instruction to the +// worklist. +// +// FIXME: Does it make sense to factor out common code with the +// instcombinerWorkList? template class GISelWorkList { - SmallVector Worklist; - DenseMap WorklistMap; + MachineFunction *MF; + SmallVector Worklist; + DenseMap WorklistMap; public: - GISelWorkList() = default; + GISelWorkList(MachineFunction *MF) : MF(MF) {} bool empty() const { return WorklistMap.empty(); } unsigned size() const { return WorklistMap.size(); } - /// Add - Add the specified instruction to the worklist if it isn't already - /// in it. + /// Add the specified instruction to the worklist if it isn't already in it. void insert(MachineInstr *I) { - if (WorklistMap.try_emplace(I, Worklist.size()).second) { - Worklist.push_back(I); + // It would be safe to add this instruction to the worklist regardless but + // for consistency with the const version, check that the instruction we're + // adding would have been accepted if we were given a const pointer instead. + insert(const_cast(I)); + } + + void insert(const MachineInstr *I) { + // Confirm we'd be able to find the non-const pointer we want to schedule if + // we wanted to. We have the right to schedule work that may modify any + // instruction in MF. + assert(I->getParent() && "Expected parent BB"); + assert(I->getParent()->getParent() && "Expected parent function"); + assert((!MF || I->getParent()->getParent() == MF) && + "Expected parent function to be current function or not given"); + + // But don't actually do the search since we can derive it from the const + // pointer. + MachineInstr *NonConstI = const_cast(I); + if (WorklistMap.try_emplace(NonConstI, Worklist.size()).second) { + Worklist.push_back(NonConstI); } } - /// Remove - remove I from the worklist if it exists. - void remove(MachineInstr *I) { + /// Remove I from the worklist if it exists. + void remove(const MachineInstr *I) { auto It = WorklistMap.find(I); if (It == WorklistMap.end()) return; // Not in worklist. diff --git a/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 2498ee9332104a0afc622a3ac56a999f442a2cd8..04629f6b320464f99bcd6be503e56dbc50e6ef59 100644 --- a/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -300,6 +300,8 @@ private: bool translateFSub(const User &U, MachineIRBuilder &MIRBuilder); + bool translateFNeg(const User &U, MachineIRBuilder &MIRBuilder); + bool translateAdd(const User &U, MachineIRBuilder &MIRBuilder) { return translateBinaryOp(TargetOpcode::G_ADD, U, MIRBuilder); } diff --git a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h index e1132ac59c829af869ec17919fcdc8940838e7ce..20bec76501796c68b9848bb2bfaddea727ad0d2b 100644 --- a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h +++ b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h --===========// +//===-- llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h -----*- C++ -*-// // // The LLVM Compiler Infrastructure // @@ -51,6 +51,18 @@ public: markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts); return true; } + + // aext([asz]ext x) -> [asz]ext x + unsigned ExtSrc; + MachineInstr *ExtMI; + if (mi_match(SrcReg, MRI, + m_all_of(m_MInstr(ExtMI), m_any_of(m_GAnyExt(m_Reg(ExtSrc)), + m_GSExt(m_Reg(ExtSrc)), + m_GZExt(m_Reg(ExtSrc)))))) { + Builder.buildInstr(ExtMI->getOpcode(), {DstReg}, {ExtSrc}); + markInstAndDefDead(MI, *ExtMI, DeadInsts); + return true; + } return tryFoldImplicitDef(MI, DeadInsts); } @@ -106,9 +118,9 @@ public: unsigned ShAmt = DstTy.getSizeInBits() - SrcTy.getSizeInBits(); auto MIBShAmt = Builder.buildConstant(DstTy, ShAmt); auto MIBShl = Builder.buildInstr( - TargetOpcode::G_SHL, DstTy, - Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), MIBShAmt); - Builder.buildInstr(TargetOpcode::G_ASHR, DstReg, MIBShl, MIBShAmt); + TargetOpcode::G_SHL, {DstTy}, + {Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), MIBShAmt}); + Builder.buildInstr(TargetOpcode::G_ASHR, {DstReg}, {MIBShl, MIBShAmt}); markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts); return true; } @@ -134,7 +146,7 @@ public: if (isInstUnsupported({TargetOpcode::G_IMPLICIT_DEF, {DstTy}})) return false; LLVM_DEBUG(dbgs() << ".. Combine G_ANYEXT(G_IMPLICIT_DEF): " << MI;); - Builder.buildInstr(TargetOpcode::G_IMPLICIT_DEF, DstReg); + Builder.buildInstr(TargetOpcode::G_IMPLICIT_DEF, {DstReg}, {}); } else { // G_[SZ]EXT (G_IMPLICIT_DEF) -> G_CONSTANT 0 because the top // bits will be 0 for G_ZEXT and 0/1 for the G_SEXT. @@ -157,8 +169,20 @@ public: return false; unsigned NumDefs = MI.getNumOperands() - 1; - MachineInstr *MergeI = getOpcodeDef(TargetOpcode::G_MERGE_VALUES, - MI.getOperand(NumDefs).getReg(), MRI); + + unsigned MergingOpcode; + LLT OpTy = MRI.getType(MI.getOperand(NumDefs).getReg()); + LLT DestTy = MRI.getType(MI.getOperand(0).getReg()); + if (OpTy.isVector() && DestTy.isVector()) + MergingOpcode = TargetOpcode::G_CONCAT_VECTORS; + else if (OpTy.isVector() && !DestTy.isVector()) + MergingOpcode = TargetOpcode::G_BUILD_VECTOR; + else + MergingOpcode = TargetOpcode::G_MERGE_VALUES; + + MachineInstr *MergeI = + getOpcodeDef(MergingOpcode, MI.getOperand(NumDefs).getReg(), MRI); + if (!MergeI) return false; diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 4d804d0d0d25ab6172318ce73040dc8641083a58..4d3e4a3763865c86f1d64b485a8d3e7d72474726 100644 --- a/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -32,6 +32,7 @@ namespace llvm { class LegalizerInfo; class Legalizer; class MachineRegisterInfo; +class GISelChangeObserver; class LegalizerHelper { public: @@ -48,8 +49,9 @@ public: UnableToLegalize, }; - LegalizerHelper(MachineFunction &MF); - LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI); + LegalizerHelper(MachineFunction &MF, GISelChangeObserver &Observer); + LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI, + GISelChangeObserver &Observer); /// Replace \p MI by a sequence of legal instructions that can implement the /// same operation. Note that this means \p MI may be deleted, so any iterator @@ -117,6 +119,8 @@ private: MachineRegisterInfo &MRI; const LegalizerInfo &LI; + /// To keep track of changes made by the LegalizerHelper. + GISelChangeObserver &Observer; }; /// Helper function that creates the given libcall. diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 755805de1b08e35bc4121cabb713053cfee44ab7..13776dd3e87d18dd76a2ab8eb5ee07b37d6d9834 100644 --- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -39,6 +39,7 @@ class MachineInstr; class MachineIRBuilder; class MachineRegisterInfo; class MCInstrInfo; +class GISelChangeObserver; namespace LegalizeActions { enum LegalizeAction : std::uint8_t { @@ -651,6 +652,20 @@ public: return minScalar(TypeIdx, MinTy).maxScalar(TypeIdx, MaxTy); } + /// Widen the scalar to match the size of another. + LegalizeRuleSet &minScalarSameAs(unsigned TypeIdx, unsigned LargeTypeIdx) { + typeIdx(TypeIdx); + return widenScalarIf( + [=](const LegalityQuery &Query) { + return Query.Types[LargeTypeIdx].getScalarSizeInBits() > + Query.Types[TypeIdx].getSizeInBits(); + }, + [=](const LegalityQuery &Query) { + return std::make_pair(TypeIdx, + Query.Types[LargeTypeIdx].getElementType()); + }); + } + /// Add more elements to the vector to reach the next power of two. /// No effect if the type is not a vector or the element count is a power of /// two. @@ -949,9 +964,9 @@ public: bool isLegal(const MachineInstr &MI, const MachineRegisterInfo &MRI) const; - virtual bool legalizeCustom(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + virtual bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const; private: /// Determine what action should be taken to legalize the given generic diff --git a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index a837cf05691ad4900a79e20e6bd7fa527602a390..1745b9702d8f9da587e082f177a76ff140e6ecd9 100644 --- a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -30,6 +30,7 @@ namespace llvm { class MachineFunction; class MachineInstr; class TargetInstrInfo; +class GISelChangeObserver; /// Class which stores all the state required in a MachineIRBuilder. /// Since MachineIRBuilders will only store state in this object, it allows @@ -50,57 +51,158 @@ struct MachineIRBuilderState { MachineBasicBlock::iterator II; /// @} - std::function InsertedInstr; + GISelChangeObserver *Observer; }; -/// Helper class to build MachineInstr. -/// It keeps internally the insertion point and debug location for all -/// the new instructions we want to create. -/// This information can be modify via the related setters. -class MachineIRBuilderBase { - - MachineIRBuilderState State; - void validateTruncExt(unsigned Dst, unsigned Src, bool IsExtend); +class DstOp { + union { + LLT LLTTy; + unsigned Reg; + const TargetRegisterClass *RC; + }; -protected: - unsigned getDestFromArg(unsigned Reg) { return Reg; } - unsigned getDestFromArg(LLT Ty) { - return getMF().getRegInfo().createGenericVirtualRegister(Ty); +public: + enum class DstType { Ty_LLT, Ty_Reg, Ty_RC }; + DstOp(unsigned R) : Reg(R), Ty(DstType::Ty_Reg) {} + DstOp(const LLT &T) : LLTTy(T), Ty(DstType::Ty_LLT) {} + DstOp(const TargetRegisterClass *TRC) : RC(TRC), Ty(DstType::Ty_RC) {} + + void addDefToMIB(MachineRegisterInfo &MRI, MachineInstrBuilder &MIB) const { + switch (Ty) { + case DstType::Ty_Reg: + MIB.addDef(Reg); + break; + case DstType::Ty_LLT: + MIB.addDef(MRI.createGenericVirtualRegister(LLTTy)); + break; + case DstType::Ty_RC: + MIB.addDef(MRI.createVirtualRegister(RC)); + break; + } } - unsigned getDestFromArg(const TargetRegisterClass *RC) { - return getMF().getRegInfo().createVirtualRegister(RC); + + DstType getType() const { return Ty; } + + LLT getLLTTy(const MachineRegisterInfo &MRI) const { + switch (Ty) { + case DstType::Ty_RC: + return LLT{}; + case DstType::Ty_LLT: + return LLTTy; + case DstType::Ty_Reg: + return MRI.getType(Reg); + } + llvm_unreachable("Unrecognised DstOp::DstType enum"); } - void addUseFromArg(MachineInstrBuilder &MIB, unsigned Reg) { - MIB.addUse(Reg); + DstType getDstOpKind() const { return Ty; } + +private: + DstType Ty; +}; + +class SrcOp { + union { + MachineInstrBuilder SrcMIB; + unsigned Reg; + CmpInst::Predicate Pred; + }; + +public: + enum class SrcType { Ty_Reg, Ty_MIB, Ty_Predicate }; + SrcOp(unsigned R) : Reg(R), Ty(SrcType::Ty_Reg) {} + SrcOp(const MachineInstrBuilder &MIB) : SrcMIB(MIB), Ty(SrcType::Ty_MIB) {} + SrcOp(const CmpInst::Predicate P) : Pred(P), Ty(SrcType::Ty_Predicate) {} + + void addSrcToMIB(MachineInstrBuilder &MIB) const { + switch (Ty) { + case SrcType::Ty_Predicate: + MIB.addPredicate(Pred); + break; + case SrcType::Ty_Reg: + MIB.addUse(Reg); + break; + case SrcType::Ty_MIB: + MIB.addUse(SrcMIB->getOperand(0).getReg()); + break; + } } - void addUseFromArg(MachineInstrBuilder &MIB, const MachineInstrBuilder &UseMIB) { - MIB.addUse(UseMIB->getOperand(0).getReg()); + LLT getLLTTy(const MachineRegisterInfo &MRI) const { + switch (Ty) { + case SrcType::Ty_Predicate: + llvm_unreachable("Not a register operand"); + case SrcType::Ty_Reg: + return MRI.getType(Reg); + case SrcType::Ty_MIB: + return MRI.getType(SrcMIB->getOperand(0).getReg()); + } + llvm_unreachable("Unrecognised SrcOp::SrcType enum"); } - void addUsesFromArgs(MachineInstrBuilder &MIB) { } - template - void addUsesFromArgs(MachineInstrBuilder &MIB, UseArgTy &&Arg1, UseArgsTy &&... Args) { - addUseFromArg(MIB, Arg1); - addUsesFromArgs(MIB, std::forward(Args)...); + unsigned getReg() const { + switch (Ty) { + case SrcType::Ty_Predicate: + llvm_unreachable("Not a register operand"); + case SrcType::Ty_Reg: + return Reg; + case SrcType::Ty_MIB: + return SrcMIB->getOperand(0).getReg(); + } + llvm_unreachable("Unrecognised SrcOp::SrcType enum"); } - unsigned getRegFromArg(unsigned Reg) { return Reg; } - unsigned getRegFromArg(const MachineInstrBuilder &MIB) { - return MIB->getOperand(0).getReg(); + + CmpInst::Predicate getPredicate() const { + switch (Ty) { + case SrcType::Ty_Predicate: + return Pred; + default: + llvm_unreachable("Not a register operand"); + } } - void validateBinaryOp(unsigned Res, unsigned Op0, unsigned Op1); + SrcType getSrcOpKind() const { return Ty; } + +private: + SrcType Ty; +}; + +class FlagsOp { + Optional Flags; + +public: + explicit FlagsOp(unsigned F) : Flags(F) {} + FlagsOp() : Flags(None) {} + Optional getFlags() const { return Flags; } +}; +/// Helper class to build MachineInstr. +/// It keeps internally the insertion point and debug location for all +/// the new instructions we want to create. +/// This information can be modify via the related setters. +class MachineIRBuilder { + + MachineIRBuilderState State; + +protected: + void validateTruncExt(const LLT &Dst, const LLT &Src, bool IsExtend); + + void validateBinaryOp(const LLT &Res, const LLT &Op0, const LLT &Op1); + + void validateSelectOp(const LLT &ResTy, const LLT &TstTy, const LLT &Op0Ty, + const LLT &Op1Ty); + void recordInsertion(MachineInstr *MI) const; public: /// Some constructors for easy use. - MachineIRBuilderBase() = default; - MachineIRBuilderBase(MachineFunction &MF) { setMF(MF); } - MachineIRBuilderBase(MachineInstr &MI) : MachineIRBuilderBase(*MI.getMF()) { + MachineIRBuilder() = default; + MachineIRBuilder(MachineFunction &MF) { setMF(MF); } + MachineIRBuilder(MachineInstr &MI) : MachineIRBuilder(*MI.getMF()) { setInstr(MI); } - MachineIRBuilderBase(const MachineIRBuilderState &BState) : State(BState) {} + virtual ~MachineIRBuilder() = default; + + MachineIRBuilder(const MachineIRBuilderState &BState) : State(BState) {} const TargetInstrInfo &getTII() { assert(State.TII && "TargetInstrInfo is not set"); @@ -151,12 +253,8 @@ public: void setInstr(MachineInstr &MI); /// @} - /// \name Control where instructions we create are recorded (typically for - /// visiting again later during legalization). - /// @{ - void recordInsertion(MachineInstr *InsertedInstr) const; - void recordInsertions(std::function InsertedInstr); - void stopRecordingInsertions(); + void setChangeObserver(GISelChangeObserver &Observer); + void stopObservingChanges(); /// @} /// Set the debug location to \p DL for all the next build instructions. @@ -300,9 +398,9 @@ public: /// registers with the same scalar type (typically s1) /// /// \return The newly created instruction. - MachineInstrBuilder buildUAdde(unsigned Res, unsigned CarryOut, unsigned Op0, - unsigned Op1, unsigned CarryIn); - + MachineInstrBuilder buildUAdde(const DstOp &Res, const DstOp &CarryOut, + const SrcOp &Op0, const SrcOp &Op1, + const SrcOp &CarryIn); /// Build and insert \p Res = G_ANYEXT \p Op0 /// @@ -318,11 +416,7 @@ public: /// /// \return The newly created instruction. - MachineInstrBuilder buildAnyExt(unsigned Res, unsigned Op); - template - MachineInstrBuilder buildAnyExt(DstType &&Res, ArgType &&Arg) { - return buildAnyExt(getDestFromArg(Res), getRegFromArg(Arg)); - } + MachineInstrBuilder buildAnyExt(const DstOp &Res, const SrcOp &Op); /// Build and insert \p Res = G_SEXT \p Op /// @@ -336,11 +430,7 @@ public: /// \pre \p Op must be smaller than \p Res /// /// \return The newly created instruction. - template - MachineInstrBuilder buildSExt(DstType &&Res, ArgType &&Arg) { - return buildSExt(getDestFromArg(Res), getRegFromArg(Arg)); - } - MachineInstrBuilder buildSExt(unsigned Res, unsigned Op); + MachineInstrBuilder buildSExt(const DstOp &Res, const SrcOp &Op); /// Build and insert \p Res = G_ZEXT \p Op /// @@ -354,11 +444,7 @@ public: /// \pre \p Op must be smaller than \p Res /// /// \return The newly created instruction. - template - MachineInstrBuilder buildZExt(DstType &&Res, ArgType &&Arg) { - return buildZExt(getDestFromArg(Res), getRegFromArg(Arg)); - } - MachineInstrBuilder buildZExt(unsigned Res, unsigned Op); + MachineInstrBuilder buildZExt(const DstOp &Res, const SrcOp &Op); /// Build and insert \p Res = G_SEXT \p Op, \p Res = G_TRUNC \p Op, or /// \p Res = COPY \p Op depending on the differing sizes of \p Res and \p Op. @@ -368,11 +454,7 @@ public: /// \pre \p Op must be a generic virtual register with scalar or vector type. /// /// \return The newly created instruction. - template - MachineInstrBuilder buildSExtOrTrunc(DstTy &&Dst, UseArgTy &&Use) { - return buildSExtOrTrunc(getDestFromArg(Dst), getRegFromArg(Use)); - } - MachineInstrBuilder buildSExtOrTrunc(unsigned Res, unsigned Op); + MachineInstrBuilder buildSExtOrTrunc(const DstOp &Res, const SrcOp &Op); /// Build and insert \p Res = G_ZEXT \p Op, \p Res = G_TRUNC \p Op, or /// \p Res = COPY \p Op depending on the differing sizes of \p Res and \p Op. @@ -382,11 +464,7 @@ public: /// \pre \p Op must be a generic virtual register with scalar or vector type. /// /// \return The newly created instruction. - template - MachineInstrBuilder buildZExtOrTrunc(DstTy &&Dst, UseArgTy &&Use) { - return buildZExtOrTrunc(getDestFromArg(Dst), getRegFromArg(Use)); - } - MachineInstrBuilder buildZExtOrTrunc(unsigned Res, unsigned Op); + MachineInstrBuilder buildZExtOrTrunc(const DstOp &Res, const SrcOp &Op); // Build and insert \p Res = G_ANYEXT \p Op, \p Res = G_TRUNC \p Op, or /// \p Res = COPY \p Op depending on the differing sizes of \p Res and \p Op. @@ -396,11 +474,7 @@ public: /// \pre \p Op must be a generic virtual register with scalar or vector type. /// /// \return The newly created instruction. - template - MachineInstrBuilder buildAnyExtOrTrunc(DstTy &&Dst, UseArgTy &&Use) { - return buildAnyExtOrTrunc(getDestFromArg(Dst), getRegFromArg(Use)); - } - MachineInstrBuilder buildAnyExtOrTrunc(unsigned Res, unsigned Op); + MachineInstrBuilder buildAnyExtOrTrunc(const DstOp &Res, const SrcOp &Op); /// Build and insert \p Res = \p ExtOpc, \p Res = G_TRUNC \p /// Op, or \p Res = COPY \p Op depending on the differing sizes of \p Res and @@ -411,15 +485,11 @@ public: /// \pre \p Op must be a generic virtual register with scalar or vector type. /// /// \return The newly created instruction. - MachineInstrBuilder buildExtOrTrunc(unsigned ExtOpc, unsigned Res, - unsigned Op); + MachineInstrBuilder buildExtOrTrunc(unsigned ExtOpc, const DstOp &Res, + const SrcOp &Op); /// Build and insert an appropriate cast between two registers of equal size. - template - MachineInstrBuilder buildCast(DstType &&Res, ArgType &&Arg) { - return buildCast(getDestFromArg(Res), getRegFromArg(Arg)); - } - MachineInstrBuilder buildCast(unsigned Dst, unsigned Src); + MachineInstrBuilder buildCast(const DstOp &Dst, const SrcOp &Src); /// Build and insert G_BR \p Dest /// @@ -464,7 +534,7 @@ public: /// type. /// /// \return The newly created instruction. - MachineInstrBuilder buildConstant(unsigned Res, const ConstantInt &Val); + MachineInstrBuilder buildConstant(const DstOp &Res, const ConstantInt &Val); /// Build and insert \p Res = G_CONSTANT \p Val /// @@ -474,12 +544,8 @@ public: /// \pre \p Res must be a generic virtual register with scalar type. /// /// \return The newly created instruction. - MachineInstrBuilder buildConstant(unsigned Res, int64_t Val); + MachineInstrBuilder buildConstant(const DstOp &Res, int64_t Val); - template - MachineInstrBuilder buildConstant(DstType &&Res, int64_t Val) { - return buildConstant(getDestFromArg(Res), Val); - } /// Build and insert \p Res = G_FCONSTANT \p Val /// /// G_FCONSTANT is a floating-point constant with the specified size and @@ -489,17 +555,9 @@ public: /// \pre \p Res must be a generic virtual register with scalar type. /// /// \return The newly created instruction. - template - MachineInstrBuilder buildFConstant(DstType &&Res, const ConstantFP &Val) { - return buildFConstant(getDestFromArg(Res), Val); - } - MachineInstrBuilder buildFConstant(unsigned Res, const ConstantFP &Val); + MachineInstrBuilder buildFConstant(const DstOp &Res, const ConstantFP &Val); - template - MachineInstrBuilder buildFConstant(DstType &&Res, double Val) { - return buildFConstant(getDestFromArg(Res), Val); - } - MachineInstrBuilder buildFConstant(unsigned Res, double Val); + MachineInstrBuilder buildFConstant(const DstOp &Res, double Val); /// Build and insert \p Res = COPY Op /// @@ -508,11 +566,7 @@ public: /// \pre setBasicBlock or setMI must have been called. /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildCopy(unsigned Res, unsigned Op); - template - MachineInstrBuilder buildCopy(DstType &&Res, SrcType &&Src) { - return buildCopy(getDestFromArg(Res), getRegFromArg(Src)); - } + MachineInstrBuilder buildCopy(const DstOp &Res, const SrcOp &Op); /// Build and insert `Res = G_LOAD Addr, MMO`. /// @@ -559,10 +613,7 @@ public: MachineInstrBuilder buildExtract(unsigned Res, unsigned Src, uint64_t Index); /// Build and insert \p Res = IMPLICIT_DEF. - template MachineInstrBuilder buildUndef(DstType &&Res) { - return buildUndef(getDestFromArg(Res)); - } - MachineInstrBuilder buildUndef(unsigned Res); + MachineInstrBuilder buildUndef(const DstOp &Res); /// Build and insert instructions to put \p Ops together at the specified p /// Indices to form a larger register. @@ -591,7 +642,7 @@ public: /// \pre The type of all \p Ops registers must be identical. /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildMerge(unsigned Res, ArrayRef Ops); + MachineInstrBuilder buildMerge(const DstOp &Res, ArrayRef Ops); /// Build and insert \p Res0, ... = G_UNMERGE_VALUES \p Op /// @@ -603,7 +654,50 @@ public: /// \pre The type of all \p Res registers must be identical. /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildUnmerge(ArrayRef Res, unsigned Op); + MachineInstrBuilder buildUnmerge(ArrayRef Res, const SrcOp &Op); + MachineInstrBuilder buildUnmerge(ArrayRef Res, const SrcOp &Op); + + /// Build and insert \p Res = G_BUILD_VECTOR \p Op0, ... + /// + /// G_BUILD_VECTOR creates a vector value from multiple scalar registers. + /// \pre setBasicBlock or setMI must have been called. + /// \pre The entire register \p Res (and no more) must be covered by the + /// input scalar registers. + /// \pre The type of all \p Ops registers must be identical. + /// + /// \return a MachineInstrBuilder for the newly created instruction. + MachineInstrBuilder buildBuildVector(const DstOp &Res, + ArrayRef Ops); + + /// Build and insert \p Res = G_BUILD_VECTOR_TRUNC \p Op0, ... + /// + /// G_BUILD_VECTOR_TRUNC creates a vector value from multiple scalar registers + /// which have types larger than the destination vector element type, and + /// truncates the values to fit. + /// + /// If the operands given are already the same size as the vector elt type, + /// then this method will instead create a G_BUILD_VECTOR instruction. + /// + /// \pre setBasicBlock or setMI must have been called. + /// \pre The type of all \p Ops registers must be identical. + /// + /// \return a MachineInstrBuilder for the newly created instruction. + MachineInstrBuilder buildBuildVectorTrunc(const DstOp &Res, + ArrayRef Ops); + + /// Build and insert \p Res = G_CONCAT_VECTORS \p Op0, ... + /// + /// G_CONCAT_VECTORS creates a vector from the concatenation of 2 or more + /// vectors. + /// + /// \pre setBasicBlock or setMI must have been called. + /// \pre The entire register \p Res (and no more) must be covered by the input + /// registers. + /// \pre The type of all source operands must be identical. + /// + /// \return a MachineInstrBuilder for the newly created instruction. + MachineInstrBuilder buildConcatVectors(const DstOp &Res, + ArrayRef Ops); MachineInstrBuilder buildInsert(unsigned Res, unsigned Src, unsigned Op, unsigned Index); @@ -631,11 +725,7 @@ public: /// \pre \p Res must be smaller than \p Op /// /// \return The newly created instruction. - template - MachineInstrBuilder buildFPTrunc(DstType &&Res, SrcType &&Src) { - return buildFPTrunc(getDestFromArg(Res), getRegFromArg(Src)); - } - MachineInstrBuilder buildFPTrunc(unsigned Res, unsigned Op); + MachineInstrBuilder buildFPTrunc(const DstOp &Res, const SrcOp &Op); /// Build and insert \p Res = G_TRUNC \p Op /// @@ -648,11 +738,7 @@ public: /// \pre \p Res must be smaller than \p Op /// /// \return The newly created instruction. - MachineInstrBuilder buildTrunc(unsigned Res, unsigned Op); - template - MachineInstrBuilder buildTrunc(DstType &&Res, SrcType &&Src) { - return buildTrunc(getDestFromArg(Res), getRegFromArg(Src)); - } + MachineInstrBuilder buildTrunc(const DstOp &Res, const SrcOp &Op); /// Build and insert a \p Res = G_ICMP \p Pred, \p Op0, \p Op1 /// @@ -666,13 +752,8 @@ public: /// \pre \p Pred must be an integer predicate. /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildICmp(CmpInst::Predicate Pred, - unsigned Res, unsigned Op0, unsigned Op1); - template - MachineInstrBuilder buildICmp(CmpInst::Predicate Pred, DstTy &&Dst, - UseArgsTy &&... UseArgs) { - return buildICmp(Pred, getDestFromArg(Dst), getRegFromArg(UseArgs)...); - } + MachineInstrBuilder buildICmp(CmpInst::Predicate Pred, const DstOp &Res, + const SrcOp &Op0, const SrcOp &Op1); /// Build and insert a \p Res = G_FCMP \p Pred\p Op0, \p Op1 /// @@ -686,8 +767,8 @@ public: /// \pre \p Pred must be a floating-point predicate. /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildFCmp(CmpInst::Predicate Pred, - unsigned Res, unsigned Op0, unsigned Op1); + MachineInstrBuilder buildFCmp(CmpInst::Predicate Pred, const DstOp &Res, + const SrcOp &Op0, const SrcOp &Op1); /// Build and insert a \p Res = G_SELECT \p Tst, \p Op0, \p Op1 /// @@ -699,12 +780,8 @@ public: /// elements as the other parameters. /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildSelect(unsigned Res, unsigned Tst, - unsigned Op0, unsigned Op1); - template - MachineInstrBuilder buildSelect(DstTy &&Dst, UseArgsTy &&... UseArgs) { - return buildSelect(getDestFromArg(Dst), getRegFromArg(UseArgs)...); - } + MachineInstrBuilder buildSelect(const DstOp &Res, const SrcOp &Tst, + const SrcOp &Op0, const SrcOp &Op1); /// Build and insert \p Res = G_INSERT_VECTOR_ELT \p Val, /// \p Elt, \p Idx @@ -716,8 +793,10 @@ public: /// with scalar type. /// /// \return The newly created instruction. - MachineInstrBuilder buildInsertVectorElement(unsigned Res, unsigned Val, - unsigned Elt, unsigned Idx); + MachineInstrBuilder buildInsertVectorElement(const DstOp &Res, + const SrcOp &Val, + const SrcOp &Elt, + const SrcOp &Idx); /// Build and insert \p Res = G_EXTRACT_VECTOR_ELT \p Val, \p Idx /// @@ -727,8 +806,9 @@ public: /// \pre \p Idx must be a generic virtual register with scalar type. /// /// \return The newly created instruction. - MachineInstrBuilder buildExtractVectorElement(unsigned Res, unsigned Val, - unsigned Idx); + MachineInstrBuilder buildExtractVectorElement(const DstOp &Res, + const SrcOp &Val, + const SrcOp &Idx); /// Build and insert `OldValRes, SuccessRes = /// G_ATOMIC_CMPXCHG_WITH_SUCCESS Addr, CmpVal, NewVal, MMO`. @@ -965,19 +1045,7 @@ public: /// /// \return The newly created instruction. MachineInstrBuilder buildBlockAddress(unsigned Res, const BlockAddress *BA); -}; - -/// A CRTP class that contains methods for building instructions that can -/// be constant folded. MachineIRBuilders that want to inherit from this will -/// need to implement buildBinaryOp (for constant folding binary ops). -/// Alternatively, they can implement buildInstr(Opc, Dst, Uses...) to perform -/// additional folding for Opc. -template -class FoldableInstructionsBuilder : public MachineIRBuilderBase { - Base &base() { return static_cast(*this); } -public: - using MachineIRBuilderBase::MachineIRBuilderBase; /// Build and insert \p Res = G_ADD \p Op0, \p Op1 /// /// G_ADD sets \p Res to the sum of integer parameters \p Op0 and \p Op1, @@ -989,13 +1057,10 @@ public: /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildAdd(unsigned Dst, unsigned Src0, unsigned Src1) { - return base().buildBinaryOp(TargetOpcode::G_ADD, Dst, Src0, Src1); - } - template - MachineInstrBuilder buildAdd(DstTy &&Ty, UseArgsTy &&... UseArgs) { - unsigned Res = base().getDestFromArg(Ty); - return base().buildAdd(Res, (base().getRegFromArg(UseArgs))...); + MachineInstrBuilder buildAdd(const DstOp &Dst, const SrcOp &Src0, + const SrcOp &Src1, + Optional Flags = None) { + return buildInstr(TargetOpcode::G_ADD, {Dst}, {Src0, Src1}, Flags); } /// Build and insert \p Res = G_SUB \p Op0, \p Op1 @@ -1009,13 +1074,10 @@ public: /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildSub(unsigned Dst, unsigned Src0, unsigned Src1) { - return base().buildBinaryOp(TargetOpcode::G_SUB, Dst, Src0, Src1); - } - template - MachineInstrBuilder buildSub(DstTy &&Ty, UseArgsTy &&... UseArgs) { - unsigned Res = base().getDestFromArg(Ty); - return base().buildSub(Res, (base().getRegFromArg(UseArgs))...); + MachineInstrBuilder buildSub(const DstOp &Dst, const SrcOp &Src0, + const SrcOp &Src1, + Optional Flags = None) { + return buildInstr(TargetOpcode::G_SUB, {Dst}, {Src0, Src1}, Flags); } /// Build and insert \p Res = G_MUL \p Op0, \p Op1 @@ -1028,13 +1090,10 @@ public: /// with the same (scalar or vector) type). /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildMul(unsigned Dst, unsigned Src0, unsigned Src1) { - return base().buildBinaryOp(TargetOpcode::G_MUL, Dst, Src0, Src1); - } - template - MachineInstrBuilder buildMul(DstTy &&Ty, UseArgsTy &&... UseArgs) { - unsigned Res = base().getDestFromArg(Ty); - return base().buildMul(Res, (base().getRegFromArg(UseArgs))...); + MachineInstrBuilder buildMul(const DstOp &Dst, const SrcOp &Src0, + const SrcOp &Src1, + Optional Flags = None) { + return buildInstr(TargetOpcode::G_MUL, {Dst}, {Src0, Src1}, Flags); } /// Build and insert \p Res = G_AND \p Op0, \p Op1 @@ -1048,13 +1107,9 @@ public: /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildAnd(unsigned Dst, unsigned Src0, unsigned Src1) { - return base().buildBinaryOp(TargetOpcode::G_AND, Dst, Src0, Src1); - } - template - MachineInstrBuilder buildAnd(DstTy &&Ty, UseArgsTy &&... UseArgs) { - unsigned Res = base().getDestFromArg(Ty); - return base().buildAnd(Res, (base().getRegFromArg(UseArgs))...); + MachineInstrBuilder buildAnd(const DstOp &Dst, const SrcOp &Src0, + const SrcOp &Src1) { + return buildInstr(TargetOpcode::G_AND, {Dst}, {Src0, Src1}); } /// Build and insert \p Res = G_OR \p Op0, \p Op1 @@ -1067,39 +1122,14 @@ public: /// with the same (scalar or vector) type). /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildOr(unsigned Dst, unsigned Src0, unsigned Src1) { - return base().buildBinaryOp(TargetOpcode::G_OR, Dst, Src0, Src1); + MachineInstrBuilder buildOr(const DstOp &Dst, const SrcOp &Src0, + const SrcOp &Src1) { + return buildInstr(TargetOpcode::G_OR, {Dst}, {Src0, Src1}); } - template - MachineInstrBuilder buildOr(DstTy &&Ty, UseArgsTy &&... UseArgs) { - unsigned Res = base().getDestFromArg(Ty); - return base().buildOr(Res, (base().getRegFromArg(UseArgs))...); - } -}; -class MachineIRBuilder : public FoldableInstructionsBuilder { -public: - using FoldableInstructionsBuilder< - MachineIRBuilder>::FoldableInstructionsBuilder; - MachineInstrBuilder buildBinaryOp(unsigned Opcode, unsigned Dst, - unsigned Src0, unsigned Src1) { - validateBinaryOp(Dst, Src0, Src1); - return buildInstr(Opcode).addDef(Dst).addUse(Src0).addUse(Src1); - } - using FoldableInstructionsBuilder::buildInstr; - /// DAG like Generic method for building arbitrary instructions as above. - /// \Opc opcode for the instruction. - /// \Ty Either LLT/TargetRegisterClass/unsigned types for Dst - /// \Args Variadic list of uses of types(unsigned/MachineInstrBuilder) - /// Uses of type MachineInstrBuilder will perform - /// getOperand(0).getReg() to convert to register. - template - MachineInstrBuilder buildInstr(unsigned Opc, DstTy &&Ty, - UseArgsTy &&... Args) { - auto MIB = buildInstr(Opc).addDef(getDestFromArg(Ty)); - addUsesFromArgs(MIB, std::forward(Args)...); - return MIB; - } + virtual MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef DstOps, + ArrayRef SrcOps, + Optional Flags = None); }; } // End namespace llvm. diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index ac620e4b69c501353ebf0986ede0751aa775d135..9c918ae1104f9dbcec2e9acd0efe198a0755abe1 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -272,6 +272,13 @@ namespace ISD { /// resulting value is this minimum value. SSUBSAT, USUBSAT, + /// RESULT = SMULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on + /// 2 integers with the same width and scale. SCALE represents the scale of + /// both operands as fixed point numbers. This SCALE parameter must be a + /// constant integer. A scale of zero is effectively performing + /// multiplication on 2 integers. + SMULFIX, + /// Simple binary floating point operators. FADD, FSUB, FMUL, FDIV, FREM, @@ -394,9 +401,13 @@ namespace ISD { /// When the 1st operand is a vector, the shift amount must be in the same /// type. (TLI.getShiftAmountTy() will return the same type when the input /// type is a vector.) - /// For rotates, the shift amount is treated as an unsigned amount modulo - /// the element size of the first operand. - SHL, SRA, SRL, ROTL, ROTR, + /// For rotates and funnel shifts, the shift amount is treated as an unsigned + /// amount modulo the element size of the first operand. + /// + /// Funnel 'double' shifts take 3 operands, 2 inputs and the shift amount. + /// fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW))) + /// fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + SHL, SRA, SRL, ROTL, ROTR, FSHL, FSHR, /// Byte Swap and Counting operators. BSWAP, CTTZ, CTLZ, CTPOP, BITREVERSE, @@ -478,31 +489,33 @@ namespace ISD { /// in-register any-extension of the low lanes of an integer vector. The /// result type must have fewer elements than the operand type, and those /// elements must be larger integer types such that the total size of the - /// operand type and the result type match. Each of the low operand - /// elements is any-extended into the corresponding, wider result - /// elements with the high bits becoming undef. + /// operand type is less than or equal to the size of the result type. Each + /// of the low operand elements is any-extended into the corresponding, + /// wider result elements with the high bits becoming undef. + /// NOTE: The type legalizer prefers to make the operand and result size + /// the same to allow expansion to shuffle vector during op legalization. ANY_EXTEND_VECTOR_INREG, /// SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an /// in-register sign-extension of the low lanes of an integer vector. The /// result type must have fewer elements than the operand type, and those /// elements must be larger integer types such that the total size of the - /// operand type and the result type match. Each of the low operand - /// elements is sign-extended into the corresponding, wider result - /// elements. - // FIXME: The SIGN_EXTEND_INREG node isn't specifically limited to - // scalars, but it also doesn't handle vectors well. Either it should be - // restricted to scalars or this node (and its handling) should be merged - // into it. + /// operand type is less than or equal to the size of the result type. Each + /// of the low operand elements is sign-extended into the corresponding, + /// wider result elements. + /// NOTE: The type legalizer prefers to make the operand and result size + /// the same to allow expansion to shuffle vector during op legalization. SIGN_EXTEND_VECTOR_INREG, /// ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an /// in-register zero-extension of the low lanes of an integer vector. The /// result type must have fewer elements than the operand type, and those /// elements must be larger integer types such that the total size of the - /// operand type and the result type match. Each of the low operand - /// elements is zero-extended into the corresponding, wider result - /// elements. + /// operand type is less than or equal to the size of the result type. Each + /// of the low operand elements is zero-extended into the corresponding, + /// wider result elements. + /// NOTE: The type legalizer prefers to make the operand and result size + /// the same to allow expansion to shuffle vector during op legalization. ZERO_EXTEND_VECTOR_INREG, /// FP_TO_[US]INT - Convert a floating point value to a signed or unsigned diff --git a/include/llvm/CodeGen/LinkAllAsmWriterComponents.h b/include/llvm/CodeGen/LinkAllAsmWriterComponents.h index c3046da90b8db83e2e95a0d5adf10880583caf14..38fcb37b1e69e97c13cb56704ac373b50eb7cb5b 100644 --- a/include/llvm/CodeGen/LinkAllAsmWriterComponents.h +++ b/include/llvm/CodeGen/LinkAllAsmWriterComponents.h @@ -15,7 +15,7 @@ #ifndef LLVM_CODEGEN_LINKALLASMWRITERCOMPONENTS_H #define LLVM_CODEGEN_LINKALLASMWRITERCOMPONENTS_H -#include "llvm/CodeGen/GCs.h" +#include "llvm/CodeGen/BuiltinGCs.h" #include namespace { diff --git a/include/llvm/CodeGen/LinkAllCodegenComponents.h b/include/llvm/CodeGen/LinkAllCodegenComponents.h index fee131e4a3c6b73211796a3ac98e7e1edf9b0e66..18c13ca8f59877f6e8f1b2747e43d601d214c852 100644 --- a/include/llvm/CodeGen/LinkAllCodegenComponents.h +++ b/include/llvm/CodeGen/LinkAllCodegenComponents.h @@ -15,7 +15,7 @@ #ifndef LLVM_CODEGEN_LINKALLCODEGENCOMPONENTS_H #define LLVM_CODEGEN_LINKALLCODEGENCOMPONENTS_H -#include "llvm/CodeGen/GCs.h" +#include "llvm/CodeGen/BuiltinGCs.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/Target/TargetMachine.h" @@ -36,11 +36,7 @@ namespace { (void) llvm::createGreedyRegisterAllocator(); (void) llvm::createDefaultPBQPRegisterAllocator(); - llvm::linkCoreCLRGC(); - llvm::linkOcamlGC(); - llvm::linkErlangGC(); - llvm::linkShadowStackGC(); - llvm::linkStatepointExampleGC(); + llvm::linkAllBuiltinGCs(); (void) llvm::createBURRListDAGScheduler(nullptr, llvm::CodeGenOpt::Default); diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h index 02b2f1be648a726d929b4595ebe5673c95c7175b..c2706a21a1778f1f1ec8bbf1ecaafd126ab99a2c 100644 --- a/include/llvm/CodeGen/MachineFrameInfo.h +++ b/include/llvm/CodeGen/MachineFrameInfo.h @@ -28,9 +28,14 @@ class AllocaInst; /// The CalleeSavedInfo class tracks the information need to locate where a /// callee saved register is in the current frame. +/// Callee saved reg can also be saved to a different register rather than +/// on the stack by setting DstReg instead of FrameIdx. class CalleeSavedInfo { unsigned Reg; - int FrameIdx; + union { + int FrameIdx; + unsigned DstReg; + }; /// Flag indicating whether the register is actually restored in the epilog. /// In most cases, if a register is saved, it is also restored. There are /// some situations, though, when this is not the case. For example, the @@ -44,17 +49,29 @@ class CalleeSavedInfo { /// by implicit uses on the return instructions, however, the required /// changes in the ARM backend would be quite extensive. bool Restored; + /// Flag indicating whether the register is spilled to stack or another + /// register. + bool SpilledToReg; public: explicit CalleeSavedInfo(unsigned R, int FI = 0) - : Reg(R), FrameIdx(FI), Restored(true) {} + : Reg(R), FrameIdx(FI), Restored(true), SpilledToReg(false) {} // Accessors. unsigned getReg() const { return Reg; } int getFrameIdx() const { return FrameIdx; } - void setFrameIdx(int FI) { FrameIdx = FI; } + unsigned getDstReg() const { return DstReg; } + void setFrameIdx(int FI) { + FrameIdx = FI; + SpilledToReg = false; + } + void setDstReg(unsigned SpillReg) { + DstReg = SpillReg; + SpilledToReg = true; + } bool isRestored() const { return Restored; } void setRestored(bool R) { Restored = R; } + bool isSpilledToReg() const { return SpilledToReg; } }; /// The MachineFrameInfo class represents an abstract stack frame until @@ -271,9 +288,9 @@ private: unsigned CVBytesOfCalleeSavedRegisters = 0; /// The prolog/epilog code inserter fills in this vector with each - /// callee saved register saved in the frame. Beyond its use by the prolog/ - /// epilog code inserter, this data used for debug info and exception - /// handling. + /// callee saved register saved in either the frame or a different + /// register. Beyond its use by the prolog/ epilog code inserter, + /// this data is used for debug info and exception handling. std::vector CSInfo; /// Has CSInfo been set yet? diff --git a/include/llvm/CodeGen/MachineOutliner.h b/include/llvm/CodeGen/MachineOutliner.h index eaa741353abb0c4cb27298ac95bfe5bdeedd8dc5..bfd1e994053a75a0c4a8ccce324abbe752431a3b 100644 --- a/include/llvm/CodeGen/MachineOutliner.h +++ b/include/llvm/CodeGen/MachineOutliner.h @@ -61,9 +61,6 @@ public: /// \p OutlinedFunctions. unsigned FunctionIdx; - /// Set to false if the candidate overlapped with another candidate. - bool InCandidateList = true; - /// Identifier denoting the instructions to emit to call an outlined function /// from this point. Defined by the target. unsigned CallConstructionID; @@ -82,6 +79,12 @@ public: /// been used across the sequence. LiveRegUnits UsedInSequence; + /// Target-specific flags for this Candidate's MBB. + unsigned Flags = 0x0; + + /// True if initLRU has been called on this Candidate. + bool LRUWasSet = false; + /// Return the number of instructions in this Candidate. unsigned getLength() const { return Len; } @@ -99,9 +102,7 @@ public: } /// Returns the call overhead of this candidate if it is in the list. - unsigned getCallOverhead() const { - return InCandidateList ? CallOverhead : 0; - } + unsigned getCallOverhead() const { return CallOverhead; } MachineBasicBlock::iterator &front() { return FirstInst; } MachineBasicBlock::iterator &back() { return LastInst; } @@ -120,9 +121,9 @@ public: Candidate(unsigned StartIdx, unsigned Len, MachineBasicBlock::iterator &FirstInst, MachineBasicBlock::iterator &LastInst, MachineBasicBlock *MBB, - unsigned FunctionIdx) + unsigned FunctionIdx, unsigned Flags) : StartIdx(StartIdx), Len(Len), FirstInst(FirstInst), LastInst(LastInst), - MBB(MBB), FunctionIdx(FunctionIdx) {} + MBB(MBB), FunctionIdx(FunctionIdx), Flags(Flags) {} Candidate() {} /// Used to ensure that \p Candidates are outlined in an order that @@ -138,6 +139,10 @@ public: void initLRU(const TargetRegisterInfo &TRI) { assert(MBB->getParent()->getRegInfo().tracksLiveness() && "Candidate's Machine Function must track liveness"); + // Only initialize once. + if (LRUWasSet) + return; + LRUWasSet = true; LRU.init(TRI); LRU.addLiveOuts(*MBB); @@ -158,21 +163,13 @@ public: /// class of candidate. struct OutlinedFunction { -private: - /// The number of candidates for this \p OutlinedFunction. - unsigned OccurrenceCount = 0; - public: - std::vector> Candidates; + std::vector Candidates; /// The actual outlined function created. /// This is initialized after we go through and create the actual function. MachineFunction *MF = nullptr; - /// The sequence of integers corresponding to the instructions in this - /// function. - std::vector Sequence; - /// Represents the size of a sequence in bytes. (Some instructions vary /// widely in size, so just counting the instructions isn't very useful.) unsigned SequenceSize; @@ -184,49 +181,41 @@ public: unsigned FrameConstructionID; /// Return the number of candidates for this \p OutlinedFunction. - unsigned getOccurrenceCount() { return OccurrenceCount; } - - /// Decrement the occurrence count of this OutlinedFunction and return the - /// new count. - unsigned decrement() { - assert(OccurrenceCount > 0 && "Can't decrement an empty function!"); - OccurrenceCount--; - return getOccurrenceCount(); - } + unsigned getOccurrenceCount() const { return Candidates.size(); } /// Return the number of bytes it would take to outline this /// function. - unsigned getOutliningCost() { + unsigned getOutliningCost() const { unsigned CallOverhead = 0; - for (std::shared_ptr &C : Candidates) - CallOverhead += C->getCallOverhead(); + for (const Candidate &C : Candidates) + CallOverhead += C.getCallOverhead(); return CallOverhead + SequenceSize + FrameOverhead; } /// Return the size in bytes of the unoutlined sequences. - unsigned getNotOutlinedCost() { return OccurrenceCount * SequenceSize; } + unsigned getNotOutlinedCost() const { + return getOccurrenceCount() * SequenceSize; + } /// Return the number of instructions that would be saved by outlining /// this function. - unsigned getBenefit() { + unsigned getBenefit() const { unsigned NotOutlinedCost = getNotOutlinedCost(); unsigned OutlinedCost = getOutliningCost(); return (NotOutlinedCost < OutlinedCost) ? 0 : NotOutlinedCost - OutlinedCost; } - OutlinedFunction(std::vector &Cands, - unsigned SequenceSize, unsigned FrameOverhead, - unsigned FrameConstructionID) - : SequenceSize(SequenceSize), FrameOverhead(FrameOverhead), - FrameConstructionID(FrameConstructionID) { - OccurrenceCount = Cands.size(); - for (Candidate &C : Cands) - Candidates.push_back(std::make_shared(C)); - - unsigned B = getBenefit(); - for (std::shared_ptr &C : Candidates) - C->Benefit = B; + /// Return the number of instructions in this sequence. + unsigned getNumInstrs() const { return Candidates[0].getLength(); } + + OutlinedFunction(std::vector &Candidates, unsigned SequenceSize, + unsigned FrameOverhead, unsigned FrameConstructionID) + : Candidates(Candidates), SequenceSize(SequenceSize), + FrameOverhead(FrameOverhead), FrameConstructionID(FrameConstructionID) { + const unsigned B = getBenefit(); + for (Candidate &C : Candidates) + C.Benefit = B; } OutlinedFunction() {} diff --git a/include/llvm/CodeGen/MachinePassRegistry.h b/include/llvm/CodeGen/MachinePassRegistry.h index 3aba0bba7d1a3c8dfbfbca05052bde12aef97da8..a031c92d914f408193d430aed0480ab042e26c36 100644 --- a/include/llvm/CodeGen/MachinePassRegistry.h +++ b/include/llvm/CodeGen/MachinePassRegistry.h @@ -24,22 +24,20 @@ namespace llvm { -using MachinePassCtor = void *(*)(); - //===----------------------------------------------------------------------===// /// /// MachinePassRegistryListener - Listener to adds and removals of nodes in /// registration list. /// //===----------------------------------------------------------------------===// -class MachinePassRegistryListener { - virtual void anchor(); +template class MachinePassRegistryListener { + virtual void anchor() {} public: MachinePassRegistryListener() = default; virtual ~MachinePassRegistryListener() = default; - virtual void NotifyAdd(StringRef N, MachinePassCtor C, StringRef D) = 0; + virtual void NotifyAdd(StringRef N, PassCtorTy C, StringRef D) = 0; virtual void NotifyRemove(StringRef N) = 0; }; @@ -48,15 +46,15 @@ public: /// MachinePassRegistryNode - Machine pass node stored in registration list. /// //===----------------------------------------------------------------------===// -class MachinePassRegistryNode { +template class MachinePassRegistryNode { private: MachinePassRegistryNode *Next = nullptr; // Next function pass in list. StringRef Name; // Name of function pass. StringRef Description; // Description string. - MachinePassCtor Ctor; // Function pass creator. + PassCtorTy Ctor; // Pass creator. public: - MachinePassRegistryNode(const char *N, const char *D, MachinePassCtor C) + MachinePassRegistryNode(const char *N, const char *D, PassCtorTy C) : Name(N), Description(D), Ctor(C) {} // Accessors @@ -64,7 +62,7 @@ public: MachinePassRegistryNode **getNextAddress() { return &Next; } StringRef getName() const { return Name; } StringRef getDescription() const { return Description; } - MachinePassCtor getCtor() const { return Ctor; } + PassCtorTy getCtor() const { return Ctor; } void setNext(MachinePassRegistryNode *N) { Next = N; } }; @@ -73,11 +71,12 @@ public: /// MachinePassRegistry - Track the registration of machine passes. /// //===----------------------------------------------------------------------===// -class MachinePassRegistry { +template class MachinePassRegistry { private: - MachinePassRegistryNode *List; // List of registry nodes. - MachinePassCtor Default; // Default function pass creator. - MachinePassRegistryListener *Listener; // Listener for list adds are removes. + MachinePassRegistryNode *List; // List of registry nodes. + PassCtorTy Default; // Default function pass creator. + MachinePassRegistryListener + *Listener; // Listener for list adds are removes. public: // NO CONSTRUCTOR - we don't want static constructor ordering to mess @@ -85,19 +84,47 @@ public: // Accessors. // - MachinePassRegistryNode *getList() { return List; } - MachinePassCtor getDefault() { return Default; } - void setDefault(MachinePassCtor C) { Default = C; } - void setDefault(StringRef Name); - void setListener(MachinePassRegistryListener *L) { Listener = L; } + MachinePassRegistryNode *getList() { return List; } + PassCtorTy getDefault() { return Default; } + void setDefault(PassCtorTy C) { Default = C; } + /// setDefault - Set the default constructor by name. + void setDefault(StringRef Name) { + PassCtorTy Ctor = nullptr; + for (MachinePassRegistryNode *R = getList(); R; + R = R->getNext()) { + if (R->getName() == Name) { + Ctor = R->getCtor(); + break; + } + } + assert(Ctor && "Unregistered pass name"); + setDefault(Ctor); + } + void setListener(MachinePassRegistryListener *L) { Listener = L; } /// Add - Adds a function pass to the registration list. /// - void Add(MachinePassRegistryNode *Node); + void Add(MachinePassRegistryNode *Node) { + Node->setNext(List); + List = Node; + if (Listener) + Listener->NotifyAdd(Node->getName(), Node->getCtor(), + Node->getDescription()); + } /// Remove - Removes a function pass from the registration list. /// - void Remove(MachinePassRegistryNode *Node); + void Remove(MachinePassRegistryNode *Node) { + for (MachinePassRegistryNode **I = &List; *I; + I = (*I)->getNextAddress()) { + if (*I == Node) { + if (Listener) + Listener->NotifyRemove(Node->getName()); + *I = (*I)->getNext(); + break; + } + } + } }; //===----------------------------------------------------------------------===// @@ -105,9 +132,11 @@ public: /// RegisterPassParser class - Handle the addition of new machine passes. /// //===----------------------------------------------------------------------===// -template -class RegisterPassParser : public MachinePassRegistryListener, - public cl::parser { +template +class RegisterPassParser + : public MachinePassRegistryListener< + typename RegistryClass::FunctionPassCtor>, + public cl::parser { public: RegisterPassParser(cl::Option &O) : cl::parser(O) {} @@ -129,8 +158,9 @@ public: } // Implement the MachinePassRegistryListener callbacks. - void NotifyAdd(StringRef N, MachinePassCtor C, StringRef D) override { - this->addLiteralOption(N, (typename RegistryClass::FunctionPassCtor)C, D); + void NotifyAdd(StringRef N, typename RegistryClass::FunctionPassCtor C, + StringRef D) override { + this->addLiteralOption(N, C, D); } void NotifyRemove(StringRef N) override { this->removeLiteralOption(N); diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index 6003b1b6b82c334ccc81fbc4c282e98ea79985bd..4bc31ae7c61aca6eecb0a7d2349f2c21852d0dbc 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -132,17 +132,19 @@ struct MachineSchedContext { /// MachineSchedRegistry provides a selection of available machine instruction /// schedulers. -class MachineSchedRegistry : public MachinePassRegistryNode { +class MachineSchedRegistry + : public MachinePassRegistryNode< + ScheduleDAGInstrs *(*)(MachineSchedContext *)> { public: using ScheduleDAGCtor = ScheduleDAGInstrs *(*)(MachineSchedContext *); // RegisterPassParser requires a (misnamed) FunctionPassCtor type. using FunctionPassCtor = ScheduleDAGCtor; - static MachinePassRegistry Registry; + static MachinePassRegistry Registry; MachineSchedRegistry(const char *N, const char *D, ScheduleDAGCtor C) - : MachinePassRegistryNode(N, D, (MachinePassCtor)C) { + : MachinePassRegistryNode(N, D, C) { Registry.Add(this); } @@ -158,7 +160,7 @@ public: return (MachineSchedRegistry *)Registry.getList(); } - static void setListener(MachinePassRegistryListener *L) { + static void setListener(MachinePassRegistryListener *L) { Registry.setListener(L); } }; @@ -792,7 +794,7 @@ public: /// Represent the type of SchedCandidate found within a single queue. /// pickNodeBidirectional depends on these listed by decreasing priority. enum CandReason : uint8_t { - NoCand, Only1, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, + NoCand, Only1, PhysReg, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; @@ -926,7 +928,7 @@ bool tryPressure(const PressureChange &TryP, const TargetRegisterInfo *TRI, const MachineFunction &MF); unsigned getWeakLeft(const SUnit *SU, bool isTop); -int biasPhysRegCopy(const SUnit *SU, bool isTop); +int biasPhysReg(const SUnit *SU, bool isTop); /// GenericScheduler shrinks the unscheduled zone using heuristics to balance /// the schedule. @@ -1004,7 +1006,7 @@ protected: const RegPressureTracker &RPTracker, SchedCandidate &Candidate); - void reschedulePhysRegCopies(SUnit *SU, bool isTop); + void reschedulePhysReg(SUnit *SU, bool isTop); }; /// PostGenericScheduler - Interface to the scheduling algorithm used by diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index cb12b14f4435ef152edadf7789d435f36fe44f70..acf1ebb5bc8396a203e0dd501a5473412ecdde4c 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -379,14 +379,20 @@ namespace llvm { /// FunctionPass *createInterleavedAccessPass(); + /// InterleavedLoadCombines Pass - This pass identifies interleaved loads and + /// combines them into wide loads detectable by InterleavedAccessPass + /// + FunctionPass *createInterleavedLoadCombinePass(); + /// LowerEmuTLS - This pass generates __emutls_[vt].xyz variables for all /// TLS variables for the emulated TLS model. /// ModulePass *createLowerEmuTLSPass(); - /// This pass lowers the \@llvm.load.relative intrinsic to instructions. - /// This is unsafe to do earlier because a pass may combine the constant - /// initializer into the load, which may result in an overflowing evaluation. + /// This pass lowers the \@llvm.load.relative and \@llvm.objc.* intrinsics to + /// instructions. This is unsafe to do earlier because a pass may combine the + /// constant initializer into the load, which may result in an overflowing + /// evaluation. ModulePass *createPreISelIntrinsicLoweringPass(); /// GlobalMerge - This pass merges internal (by default) globals into structs diff --git a/include/llvm/CodeGen/PreISelIntrinsicLowering.h b/include/llvm/CodeGen/PreISelIntrinsicLowering.h index 7a007eb8bceac52c9be510d757bc272ae8c59998..b7f83e515b7e95acc0c510609bd1d1fbe6ca0b40 100644 --- a/include/llvm/CodeGen/PreISelIntrinsicLowering.h +++ b/include/llvm/CodeGen/PreISelIntrinsicLowering.h @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This pass implements IR lowering for the llvm.load.relative intrinsic. +// This pass implements IR lowering for the llvm.load.relative and llvm.objc.* +// intrinsics. // //===----------------------------------------------------------------------===// #ifndef LLVM_CODEGEN_PREISELINTRINSICLOWERING_H diff --git a/include/llvm/CodeGen/RegAllocRegistry.h b/include/llvm/CodeGen/RegAllocRegistry.h index 481747dc163eea6a997e9b6e82b62580fc429c33..b518fbb9c9da66f677b581d8291cac198f4a05b0 100644 --- a/include/llvm/CodeGen/RegAllocRegistry.h +++ b/include/llvm/CodeGen/RegAllocRegistry.h @@ -26,14 +26,14 @@ class FunctionPass; /// RegisterRegAlloc class - Track the registration of register allocators. /// //===----------------------------------------------------------------------===// -class RegisterRegAlloc : public MachinePassRegistryNode { +class RegisterRegAlloc : public MachinePassRegistryNode { public: using FunctionPassCtor = FunctionPass *(*)(); - static MachinePassRegistry Registry; + static MachinePassRegistry Registry; RegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) - : MachinePassRegistryNode(N, D, (MachinePassCtor)C) { + : MachinePassRegistryNode(N, D, C) { Registry.Add(this); } @@ -48,15 +48,11 @@ public: return (RegisterRegAlloc *)Registry.getList(); } - static FunctionPassCtor getDefault() { - return (FunctionPassCtor)Registry.getDefault(); - } + static FunctionPassCtor getDefault() { return Registry.getDefault(); } - static void setDefault(FunctionPassCtor C) { - Registry.setDefault((MachinePassCtor)C); - } + static void setDefault(FunctionPassCtor C) { Registry.setDefault(C); } - static void setListener(MachinePassRegistryListener *L) { + static void setListener(MachinePassRegistryListener *L) { Registry.setListener(L); } }; diff --git a/include/llvm/CodeGen/SchedulerRegistry.h b/include/llvm/CodeGen/SchedulerRegistry.h index badf927d0e95618b7f5c091649f01f24e0893045..fbe559f25556c86f09999b82601622c309228382 100644 --- a/include/llvm/CodeGen/SchedulerRegistry.h +++ b/include/llvm/CodeGen/SchedulerRegistry.h @@ -29,16 +29,19 @@ namespace llvm { class ScheduleDAGSDNodes; class SelectionDAGISel; -class RegisterScheduler : public MachinePassRegistryNode { +class RegisterScheduler + : public MachinePassRegistryNode< + ScheduleDAGSDNodes *(*)(SelectionDAGISel *, CodeGenOpt::Level)> { public: using FunctionPassCtor = ScheduleDAGSDNodes *(*)(SelectionDAGISel*, CodeGenOpt::Level); - static MachinePassRegistry Registry; + static MachinePassRegistry Registry; RegisterScheduler(const char *N, const char *D, FunctionPassCtor C) - : MachinePassRegistryNode(N, D, (MachinePassCtor)C) - { Registry.Add(this); } + : MachinePassRegistryNode(N, D, C) { + Registry.Add(this); + } ~RegisterScheduler() { Registry.Remove(this); } @@ -51,7 +54,7 @@ public: return (RegisterScheduler *)Registry.getList(); } - static void setListener(MachinePassRegistryListener *L) { + static void setListener(MachinePassRegistryListener *L) { Registry.setListener(L); } }; diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index 3b144b92e2a6eb93499429e32741c2f93ec2d911..8093fdab5491eb10787fb091b0962022c09abdcc 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -929,41 +929,45 @@ public: Type *SizeTy, unsigned ElemSz, bool isTailCall, MachinePointerInfo DstPtrInfo); - /// Helper function to make it easier to build SetCC's if you just - /// have an ISD::CondCode instead of an SDValue. - /// + /// Helper function to make it easier to build SetCC's if you just have an + /// ISD::CondCode instead of an SDValue. SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond) { assert(LHS.getValueType().isVector() == RHS.getValueType().isVector() && - "Cannot compare scalars to vectors"); + "Cannot compare scalars to vectors"); assert(LHS.getValueType().isVector() == VT.isVector() && - "Cannot compare scalars to vectors"); + "Cannot compare scalars to vectors"); assert(Cond != ISD::SETCC_INVALID && - "Cannot create a setCC of an invalid node."); + "Cannot create a setCC of an invalid node."); return getNode(ISD::SETCC, DL, VT, LHS, RHS, getCondCode(Cond)); } - /// Helper function to make it easier to build Select's if you just - /// have operands and don't want to check for vector. + /// Helper function to make it easier to build Select's if you just have + /// operands and don't want to check for vector. SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS) { assert(LHS.getValueType() == RHS.getValueType() && "Cannot use select on differing types"); assert(VT.isVector() == LHS.getValueType().isVector() && "Cannot mix vectors and scalars"); - return getNode(Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, - Cond, LHS, RHS); + auto Opcode = Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT; + return getNode(Opcode, DL, VT, Cond, LHS, RHS); } - /// Helper function to make it easier to build SelectCC's if you - /// just have an ISD::CondCode instead of an SDValue. - /// + /// Helper function to make it easier to build SelectCC's if you just have an + /// ISD::CondCode instead of an SDValue. SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond) { - return getNode(ISD::SELECT_CC, DL, True.getValueType(), - LHS, RHS, True, False, getCondCode(Cond)); + return getNode(ISD::SELECT_CC, DL, True.getValueType(), LHS, RHS, True, + False, getCondCode(Cond)); } + /// Try to simplify a select/vselect into 1 of its operands or a constant. + SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal); + + /// Try to simplify a shift into 1 of its operands or a constant. + SDValue simplifyShift(SDValue X, SDValue Y); + /// VAArg produces a result and token chain, and takes a pointer /// and a source value as input. SDValue getVAArg(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, @@ -1511,6 +1515,18 @@ public: /// allow an 'add' to be transformed into an 'or'. bool haveNoCommonBitsSet(SDValue A, SDValue B) const; + /// Test whether \p V has a splatted value for all the demanded elements. + /// + /// On success \p UndefElts will indicate the elements that have UNDEF + /// values instead of the splat value, this is only guaranteed to be correct + /// for \p DemandedElts. + /// + /// NOTE: The function will return true for a demanded splat of UNDEF values. + bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts); + + /// Test whether \p V has a splatted value. + bool isSplatValue(SDValue V, bool AllowUndefs = false); + /// Match a binop + shuffle pyramid that represents a horizontal reduction /// over the elements of a vector starting from the EXTRACT_VECTOR_ELT node /p /// Extract. The reduction must use one of the opcodes listed in /p diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h index 86df0af7303fe4f536cb86db045da26dc339504f..6758c55c696afa4b8030b177d3b0d82acb1ce849 100644 --- a/include/llvm/CodeGen/SelectionDAGISel.h +++ b/include/llvm/CodeGen/SelectionDAGISel.h @@ -132,6 +132,7 @@ public: OPC_CheckChild2Same, OPC_CheckChild3Same, OPC_CheckPatternPredicate, OPC_CheckPredicate, + OPC_CheckPredicateWithOperands, OPC_CheckOpcode, OPC_SwitchOpcode, OPC_CheckType, @@ -267,6 +268,17 @@ public: llvm_unreachable("Tblgen should generate the implementation of this!"); } + /// CheckNodePredicateWithOperands - This function is generated by tblgen in + /// the target. + /// It runs node predicate number PredNo and returns true if it succeeds or + /// false if it fails. The number is a private implementation detail to the + /// code tblgen produces. + virtual bool CheckNodePredicateWithOperands( + SDNode *N, unsigned PredNo, + const SmallVectorImpl &Operands) const { + llvm_unreachable("Tblgen should generate the implementation of this!"); + } + virtual bool CheckComplexPattern(SDNode *Root, SDNode *Parent, SDValue N, unsigned PatternNo, SmallVectorImpl > &Result) { diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index d125e888a5742d145f10ea43ec8046e917dcc078..2931717b72deb1dafe45c442a5e904a57ea84fc8 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1613,6 +1613,21 @@ ConstantSDNode *isConstOrConstSplat(SDValue N, bool AllowUndefs = false); /// Returns the SDNode if it is a constant splat BuildVector or constant float. ConstantFPSDNode *isConstOrConstSplatFP(SDValue N, bool AllowUndefs = false); +/// Return true if the value is a constant 0 integer or a splatted vector of +/// a constant 0 integer (with no undefs). +/// Build vector implicit truncation is not an issue for null values. +bool isNullOrNullSplat(SDValue V); + +/// Return true if the value is a constant 1 integer or a splatted vector of a +/// constant 1 integer (with no undefs). +/// Does not permit build vector implicit truncation. +bool isOneOrOneSplat(SDValue V); + +/// Return true if the value is a constant -1 integer or a splatted vector of a +/// constant -1 integer (with no undefs). +/// Does not permit build vector implicit truncation. +bool isAllOnesOrAllOnesSplat(SDValue V); + class GlobalAddressSDNode : public SDNode { friend class SelectionDAG; diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h index e584a4136e4fb39fe9e02a51c7f2afb59fea1bc7..8be9ae378557eafe01c4321e0b0d6c4dd0e98146 100644 --- a/include/llvm/CodeGen/StackMaps.h +++ b/include/llvm/CodeGen/StackMaps.h @@ -236,25 +236,6 @@ public: FnInfos.clear(); } - /// Generate a stackmap record for a stackmap instruction. - /// - /// MI must be a raw STACKMAP, not a PATCHPOINT. - void recordStackMap(const MachineInstr &MI); - - /// Generate a stackmap record for a patchpoint instruction. - void recordPatchPoint(const MachineInstr &MI); - - /// Generate a stackmap record for a statepoint instruction. - void recordStatepoint(const MachineInstr &MI); - - /// If there is any stack map data, create a stack map section and serialize - /// the map info into it. This clears the stack map data structures - /// afterwards. - void serializeToStackMapSection(); - -private: - static const char *WSMP; - using LocationVec = SmallVector; using LiveOutVec = SmallVector; using ConstantPool = MapVector; @@ -283,6 +264,31 @@ private: using FnInfoMap = MapVector; using CallsiteInfoList = std::vector; + /// Generate a stackmap record for a stackmap instruction. + /// + /// MI must be a raw STACKMAP, not a PATCHPOINT. + void recordStackMap(const MachineInstr &MI); + + /// Generate a stackmap record for a patchpoint instruction. + void recordPatchPoint(const MachineInstr &MI); + + /// Generate a stackmap record for a statepoint instruction. + void recordStatepoint(const MachineInstr &MI); + + /// If there is any stack map data, create a stack map section and serialize + /// the map info into it. This clears the stack map data structures + /// afterwards. + void serializeToStackMapSection(); + + /// Get call site info. + CallsiteInfoList &getCSInfos() { return CSInfos; } + + /// Get function info. + FnInfoMap &getFnInfos() { return FnInfos; } + +private: + static const char *WSMP; + AsmPrinter &AP; CallsiteInfoList CSInfos; ConstantPool ConstPool; diff --git a/include/llvm/CodeGen/TargetInstrInfo.h b/include/llvm/CodeGen/TargetInstrInfo.h index 1fa3de421eacffa5b6eda502c1a192dea92302dc..961b90e9bc12d2e10f1537c53a38c786256f439c 100644 --- a/include/llvm/CodeGen/TargetInstrInfo.h +++ b/include/llvm/CodeGen/TargetInstrInfo.h @@ -1136,11 +1136,11 @@ public: return false; } - /// Get the base register and byte offset of an instruction that reads/writes + /// Get the base operand and byte offset of an instruction that reads/writes /// memory. - virtual bool getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg, - int64_t &Offset, - const TargetRegisterInfo *TRI) const { + virtual bool getMemOperandWithOffset(MachineInstr &MI, + MachineOperand *&BaseOp, int64_t &Offset, + const TargetRegisterInfo *TRI) const { return false; } @@ -1164,8 +1164,8 @@ public: /// or /// DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); /// to TargetPassConfig::createMachineScheduler() to have an effect. - virtual bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1, - MachineInstr &SecondLdSt, unsigned BaseReg2, + virtual bool shouldClusterMemOps(MachineOperand &BaseOp1, + MachineOperand &BaseOp2, unsigned NumLoads) const { llvm_unreachable("target did not implement shouldClusterMemOps()"); } @@ -1635,10 +1635,11 @@ public: "Target didn't implement TargetInstrInfo::getOutliningType!"); } - /// Returns target-defined flags defining properties of the MBB for - /// the outliner. - virtual unsigned getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const { - return 0x0; + /// Optional target hook that returns true if \p MBB is safe to outline from, + /// and returns any target-specific information in \p Flags. + virtual bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, + unsigned &Flags) const { + return true; } /// Insert a custom frame for outlined functions. diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h index 38e575b1360fb1e6d38eec123d348ef058d8468b..d4e6da80a4c8f8cc23ec2a7e888d3e7a0daadbc6 100644 --- a/include/llvm/CodeGen/TargetLowering.h +++ b/include/llvm/CodeGen/TargetLowering.h @@ -269,6 +269,14 @@ public: return true; } + /// Return true if it is profitable to convert a select of FP constants into + /// a constant pool load whose address depends on the select condition. The + /// parameter may be used to differentiate a select with FP compare from + /// integer compare. + virtual bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const { + return true; + } + /// Return true if multiple condition registers are available. bool hasMultipleConditionRegisters() const { return HasMultipleConditionRegisters; @@ -797,6 +805,38 @@ public: return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op]; } + /// Custom method defined by each target to indicate if an operation which + /// may require a scale is supported natively by the target. + /// If not, the operation is illegal. + virtual bool isSupportedFixedPointOperation(unsigned Op, EVT VT, + unsigned Scale) const { + return false; + } + + /// Some fixed point operations may be natively supported by the target but + /// only for specific scales. This method allows for checking + /// if the width is supported by the target for a given operation that may + /// depend on scale. + LegalizeAction getFixedPointOperationAction(unsigned Op, EVT VT, + unsigned Scale) const { + auto Action = getOperationAction(Op, VT); + if (Action != Legal) + return Action; + + // This operation is supported in this type but may only work on specific + // scales. + bool Supported; + switch (Op) { + default: + llvm_unreachable("Unexpected fixed point operation."); + case ISD::SMULFIX: + Supported = isSupportedFixedPointOperation(Op, VT, Scale); + break; + } + + return Supported ? Action : Expand; + } + LegalizeAction getStrictFPOperationAction(unsigned Op, EVT VT) const { unsigned EqOpc; switch (Op) { @@ -1213,13 +1253,15 @@ public: /// reduce runtime. virtual bool ShouldShrinkFPConstant(EVT) const { return true; } - // Return true if it is profitable to reduce the given load node to a smaller - // type. - // - // e.g. (i16 (trunc (i32 (load x))) -> i16 load x should be performed - virtual bool shouldReduceLoadWidth(SDNode *Load, - ISD::LoadExtType ExtTy, + /// Return true if it is profitable to reduce a load to a smaller type. + /// Example: (i16 (trunc (i32 (load x))) -> i16 load x + virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { + // By default, assume that it is cheaper to extract a subvector from a wide + // vector load rather than creating multiple narrow vector loads. + if (NewVT.isVector() && !Load->hasOneUse()) + return false; + return true; } @@ -1736,6 +1778,16 @@ public: return false; } + /// Return true if it is more correct/profitable to use strict FP_TO_INT + /// conversion operations - canonicalizing the FP source value instead of + /// converting all cases and then selecting based on value. + /// This may be true if the target throws exceptions for out of bounds + /// conversions or has fast FP CMOV. + virtual bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, + bool IsSigned) const { + return false; + } + //===--------------------------------------------------------------------===// // TargetLowering Configuration Methods - These methods should be invoked by // the derived class constructor to configure this object for the target. @@ -2210,6 +2262,12 @@ public: return false; } + /// Return true if sign-extension from FromTy to ToTy is cheaper than + /// zero-extension. + virtual bool isSExtCheaperThanZExt(EVT FromTy, EVT ToTy) const { + return false; + } + /// Return true if the target supplies and combines to a paired load /// two loaded values of type LoadedType next to each other in memory. /// RequiredAlignment gives the minimal alignment constraints that must be met @@ -2841,22 +2899,28 @@ public: bool SimplifyDemandedBits(SDNode *User, unsigned OpIdx, const APInt &Demanded, DAGCombinerInfo &DCI, TargetLoweringOpt &TLO) const; - /// Look at Op. At this point, we know that only the DemandedMask bits of the + /// Look at Op. At this point, we know that only the DemandedBits bits of the /// result of Op are ever used downstream. If we can use this information to /// simplify Op, create a new simplified DAG node and return true, returning /// the original and new nodes in Old and New. Otherwise, analyze the /// expression and return a mask of KnownOne and KnownZero bits for the /// expression (used to simplify the caller). The KnownZero/One bits may only - /// be accurate for those bits in the DemandedMask. + /// be accurate for those bits in the Demanded masks. /// \p AssumeSingleUse When this parameter is true, this function will /// attempt to simplify \p Op even if there are multiple uses. /// Callers are responsible for correctly updating the DAG based on the /// results of this function, because simply replacing replacing TLO.Old /// with TLO.New will be incorrect when this parameter is true and TLO.Old /// has multiple uses. - bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask, - KnownBits &Known, - TargetLoweringOpt &TLO, + bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, + const APInt &DemandedElts, KnownBits &Known, + TargetLoweringOpt &TLO, unsigned Depth = 0, + bool AssumeSingleUse = false) const; + + /// Helper wrapper around SimplifyDemandedBits, demanding all elements. + /// Adds Op back to the worklist upon success. + bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, + KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth = 0, bool AssumeSingleUse = false) const; @@ -2927,13 +2991,14 @@ public: SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const; - /// Attempt to simplify any target nodes based on the demanded bits, + /// Attempt to simplify any target nodes based on the demanded bits/elts, /// returning true on success. Otherwise, analyze the /// expression and return a mask of KnownOne and KnownZero bits for the /// expression (used to simplify the caller). The KnownZero/One bits may only - /// be accurate for those bits in the DemandedMask. + /// be accurate for those bits in the Demanded masks. virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, + const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth = 0) const; @@ -3663,6 +3728,18 @@ public: SDValue LL = SDValue(), SDValue LH = SDValue(), SDValue RL = SDValue(), SDValue RH = SDValue()) const; + /// Expand funnel shift. + /// \param N Node to expand + /// \param Result output after conversion + /// \returns True, if the expansion was successful, false otherwise + bool expandFunnelShift(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; + + /// Expand rotations. + /// \param N Node to expand + /// \param Result output after conversion + /// \returns True, if the expansion was successful, false otherwise + bool expandROT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; + /// Expand float(f32) to SINT(i64) conversion /// \param N Node to expand /// \param Result output after conversion @@ -3743,10 +3820,15 @@ public: SDValue Index) const; /// Method for building the DAG expansion of ISD::[US][ADD|SUB]SAT. This - /// method accepts integers or vectors of integers as its arguments. + /// method accepts integers as its arguments. SDValue getExpandedSaturationAdditionSubtraction(SDNode *Node, SelectionDAG &DAG) const; + /// Method for building the DAG expansion of ISD::SMULFIX. This method accepts + /// integers as its arguments. + SDValue getExpandedFixedPointMultiplication(SDNode *Node, + SelectionDAG &DAG) const; + //===--------------------------------------------------------------------===// // Instruction Emitting Hooks // diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h index f5c7fc824ab47864ec11bbbaebfede5745be5b8f..052d1f8bc6863c4f65c82b90776cd6fc22d651f8 100644 --- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h +++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h @@ -90,6 +90,8 @@ public: const MCExpr *lowerRelativeReference(const GlobalValue *LHS, const GlobalValue *RHS, const TargetMachine &TM) const override; + + MCSection *getSectionForCommandLines() const override; }; class TargetLoweringObjectFileMachO : public TargetLoweringObjectFile { diff --git a/include/llvm/CodeGen/TargetPassConfig.h b/include/llvm/CodeGen/TargetPassConfig.h index 7fda8751d40ae9a296b0ca177c37f29f5ac854db..3288711a335d2a8ef31fe4ea307a8ba52565e62d 100644 --- a/include/llvm/CodeGen/TargetPassConfig.h +++ b/include/llvm/CodeGen/TargetPassConfig.h @@ -90,6 +90,19 @@ private: AnalysisID StartAfter = nullptr; AnalysisID StopBefore = nullptr; AnalysisID StopAfter = nullptr; + + unsigned StartBeforeInstanceNum = 0; + unsigned StartBeforeCount = 0; + + unsigned StartAfterInstanceNum = 0; + unsigned StartAfterCount = 0; + + unsigned StopBeforeInstanceNum = 0; + unsigned StopBeforeCount = 0; + + unsigned StopAfterInstanceNum = 0; + unsigned StopAfterCount = 0; + bool Started = true; bool Stopped = false; bool AddingMachinePasses = false; diff --git a/include/llvm/CodeGen/WasmEHFuncInfo.h b/include/llvm/CodeGen/WasmEHFuncInfo.h index aa979349edfd01a493f81148f487267676d7b192..219fff988f6e0f974bfc7ad4e497d713ffc28ded 100644 --- a/include/llvm/CodeGen/WasmEHFuncInfo.h +++ b/include/llvm/CodeGen/WasmEHFuncInfo.h @@ -21,6 +21,8 @@ namespace llvm { +enum EventTag { CPP_EXCEPTION = 0, C_LONGJMP = 1 }; + using BBOrMBB = PointerUnion; struct WasmEHFuncInfo { diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake index aece421b77d28b032cc1b1f6d5bd9d6c9d189b22..03bbd74d6d325e1b42b095801eb9e2e3e7521c68 100644 --- a/include/llvm/Config/config.h.cmake +++ b/include/llvm/Config/config.h.cmake @@ -208,6 +208,12 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_TIME_H ${HAVE_SYS_TIME_H} +/* Define to 1 if stat struct has st_mtimespec member .*/ +#cmakedefine HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC} + +/* Define to 1 if stat struct has st_mtim member. */ +#cmakedefine HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC} + /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H} diff --git a/include/llvm/DebugInfo/CodeView/CVRecord.h b/include/llvm/DebugInfo/CodeView/CVRecord.h index 9dbeb438f4ae24d83a441cedf33240bc2213a7af..11ca9ff108de731553c6f581435b1b5a6cdde63a 100644 --- a/include/llvm/DebugInfo/CodeView/CVRecord.h +++ b/include/llvm/DebugInfo/CodeView/CVRecord.h @@ -45,13 +45,8 @@ public: return RecordData.drop_front(sizeof(RecordPrefix)); } - Optional hash() const { return Hash; } - - void setHash(uint32_t Value) { Hash = Value; } - Kind Type; ArrayRef RecordData; - Optional Hash; }; template struct RemappedRecord { diff --git a/include/llvm/DebugInfo/CodeView/CodeView.h b/include/llvm/DebugInfo/CodeView/CodeView.h index 4b96bc14ac1739a819bb2080df6c5cb479547823..8e0d9f608e93758161885e541fc5c9cc2f8123c4 100644 --- a/include/llvm/DebugInfo/CodeView/CodeView.h +++ b/include/llvm/DebugInfo/CodeView/CodeView.h @@ -358,7 +358,9 @@ enum class PointerOptions : uint32_t { Const = 0x00000400, Unaligned = 0x00000800, Restrict = 0x00001000, - WinRTSmartPointer = 0x00080000 + WinRTSmartPointer = 0x00080000, + LValueRefThisPointer = 0x00100000, + RValueRefThisPointer = 0x00200000 }; CV_DEFINE_ENUM_CLASS_FLAGS_OPERATORS(PointerOptions) diff --git a/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h b/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h index f74120a53608650c216fd3d4e41b80994a85ae9f..847d93f0e98585a8a4a8eb543b023400fc78bfd1 100644 --- a/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h +++ b/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h @@ -13,6 +13,7 @@ #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugSubsection.h" #include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" namespace llvm { @@ -31,10 +32,10 @@ public: FixedStreamArray::Iterator begin() const { return Frames.begin(); } FixedStreamArray::Iterator end() const { return Frames.end(); } - const uint32_t *getRelocPtr() const { return RelocPtr; } + const support::ulittle32_t *getRelocPtr() const { return RelocPtr; } private: - const uint32_t *RelocPtr = nullptr; + const support::ulittle32_t *RelocPtr = nullptr; FixedStreamArray Frames; }; diff --git a/include/llvm/DebugInfo/CodeView/RecordSerialization.h b/include/llvm/DebugInfo/CodeView/RecordSerialization.h index 58449c2c7565b9f8905d880d26d282afdc0502ed..36237e1a4d9e426a8bb1f995340b2d6df0db0b98 100644 --- a/include/llvm/DebugInfo/CodeView/RecordSerialization.h +++ b/include/llvm/DebugInfo/CodeView/RecordSerialization.h @@ -180,26 +180,6 @@ template serialize_numeric_impl serialize_numeric(T &Item) { return serialize_numeric_impl(Item); } -// This field is only present in the byte record if the condition is true. The -// condition is evaluated lazily, so it can depend on items that were -// deserialized -// earlier. -#define CV_CONDITIONAL_FIELD(I, C) \ - serialize_conditional(I, [&]() { return !!(C); }) - -// This is an array of N items, where N is evaluated lazily, so it can refer -// to a field deserialized earlier. -#define CV_ARRAY_FIELD_N(I, N) serialize_array(I, [&]() { return N; }) - -// This is an array that exhausts the remainder of the input buffer. -#define CV_ARRAY_FIELD_TAIL(I) serialize_array_tail(I) - -// This is an array that consumes null terminated strings until a double null -// is encountered. -#define CV_STRING_ARRAY_NULL_TERM(I) serialize_null_term_string_array(I) - -#define CV_NUMERIC_FIELD(I) serialize_numeric(I) - template Error consume(BinaryStreamReader &Reader, const serialize_conditional_impl &Item) { @@ -242,9 +222,6 @@ Error consume(BinaryStreamReader &Reader, T &&X, U &&Y, Args &&... Rest) { return consume(Reader, Y, std::forward(Rest)...); } -#define CV_DESERIALIZE(...) \ - if (auto EC = consume(__VA_ARGS__)) \ - return std::move(EC); } } diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/include/llvm/DebugInfo/CodeView/SymbolRecord.h index c63fb9871909ec40c9772861fea3524f73254e61..b58825c4a7887a90b54a13edd15ce7d3e87e8998 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolRecord.h +++ b/include/llvm/DebugInfo/CodeView/SymbolRecord.h @@ -400,6 +400,7 @@ public: uint16_t Module; StringRef Name; + uint16_t modi() const { return Module - 1; } uint32_t RecordOffset; }; diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h b/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h new file mode 100644 index 0000000000000000000000000000000000000000..3713fe118eaa9843e100ae60666d58dba4afad41 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/SymbolRecordHelpers.h @@ -0,0 +1,62 @@ +//===- SymbolRecordHelpers.h ------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_CODEVIEW_SYMBOLRECORDHELPERS_H +#define LLVM_DEBUGINFO_CODEVIEW_SYMBOLRECORDHELPERS_H + +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" + +namespace llvm { +namespace codeview { +/// Return true if this symbol opens a scope. This implies that the symbol has +/// "parent" and "end" fields, which contain the offset of the S_END or +/// S_INLINESITE_END record. +inline bool symbolOpensScope(SymbolKind Kind) { + switch (Kind) { + case SymbolKind::S_GPROC32: + case SymbolKind::S_LPROC32: + case SymbolKind::S_LPROC32_ID: + case SymbolKind::S_GPROC32_ID: + case SymbolKind::S_BLOCK32: + case SymbolKind::S_SEPCODE: + case SymbolKind::S_THUNK32: + case SymbolKind::S_INLINESITE: + case SymbolKind::S_INLINESITE2: + return true; + default: + break; + } + return false; +} + +/// Return true if this ssymbol ends a scope. +inline bool symbolEndsScope(SymbolKind Kind) { + switch (Kind) { + case SymbolKind::S_END: + case SymbolKind::S_PROC_ID_END: + case SymbolKind::S_INLINESITE_END: + return true; + default: + break; + } + return false; +} + +/// Given a symbol P for which symbolOpensScope(P) == true, return the +/// corresponding end offset. +uint32_t getScopeEndOffset(const CVSymbol &Symbol); +uint32_t getScopeParentOffset(const CVSymbol &Symbol); + +CVSymbolArray limitSymbolArrayToScope(const CVSymbolArray &Symbols, + uint32_t ScopeBegin); + +} // namespace codeview +} // namespace llvm + +#endif diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h index 76f1f98ab660ae678ed5e4a50877eed37212aa24..e7b5293dc51a419dfe9fee3f59fb9d8faa8dc153 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -264,14 +264,18 @@ public: // LF_POINTER class PointerRecord : public TypeRecord { public: + // ---------------------------XXXXX static const uint32_t PointerKindShift = 0; static const uint32_t PointerKindMask = 0x1F; + // ------------------------XXX----- static const uint32_t PointerModeShift = 5; static const uint32_t PointerModeMask = 0x07; - static const uint32_t PointerOptionMask = 0xFF; + // ----------XXX------XXXXX-------- + static const uint32_t PointerOptionMask = 0x381f00; + // -------------XXXXXX------------ static const uint32_t PointerSizeShift = 13; static const uint32_t PointerSizeMask = 0xFF; @@ -305,7 +309,7 @@ public: } PointerOptions getOptions() const { - return static_cast(Attrs); + return static_cast(Attrs & PointerOptionMask); } uint8_t getSize() const { @@ -334,6 +338,14 @@ public: return !!(Attrs & uint32_t(PointerOptions::Restrict)); } + bool isLValueReferenceThisPtr() const { + return !!(Attrs & uint32_t(PointerOptions::LValueRefThisPointer)); + } + + bool isRValueReferenceThisPtr() const { + return !!(Attrs & uint32_t(PointerOptions::RValueRefThisPointer)); + } + TypeIndex ReferentType; uint32_t Attrs; Optional MemberInfo; @@ -429,6 +441,10 @@ public: return (Options & ClassOptions::ForwardReference) != ClassOptions::None; } + bool containsNestedClass() const { + return (Options & ClassOptions::ContainsNestedClass) != ClassOptions::None; + } + bool isScoped() const { return (Options & ClassOptions::Scoped) != ClassOptions::None; } diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h index 221f1f796980f343eb92ce666b54c679f8b6896c..dbb6be04544be7a1ab204f52f55dcd484536e7c2 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -360,6 +360,10 @@ public: /// Dump Error as warning message to stderr. static void dumpWarning(Error Warning); + Triple::ArchType getArch() const { + return getDWARFObj().getFile()->getArch(); + } + private: /// Return the compile unit which contains instruction with provided /// address. diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h index ff1c7fb383898731de832f9e9bf65478ab3413d0..7dc07d774aba9cf08eb829af248e608b575e4690 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h @@ -13,6 +13,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/Support/Error.h" @@ -59,9 +60,11 @@ public: unsigned size() const { return (unsigned)Instructions.size(); } bool empty() const { return Instructions.empty(); } - CFIProgram(uint64_t CodeAlignmentFactor, int64_t DataAlignmentFactor) + CFIProgram(uint64_t CodeAlignmentFactor, int64_t DataAlignmentFactor, + Triple::ArchType Arch) : CodeAlignmentFactor(CodeAlignmentFactor), - DataAlignmentFactor(DataAlignmentFactor) {} + DataAlignmentFactor(DataAlignmentFactor), + Arch(Arch) {} /// Parse and store a sequence of CFI instructions from Data, /// starting at *Offset and ending at EndOffset. *Offset is updated @@ -76,6 +79,7 @@ private: std::vector Instructions; const uint64_t CodeAlignmentFactor; const int64_t DataAlignmentFactor; + Triple::ArchType Arch; /// Convenience method to add a new instruction with the given opcode. void addInstruction(uint8_t Opcode) { @@ -130,8 +134,9 @@ public: enum FrameKind { FK_CIE, FK_FDE }; FrameEntry(FrameKind K, uint64_t Offset, uint64_t Length, uint64_t CodeAlign, - int64_t DataAlign) - : Kind(K), Offset(Offset), Length(Length), CFIs(CodeAlign, DataAlign) {} + int64_t DataAlign, Triple::ArchType Arch) + : Kind(K), Offset(Offset), Length(Length), + CFIs(CodeAlign, DataAlign, Arch) {} virtual ~FrameEntry() {} @@ -168,9 +173,9 @@ public: int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister, SmallString<8> AugmentationData, uint32_t FDEPointerEncoding, uint32_t LSDAPointerEncoding, Optional Personality, - Optional PersonalityEnc) + Optional PersonalityEnc, Triple::ArchType Arch) : FrameEntry(FK_CIE, Offset, Length, CodeAlignmentFactor, - DataAlignmentFactor), + DataAlignmentFactor, Arch), Version(Version), Augmentation(std::move(Augmentation)), AddressSize(AddressSize), SegmentDescriptorSize(SegmentDescriptorSize), CodeAlignmentFactor(CodeAlignmentFactor), @@ -224,10 +229,11 @@ public: // is obtained lazily once it's actually required. FDE(uint64_t Offset, uint64_t Length, int64_t LinkedCIEOffset, uint64_t InitialLocation, uint64_t AddressRange, CIE *Cie, - Optional LSDAAddress) + Optional LSDAAddress, Triple::ArchType Arch) : FrameEntry(FK_FDE, Offset, Length, Cie ? Cie->getCodeAlignmentFactor() : 0, - Cie ? Cie->getDataAlignmentFactor() : 0), + Cie ? Cie->getDataAlignmentFactor() : 0, + Arch), LinkedCIEOffset(LinkedCIEOffset), InitialLocation(InitialLocation), AddressRange(AddressRange), LinkedCIE(Cie), LSDAAddress(LSDAAddress) {} @@ -256,6 +262,7 @@ private: /// A parsed .debug_frame or .eh_frame section class DWARFDebugFrame { + const Triple::ArchType Arch; // True if this is parsing an eh_frame section. const bool IsEH; // Not zero for sane pointer values coming out of eh_frame @@ -272,7 +279,8 @@ public: // it is a .debug_frame section. EHFrameAddress should be different // than zero for correct parsing of .eh_frame addresses when they // use a PC-relative encoding. - DWARFDebugFrame(bool IsEH = false, uint64_t EHFrameAddress = 0); + DWARFDebugFrame(Triple::ArchType Arch, + bool IsEH = false, uint64_t EHFrameAddress = 0); ~DWARFDebugFrame(); /// Dump the section data into the given stream. diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h b/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h index cae4804e61d3b3c8dbd16a374ae29d7bc7c7e514..9e1656eb1615c4336671c5dde0512aa409a0b4cf 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h @@ -13,6 +13,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DWARF/DWARFObject.h" #include #include @@ -67,7 +68,8 @@ private: bool GnuStyle; public: - DWARFDebugPubTable(StringRef Data, bool LittleEndian, bool GnuStyle); + DWARFDebugPubTable(const DWARFObject &Obj, const DWARFSection &Sec, + bool LittleEndian, bool GnuStyle); void dump(raw_ostream &OS) const; diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h index baa47c2bfa580494c7bb1996bd1eca24dabbd662..56d46cd739a27108b5bc9419d1fb850694f6e651 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -180,6 +180,7 @@ public: /// \returns a valid DWARFDie instance if the attribute exists, or an invalid /// DWARFDie object if it doesn't. DWARFDie getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const; + DWARFDie getAttributeValueAsReferencedDie(const DWARFFormValue &V) const; /// Extract the range base attribute from this DIE as absolute section offset. /// diff --git a/include/llvm/DebugInfo/DWARF/DWARFObject.h b/include/llvm/DebugInfo/DWARF/DWARFObject.h index 5a808b0ec6a97ba13486475b9b8ba602e5a4dffa..d611b5d075c8ea13ab3194ae073b0ed7c63e4857 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFObject.h +++ b/include/llvm/DebugInfo/DWARF/DWARFObject.h @@ -49,10 +49,10 @@ public: virtual const DWARFSection &getRangeSection() const { return Dummy; } virtual const DWARFSection &getRnglistsSection() const { return Dummy; } virtual StringRef getMacinfoSection() const { return ""; } - virtual StringRef getPubNamesSection() const { return ""; } - virtual StringRef getPubTypesSection() const { return ""; } - virtual StringRef getGnuPubNamesSection() const { return ""; } - virtual StringRef getGnuPubTypesSection() const { return ""; } + virtual const DWARFSection &getPubNamesSection() const { return Dummy; } + virtual const DWARFSection &getPubTypesSection() const { return Dummy; } + virtual const DWARFSection &getGnuPubNamesSection() const { return Dummy; } + virtual const DWARFSection &getGnuPubTypesSection() const { return Dummy; } virtual const DWARFSection &getStringOffsetSection() const { return Dummy; } virtual void forEachInfoDWOSections(function_ref F) const {} diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h index 458278e4282fd777386ae0bc847d9976dcd51579..9909def1072355eb0959ba5e802f006c12b967d2 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -72,7 +72,8 @@ public: /// Parse a unit header from \p debug_info starting at \p offset_ptr. bool extract(DWARFContext &Context, const DWARFDataExtractor &debug_info, uint32_t *offset_ptr, DWARFSectionKind Kind = DW_SECT_INFO, - const DWARFUnitIndex *Index = nullptr); + const DWARFUnitIndex *Index = nullptr, + const DWARFUnitIndex::Entry *Entry = nullptr); uint32_t getOffset() const { return Offset; } const dwarf::FormParams &getFormParams() const { return FormParams; } uint16_t getVersion() const { return FormParams.Version; } @@ -108,7 +109,8 @@ const DWARFUnitIndex &getDWARFUnitIndex(DWARFContext &Context, /// .debug_info and .debug_types, or from .debug_info.dwo and .debug_types.dwo. class DWARFUnitVector final : public SmallVector, 1> { std::function(uint32_t, DWARFSectionKind, - const DWARFSection *)> + const DWARFSection *, + const DWARFUnitIndex::Entry *)> Parser; int NumInfoUnits = -1; diff --git a/include/llvm/DebugInfo/PDB/GenericError.h b/include/llvm/DebugInfo/PDB/GenericError.h index 7b5a85295963b78760b81d3dd8bd36d51da647fa..997f13f5f30ece7d53428948adff39c69460020e 100644 --- a/include/llvm/DebugInfo/PDB/GenericError.h +++ b/include/llvm/DebugInfo/PDB/GenericError.h @@ -21,6 +21,7 @@ enum class pdb_error_code { dia_sdk_not_present, dia_failed_loading, signature_out_of_date, + external_cmdline_ref, unspecified, }; } // namespace pdb diff --git a/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h b/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h index ce4d07917755c1d7eac87ad7c3e4dfdb0a23a7c7..ac7f741afefa7a5162bc067fc59c597592808850 100644 --- a/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h +++ b/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h @@ -51,6 +51,7 @@ public: void setObjFileName(StringRef Name); void setFirstSectionContrib(const SectionContrib &SC); void addSymbol(codeview::CVSymbol Symbol); + void addSymbolsInBulk(ArrayRef BulkSymbols); void addDebugSubsection(std::shared_ptr Subsection); @@ -91,7 +92,7 @@ private: std::string ModuleName; std::string ObjFileName; std::vector SourceFiles; - std::vector Symbols; + std::vector> Symbols; std::vector> C13Builders; diff --git a/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h index 1a4f89d607dffd9d55a66e1654a7e2f9eb17228d..4c39ca762b5b74f874068aa30457c39ec447f746 100644 --- a/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h +++ b/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h @@ -61,7 +61,6 @@ public: void addGlobalSymbol(const codeview::ProcRefSym &Sym); void addGlobalSymbol(const codeview::DataSym &Sym); void addGlobalSymbol(const codeview::ConstantSym &Sym); - void addGlobalSymbol(const codeview::UDTSym &Sym); void addGlobalSymbol(const codeview::CVSymbol &Sym); private: diff --git a/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h b/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h index 19b0ebdfac841b35fdcaba9db94ed57226d2a7fd..8d590df288f378d5010210beb2ccbd9820f0b39c 100644 --- a/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h +++ b/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h @@ -44,6 +44,8 @@ public: symbols(bool *HadError) const; const codeview::CVSymbolArray &getSymbolArray() const { return SymbolArray; } + const codeview::CVSymbolArray + getSymbolArrayForScope(uint32_t ScopeBegin) const; BinarySubstreamRef getSymbolsSubstream() const; BinarySubstreamRef getC11LinesSubstream() const; diff --git a/include/llvm/Demangle/Demangle.h b/include/llvm/Demangle/Demangle.h index b9b4d158e2ae7a39cc6b0e93ed9cca9088b4bd92..4c9dc9569e18835621ff6f924867bc734138dd12 100644 --- a/include/llvm/Demangle/Demangle.h +++ b/include/llvm/Demangle/Demangle.h @@ -31,11 +31,6 @@ enum : int { char *itaniumDemangle(const char *mangled_name, char *buf, size_t *n, int *status); -/// Calls the callback \c Callback with \c Ctx as an argument whenever a type is -/// encountered. Returns true if \c MangledName couldn't be parsed. -bool itaniumFindTypesInMangledName(const char *MangledName, void *Ctx, - void (*Callback)(void *, const char *)); - enum MSDemangleFlags { MSDF_None = 0, MSDF_DumpBackrefs = 1 << 0 }; char *microsoftDemangle(const char *mangled_name, char *buf, size_t *n, diff --git a/include/llvm/Demangle/ItaniumDemangle.h b/include/llvm/Demangle/ItaniumDemangle.h index c5619a15bbee9370355c1c9a48748d0ba72b28a3..0b9187f30a5a627d3c628fa59984d34820e25e1f 100644 --- a/include/llvm/Demangle/ItaniumDemangle.h +++ b/include/llvm/Demangle/ItaniumDemangle.h @@ -2157,9 +2157,6 @@ template struct AbstractManglingParser { // conversion operator's type, and are resolved in the enclosing . PODSmallVector ForwardTemplateRefs; - void (*TypeCallback)(void *, const char *) = nullptr; - void *TypeCallbackContext = nullptr; - bool TryToParseTemplateArgs = true; bool PermitForwardTemplateReferences = false; bool ParsingLambdaParams = false; @@ -3453,9 +3450,6 @@ template Node *AbstractManglingParser::parseType() { Node *Result = nullptr; - if (TypeCallback != nullptr) - TypeCallback(TypeCallbackContext, First); - switch (look()) { // ::= case 'r': diff --git a/include/llvm/Demangle/MicrosoftDemangleNodes.h b/include/llvm/Demangle/MicrosoftDemangleNodes.h index 1d0b66a7bf41c86b6a69fda67e0e9ce5eb574b4b..1eca6762475a0f502fcee27f39ffadd16648408a 100644 --- a/include/llvm/Demangle/MicrosoftDemangleNodes.h +++ b/include/llvm/Demangle/MicrosoftDemangleNodes.h @@ -235,6 +235,8 @@ struct Node { virtual void output(OutputStream &OS, OutputFlags Flags) const = 0; + std::string toString() const; + private: NodeKind Kind; }; diff --git a/include/llvm/Demangle/Utility.h b/include/llvm/Demangle/Utility.h index da4a0c050bc5c5b54f3d8052a806c6b8807552fb..1d1601c81635ab81969182fabab34644f0e8ae8a 100644 --- a/include/llvm/Demangle/Utility.h +++ b/include/llvm/Demangle/Utility.h @@ -175,13 +175,13 @@ inline bool initializeOutputStream(char *Buf, size_t *N, OutputStream &S, if (Buf == nullptr) { Buf = static_cast(std::malloc(InitSize)); if (Buf == nullptr) - return true; + return false; BufferSize = InitSize; } else BufferSize = *N; S.reset(Buf, BufferSize); - return false; + return true; } #endif diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h index 589ca612f0466198400112b9d7f8f52f16af5001..1b08379b8c3b119a3aadc6ecc3bf7d70f709c220 100644 --- a/include/llvm/ExecutionEngine/JITEventListener.h +++ b/include/llvm/ExecutionEngine/JITEventListener.h @@ -42,23 +42,26 @@ class ObjectFile; /// The default implementation of each method does nothing. class JITEventListener { public: + using ObjectKey = uint64_t; + JITEventListener() = default; virtual ~JITEventListener() = default; - /// NotifyObjectEmitted - Called after an object has been successfully - /// emitted to memory. NotifyFunctionEmitted will not be called for + /// notifyObjectLoaded - Called after an object has had its sections allocated + /// and addresses assigned to all symbols. Note: Section memory will not have + /// been relocated yet. notifyFunctionLoaded will not be called for /// individual functions in the object. /// /// ELF-specific information /// The ObjectImage contains the generated object image /// with section headers updated to reflect the address at which sections /// were loaded and with relocations performed in-place on debug sections. - virtual void NotifyObjectEmitted(const object::ObjectFile &Obj, - const RuntimeDyld::LoadedObjectInfo &L) {} + virtual void notifyObjectLoaded(ObjectKey K, const object::ObjectFile &Obj, + const RuntimeDyld::LoadedObjectInfo &L) {} - /// NotifyFreeingObject - Called just before the memory associated with + /// notifyFreeingObject - Called just before the memory associated with /// a previously emitted object is released. - virtual void NotifyFreeingObject(const object::ObjectFile &Obj) {} + virtual void notifyFreeingObject(ObjectKey K) {} // Get a pointe to the GDB debugger registration listener. static JITEventListener *createGDBRegistrationListener(); diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index 5aaaaf3c396bdc812ba2040fb5bde7d01a57ccdb..9fc4614af0105bff3b45d1e9eb9f812d629cbcd7 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -230,29 +230,33 @@ public: /// Add an argument attribute. Returns a new set because attribute sets are /// immutable. - AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const; + LLVM_NODISCARD AttributeSet addAttribute(LLVMContext &C, + Attribute::AttrKind Kind) const; /// Add a target-dependent attribute. Returns a new set because attribute sets /// are immutable. - AttributeSet addAttribute(LLVMContext &C, StringRef Kind, - StringRef Value = StringRef()) const; + LLVM_NODISCARD AttributeSet addAttribute(LLVMContext &C, StringRef Kind, + StringRef Value = StringRef()) const; /// Add attributes to the attribute set. Returns a new set because attribute /// sets are immutable. - AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const; + LLVM_NODISCARD AttributeSet addAttributes(LLVMContext &C, + AttributeSet AS) const; /// Remove the specified attribute from this set. Returns a new set because /// attribute sets are immutable. - AttributeSet removeAttribute(LLVMContext &C, Attribute::AttrKind Kind) const; + LLVM_NODISCARD AttributeSet removeAttribute(LLVMContext &C, + Attribute::AttrKind Kind) const; /// Remove the specified attribute from this set. Returns a new set because /// attribute sets are immutable. - AttributeSet removeAttribute(LLVMContext &C, StringRef Kind) const; + LLVM_NODISCARD AttributeSet removeAttribute(LLVMContext &C, + StringRef Kind) const; /// Remove the specified attributes from this set. Returns a new set because /// attribute sets are immutable. - AttributeSet removeAttributes(LLVMContext &C, - const AttrBuilder &AttrsToRemove) const; + LLVM_NODISCARD AttributeSet + removeAttributes(LLVMContext &C, const AttrBuilder &AttrsToRemove) const; /// Return the number of attributes in this set. unsigned getNumAttributes() const; @@ -375,133 +379,140 @@ public: /// Add an attribute to the attribute set at the given index. /// Returns a new list because attribute lists are immutable. - AttributeList addAttribute(LLVMContext &C, unsigned Index, - Attribute::AttrKind Kind) const; + LLVM_NODISCARD AttributeList addAttribute(LLVMContext &C, unsigned Index, + Attribute::AttrKind Kind) const; /// Add an attribute to the attribute set at the given index. /// Returns a new list because attribute lists are immutable. - AttributeList addAttribute(LLVMContext &C, unsigned Index, StringRef Kind, - StringRef Value = StringRef()) const; + LLVM_NODISCARD AttributeList + addAttribute(LLVMContext &C, unsigned Index, StringRef Kind, + StringRef Value = StringRef()) const; /// Add an attribute to the attribute set at the given index. /// Returns a new list because attribute lists are immutable. - AttributeList addAttribute(LLVMContext &C, unsigned Index, Attribute A) const; + LLVM_NODISCARD AttributeList addAttribute(LLVMContext &C, unsigned Index, + Attribute A) const; /// Add attributes to the attribute set at the given index. /// Returns a new list because attribute lists are immutable. - AttributeList addAttributes(LLVMContext &C, unsigned Index, - const AttrBuilder &B) const; + LLVM_NODISCARD AttributeList addAttributes(LLVMContext &C, unsigned Index, + const AttrBuilder &B) const; /// Add an argument attribute to the list. Returns a new list because /// attribute lists are immutable. - AttributeList addParamAttribute(LLVMContext &C, unsigned ArgNo, - Attribute::AttrKind Kind) const { + LLVM_NODISCARD AttributeList addParamAttribute( + LLVMContext &C, unsigned ArgNo, Attribute::AttrKind Kind) const { return addAttribute(C, ArgNo + FirstArgIndex, Kind); } /// Add an argument attribute to the list. Returns a new list because /// attribute lists are immutable. - AttributeList addParamAttribute(LLVMContext &C, unsigned ArgNo, - StringRef Kind, - StringRef Value = StringRef()) const { + LLVM_NODISCARD AttributeList + addParamAttribute(LLVMContext &C, unsigned ArgNo, StringRef Kind, + StringRef Value = StringRef()) const { return addAttribute(C, ArgNo + FirstArgIndex, Kind, Value); } /// Add an attribute to the attribute list at the given arg indices. Returns a /// new list because attribute lists are immutable. - AttributeList addParamAttribute(LLVMContext &C, ArrayRef ArgNos, - Attribute A) const; + LLVM_NODISCARD AttributeList addParamAttribute(LLVMContext &C, + ArrayRef ArgNos, + Attribute A) const; /// Add an argument attribute to the list. Returns a new list because /// attribute lists are immutable. - AttributeList addParamAttributes(LLVMContext &C, unsigned ArgNo, - const AttrBuilder &B) const { + LLVM_NODISCARD AttributeList addParamAttributes(LLVMContext &C, + unsigned ArgNo, + const AttrBuilder &B) const { return addAttributes(C, ArgNo + FirstArgIndex, B); } /// Remove the specified attribute at the specified index from this /// attribute list. Returns a new list because attribute lists are immutable. - AttributeList removeAttribute(LLVMContext &C, unsigned Index, - Attribute::AttrKind Kind) const; + LLVM_NODISCARD AttributeList removeAttribute(LLVMContext &C, unsigned Index, + Attribute::AttrKind Kind) const; /// Remove the specified attribute at the specified index from this /// attribute list. Returns a new list because attribute lists are immutable. - AttributeList removeAttribute(LLVMContext &C, unsigned Index, - StringRef Kind) const; + LLVM_NODISCARD AttributeList removeAttribute(LLVMContext &C, unsigned Index, + StringRef Kind) const; /// Remove the specified attributes at the specified index from this /// attribute list. Returns a new list because attribute lists are immutable. - AttributeList removeAttributes(LLVMContext &C, unsigned Index, - const AttrBuilder &AttrsToRemove) const; + LLVM_NODISCARD AttributeList removeAttributes( + LLVMContext &C, unsigned Index, const AttrBuilder &AttrsToRemove) const; /// Remove all attributes at the specified index from this /// attribute list. Returns a new list because attribute lists are immutable. - AttributeList removeAttributes(LLVMContext &C, unsigned Index) const; + LLVM_NODISCARD AttributeList removeAttributes(LLVMContext &C, + unsigned Index) const; /// Remove the specified attribute at the specified arg index from this /// attribute list. Returns a new list because attribute lists are immutable. - AttributeList removeParamAttribute(LLVMContext &C, unsigned ArgNo, - Attribute::AttrKind Kind) const { + LLVM_NODISCARD AttributeList removeParamAttribute( + LLVMContext &C, unsigned ArgNo, Attribute::AttrKind Kind) const { return removeAttribute(C, ArgNo + FirstArgIndex, Kind); } /// Remove the specified attribute at the specified arg index from this /// attribute list. Returns a new list because attribute lists are immutable. - AttributeList removeParamAttribute(LLVMContext &C, unsigned ArgNo, - StringRef Kind) const { + LLVM_NODISCARD AttributeList removeParamAttribute(LLVMContext &C, + unsigned ArgNo, + StringRef Kind) const { return removeAttribute(C, ArgNo + FirstArgIndex, Kind); } /// Remove the specified attribute at the specified arg index from this /// attribute list. Returns a new list because attribute lists are immutable. - AttributeList removeParamAttributes(LLVMContext &C, unsigned ArgNo, - const AttrBuilder &AttrsToRemove) const { + LLVM_NODISCARD AttributeList removeParamAttributes( + LLVMContext &C, unsigned ArgNo, const AttrBuilder &AttrsToRemove) const { return removeAttributes(C, ArgNo + FirstArgIndex, AttrsToRemove); } /// Remove all attributes at the specified arg index from this /// attribute list. Returns a new list because attribute lists are immutable. - AttributeList removeParamAttributes(LLVMContext &C, unsigned ArgNo) const { + LLVM_NODISCARD AttributeList removeParamAttributes(LLVMContext &C, + unsigned ArgNo) const { return removeAttributes(C, ArgNo + FirstArgIndex); } /// \brief Add the dereferenceable attribute to the attribute set at the given /// index. Returns a new list because attribute lists are immutable. - AttributeList addDereferenceableAttr(LLVMContext &C, unsigned Index, - uint64_t Bytes) const; + LLVM_NODISCARD AttributeList addDereferenceableAttr(LLVMContext &C, + unsigned Index, + uint64_t Bytes) const; /// \brief Add the dereferenceable attribute to the attribute set at the given /// arg index. Returns a new list because attribute lists are immutable. - AttributeList addDereferenceableParamAttr(LLVMContext &C, unsigned ArgNo, - uint64_t Bytes) const { + LLVM_NODISCARD AttributeList addDereferenceableParamAttr( + LLVMContext &C, unsigned ArgNo, uint64_t Bytes) const { return addDereferenceableAttr(C, ArgNo + FirstArgIndex, Bytes); } /// Add the dereferenceable_or_null attribute to the attribute set at /// the given index. Returns a new list because attribute lists are immutable. - AttributeList addDereferenceableOrNullAttr(LLVMContext &C, unsigned Index, - uint64_t Bytes) const; + LLVM_NODISCARD AttributeList addDereferenceableOrNullAttr( + LLVMContext &C, unsigned Index, uint64_t Bytes) const; /// Add the dereferenceable_or_null attribute to the attribute set at /// the given arg index. Returns a new list because attribute lists are /// immutable. - AttributeList addDereferenceableOrNullParamAttr(LLVMContext &C, - unsigned ArgNo, - uint64_t Bytes) const { + LLVM_NODISCARD AttributeList addDereferenceableOrNullParamAttr( + LLVMContext &C, unsigned ArgNo, uint64_t Bytes) const { return addDereferenceableOrNullAttr(C, ArgNo + FirstArgIndex, Bytes); } /// Add the allocsize attribute to the attribute set at the given index. /// Returns a new list because attribute lists are immutable. - AttributeList addAllocSizeAttr(LLVMContext &C, unsigned Index, - unsigned ElemSizeArg, - const Optional &NumElemsArg); + LLVM_NODISCARD AttributeList + addAllocSizeAttr(LLVMContext &C, unsigned Index, unsigned ElemSizeArg, + const Optional &NumElemsArg); /// Add the allocsize attribute to the attribute set at the given arg index. /// Returns a new list because attribute lists are immutable. - AttributeList addAllocSizeParamAttr(LLVMContext &C, unsigned ArgNo, - unsigned ElemSizeArg, - const Optional &NumElemsArg) { + LLVM_NODISCARD AttributeList + addAllocSizeParamAttr(LLVMContext &C, unsigned ArgNo, unsigned ElemSizeArg, + const Optional &NumElemsArg) { return addAllocSizeAttr(C, ArgNo + FirstArgIndex, ElemSizeArg, NumElemsArg); } diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h index 7244bba1ca59e36f29b03645a3fc05a9ff2bce2c..99eac33f742ec2a7c8d0c5a7d87bba65bbd09098 100644 --- a/include/llvm/IR/BasicBlock.h +++ b/include/llvm/IR/BasicBlock.h @@ -237,6 +237,12 @@ public: static_cast(this)->getUniquePredecessor()); } + /// Return true if this block has exactly N predecessors. + bool hasNPredecessors(unsigned N) const; + + /// Return true if this block has N predecessors or more. + bool hasNPredecessorsOrMore(unsigned N) const; + /// Return the successor of this block if it has a single successor. /// Otherwise return a null pointer. /// diff --git a/include/llvm/IR/CFG.h b/include/llvm/IR/CFG.h index 4140c8a212e7eb1f300c1fc451e0eab1fd9f35ce..8385c4647e12c2bd65b40af07446d624c470ff5e 100644 --- a/include/llvm/IR/CFG.h +++ b/include/llvm/IR/CFG.h @@ -117,6 +117,8 @@ inline const_pred_iterator pred_end(const BasicBlock *BB) { inline bool pred_empty(const BasicBlock *BB) { return pred_begin(BB) == pred_end(BB); } +/// Get the number of predecessors of \p BB. This is a linear time operation. +/// Use \ref BasicBlock::hasNPredecessors() or hasNPredecessorsOrMore if able. inline unsigned pred_size(const BasicBlock *BB) { return std::distance(pred_begin(BB), pred_end(BB)); } diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h index 2162ccb982b007c94362dbfc969ae28a3dce20f7..a3e78049f4bef2c4971dc72d58ba9f36a6a874ec 100644 --- a/include/llvm/IR/CallSite.h +++ b/include/llvm/IR/CallSite.h @@ -656,10 +656,7 @@ public: private: IterTy getCallee() const { - if (isCall()) // Skip Callee - return cast(getInstruction())->op_end() - 1; - else // Skip BB, BB, Callee - return cast(getInstruction())->op_end() - 3; + return cast(getInstruction())->op_end() - 1; } }; diff --git a/include/llvm/IR/Constant.h b/include/llvm/IR/Constant.h index 5fdf0ea00f008d038698a883dbdf52c5692c69b2..98437f8eff1fa435109857fa5853942b3fe72135 100644 --- a/include/llvm/IR/Constant.h +++ b/include/llvm/IR/Constant.h @@ -114,7 +114,8 @@ public: /// For aggregates (struct/array/vector) return the constant that corresponds /// to the specified element if possible, or null if not. This can return null - /// if the element index is a ConstantExpr, or if 'this' is a constant expr. + /// if the element index is a ConstantExpr, if 'this' is a constant expr or + /// if the constant does not fit into an uint64_t. Constant *getAggregateElement(unsigned Elt) const; Constant *getAggregateElement(Constant *Elt) const; diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h index f9d5ebc560c70ac5f1f58e04a398b2dafaff8244..afc93cd61d474da770cb5948179bf45a6944f114 100644 --- a/include/llvm/IR/Constants.h +++ b/include/llvm/IR/Constants.h @@ -290,7 +290,11 @@ public: static Constant *get(Type* Ty, StringRef Str); static ConstantFP *get(LLVMContext &Context, const APFloat &V); - static Constant *getNaN(Type *Ty, bool Negative = false, unsigned type = 0); + static Constant *getNaN(Type *Ty, bool Negative = false, uint64_t Payload = 0); + static Constant *getQNaN(Type *Ty, bool Negative = false, + APInt *Payload = nullptr); + static Constant *getSNaN(Type *Ty, bool Negative = false, + APInt *Payload = nullptr); static Constant *getNegativeZero(Type *Ty); static Constant *getInfinity(Type *Ty, bool Negative = false); @@ -1114,6 +1118,13 @@ public: static Constant *getSelect(Constant *C, Constant *V1, Constant *V2, Type *OnlyIfReducedTy = nullptr); + /// get - Return a unary operator constant expression, + /// folding if possible. + /// + /// \param OnlyIfReducedTy see \a getWithOperands() docs. + static Constant *get(unsigned Opcode, Constant *C1, unsigned Flags = 0, + Type *OnlyIfReducedTy = nullptr); + /// get - Return a binary or shift operator constant expression, /// folding if possible. /// diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h index 16b759709b745602ba6653c20e2027fc6b223346..1fe7b36db00b5ef19d37232f20540ec0e8c44a41 100644 --- a/include/llvm/IR/DIBuilder.h +++ b/include/llvm/IR/DIBuilder.h @@ -145,7 +145,8 @@ namespace llvm { uint64_t DWOId = 0, bool SplitDebugInlining = true, bool DebugInfoForProfiling = false, DICompileUnit::DebugNameTableKind NameTableKind = - DICompileUnit::DebugNameTableKind::Default); + DICompileUnit::DebugNameTableKind::Default, + bool RangesBaseAddress = false); /// Create a file descriptor to hold debugging information for a file. /// \param Filename File name. @@ -652,29 +653,28 @@ namespace llvm { /// \param File File where this variable is defined. /// \param LineNo Line number. /// \param Ty Function type. - /// \param isLocalToUnit True if this function is not externally visible. - /// \param isDefinition True if this is a function definition. /// \param ScopeLine Set to the beginning of the scope this starts /// \param Flags e.g. is this function prototyped or not. /// These flags are used to emit dwarf attributes. - /// \param isOptimized True if optimization is ON. + /// \param SPFlags Additional flags specific to subprograms. /// \param TParams Function template parameters. /// \param ThrownTypes Exception types this function may throw. - DISubprogram *createFunction( - DIScope *Scope, StringRef Name, StringRef LinkageName, DIFile *File, - unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, - bool isDefinition, unsigned ScopeLine, - DINode::DIFlags Flags = DINode::FlagZero, bool isOptimized = false, - DITemplateParameterArray TParams = nullptr, - DISubprogram *Decl = nullptr, DITypeArray ThrownTypes = nullptr); + DISubprogram * + createFunction(DIScope *Scope, StringRef Name, StringRef LinkageName, + DIFile *File, unsigned LineNo, DISubroutineType *Ty, + unsigned ScopeLine, DINode::DIFlags Flags = DINode::FlagZero, + DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagZero, + DITemplateParameterArray TParams = nullptr, + DISubprogram *Decl = nullptr, + DITypeArray ThrownTypes = nullptr); /// Identical to createFunction, /// except that the resulting DbgNode is meant to be RAUWed. DISubprogram *createTempFunctionFwdDecl( DIScope *Scope, StringRef Name, StringRef LinkageName, DIFile *File, - unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, - bool isDefinition, unsigned ScopeLine, - DINode::DIFlags Flags = DINode::FlagZero, bool isOptimized = false, + unsigned LineNo, DISubroutineType *Ty, unsigned ScopeLine, + DINode::DIFlags Flags = DINode::FlagZero, + DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagZero, DITemplateParameterArray TParams = nullptr, DISubprogram *Decl = nullptr, DITypeArray ThrownTypes = nullptr); @@ -686,10 +686,6 @@ namespace llvm { /// \param File File where this variable is defined. /// \param LineNo Line number. /// \param Ty Function type. - /// \param isLocalToUnit True if this function is not externally visible.. - /// \param isDefinition True if this is a function definition. - /// \param Virtuality Attributes describing virtualness. e.g. pure - /// virtual function. /// \param VTableIndex Index no of this method in virtual table, or -1u if /// unrepresentable. /// \param ThisAdjustment @@ -698,17 +694,18 @@ namespace llvm { /// \param VTableHolder Type that holds vtable. /// \param Flags e.g. is this function prototyped or not. /// This flags are used to emit dwarf attributes. - /// \param isOptimized True if optimization is ON. + /// \param SPFlags Additional flags specific to subprograms. /// \param TParams Function template parameters. /// \param ThrownTypes Exception types this function may throw. - DISubprogram *createMethod( - DIScope *Scope, StringRef Name, StringRef LinkageName, DIFile *File, - unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, - bool isDefinition, unsigned Virtuality = 0, unsigned VTableIndex = 0, - int ThisAdjustment = 0, DIType *VTableHolder = nullptr, - DINode::DIFlags Flags = DINode::FlagZero, bool isOptimized = false, - DITemplateParameterArray TParams = nullptr, - DITypeArray ThrownTypes = nullptr); + DISubprogram * + createMethod(DIScope *Scope, StringRef Name, StringRef LinkageName, + DIFile *File, unsigned LineNo, DISubroutineType *Ty, + unsigned VTableIndex = 0, int ThisAdjustment = 0, + DIType *VTableHolder = nullptr, + DINode::DIFlags Flags = DINode::FlagZero, + DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagZero, + DITemplateParameterArray TParams = nullptr, + DITypeArray ThrownTypes = nullptr); /// This creates new descriptor for a namespace with the specified /// parent scope. diff --git a/include/llvm/IR/DebugInfoFlags.def b/include/llvm/IR/DebugInfoFlags.def index a3c3430768f40df9dfbc83f9245ded8550515917..a0a9c6bcacb8639a46fd0867f0bec11272236d07 100644 --- a/include/llvm/IR/DebugInfoFlags.def +++ b/include/llvm/IR/DebugInfoFlags.def @@ -11,11 +11,20 @@ // //===----------------------------------------------------------------------===// -// TODO: Add other DW-based macros. +#if !(defined HANDLE_DI_FLAG || defined HANDLE_DISP_FLAG) +#error "Missing macro definition of HANDLE_DI*" +#endif + #ifndef HANDLE_DI_FLAG -#error "Missing macro definition of HANDLE_DI_FLAG" +#define HANDLE_DI_FLAG(ID, NAME) #endif +#ifndef HANDLE_DISP_FLAG +#define HANDLE_DISP_FLAG(ID, NAME) +#endif + +// General flags kept in DINode. + HANDLE_DI_FLAG(0, Zero) // Use it as zero value. // For example: void foo(DIFlags Flags = FlagZero). HANDLE_DI_FLAG(1, Private) @@ -64,4 +73,25 @@ HANDLE_DI_FLAG((1 << 29), Largest) #undef DI_FLAG_LARGEST_NEEDED #endif +// Subprogram-specific flags kept in DISubprogram. + +// Use this as a zero/initialization value. +// For example: void foo(DISPFlags Flags = SPFlagZero). +HANDLE_DISP_FLAG(0, Zero) +// Virtuality is a two-bit enum field in the LSB of the word. +// Values should match DW_VIRTUALITY_*. +HANDLE_DISP_FLAG(1u, Virtual) +HANDLE_DISP_FLAG(2u, PureVirtual) +HANDLE_DISP_FLAG((1u << 2), LocalToUnit) +HANDLE_DISP_FLAG((1u << 3), Definition) +HANDLE_DISP_FLAG((1u << 4), Optimized) + +#ifdef DISP_FLAG_LARGEST_NEEDED +// Intended to be used with ADT/BitmaskEnum.h. +// NOTE: Always must be equal to largest flag, check this when adding new flags. +HANDLE_DISP_FLAG((1 << 4), Largest) +#undef DISP_FLAG_LARGEST_NEEDED +#endif + #undef HANDLE_DI_FLAG +#undef HANDLE_DISP_FLAG diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h index 2a9181e17c6bad57f2ec09bf64a17dc7a55e29dd..0830b5ccbeca6327f41da041d717c0ffcda9c53f 100644 --- a/include/llvm/IR/DebugInfoMetadata.h +++ b/include/llvm/IR/DebugInfoMetadata.h @@ -1192,18 +1192,19 @@ private: bool SplitDebugInlining; bool DebugInfoForProfiling; unsigned NameTableKind; + bool RangesBaseAddress; DICompileUnit(LLVMContext &C, StorageType Storage, unsigned SourceLanguage, bool IsOptimized, unsigned RuntimeVersion, unsigned EmissionKind, uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling, unsigned NameTableKind, - ArrayRef Ops) + bool RangesBaseAddress, ArrayRef Ops) : DIScope(C, DICompileUnitKind, Storage, dwarf::DW_TAG_compile_unit, Ops), SourceLanguage(SourceLanguage), IsOptimized(IsOptimized), RuntimeVersion(RuntimeVersion), EmissionKind(EmissionKind), DWOId(DWOId), SplitDebugInlining(SplitDebugInlining), DebugInfoForProfiling(DebugInfoForProfiling), - NameTableKind(NameTableKind) { + NameTableKind(NameTableKind), RangesBaseAddress(RangesBaseAddress) { assert(Storage != Uniqued); } ~DICompileUnit() = default; @@ -1217,15 +1218,16 @@ private: DIGlobalVariableExpressionArray GlobalVariables, DIImportedEntityArray ImportedEntities, DIMacroNodeArray Macros, uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling, - unsigned NameTableKind, StorageType Storage, + unsigned NameTableKind, bool RangesBaseAddress, StorageType Storage, bool ShouldCreate = true) { - return getImpl( - Context, SourceLanguage, File, getCanonicalMDString(Context, Producer), - IsOptimized, getCanonicalMDString(Context, Flags), RuntimeVersion, - getCanonicalMDString(Context, SplitDebugFilename), EmissionKind, - EnumTypes.get(), RetainedTypes.get(), GlobalVariables.get(), - ImportedEntities.get(), Macros.get(), DWOId, SplitDebugInlining, - DebugInfoForProfiling, NameTableKind, Storage, ShouldCreate); + return getImpl(Context, SourceLanguage, File, + getCanonicalMDString(Context, Producer), IsOptimized, + getCanonicalMDString(Context, Flags), RuntimeVersion, + getCanonicalMDString(Context, SplitDebugFilename), + EmissionKind, EnumTypes.get(), RetainedTypes.get(), + GlobalVariables.get(), ImportedEntities.get(), Macros.get(), + DWOId, SplitDebugInlining, DebugInfoForProfiling, + NameTableKind, RangesBaseAddress, Storage, ShouldCreate); } static DICompileUnit * getImpl(LLVMContext &Context, unsigned SourceLanguage, Metadata *File, @@ -1235,16 +1237,16 @@ private: Metadata *GlobalVariables, Metadata *ImportedEntities, Metadata *Macros, uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling, unsigned NameTableKind, - StorageType Storage, bool ShouldCreate = true); + bool RangesBaseAddress, StorageType Storage, bool ShouldCreate = true); TempDICompileUnit cloneImpl() const { - return getTemporary(getContext(), getSourceLanguage(), getFile(), - getProducer(), isOptimized(), getFlags(), - getRuntimeVersion(), getSplitDebugFilename(), - getEmissionKind(), getEnumTypes(), getRetainedTypes(), - getGlobalVariables(), getImportedEntities(), - getMacros(), DWOId, getSplitDebugInlining(), - getDebugInfoForProfiling(), getNameTableKind()); + return getTemporary( + getContext(), getSourceLanguage(), getFile(), getProducer(), + isOptimized(), getFlags(), getRuntimeVersion(), getSplitDebugFilename(), + getEmissionKind(), getEnumTypes(), getRetainedTypes(), + getGlobalVariables(), getImportedEntities(), getMacros(), DWOId, + getSplitDebugInlining(), getDebugInfoForProfiling(), getNameTableKind(), + getRangesBaseAddress()); } public: @@ -1260,11 +1262,11 @@ public: DIGlobalVariableExpressionArray GlobalVariables, DIImportedEntityArray ImportedEntities, DIMacroNodeArray Macros, uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling, - DebugNameTableKind NameTableKind), + DebugNameTableKind NameTableKind, bool RangesBaseAddress), (SourceLanguage, File, Producer, IsOptimized, Flags, RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes, RetainedTypes, GlobalVariables, ImportedEntities, Macros, DWOId, SplitDebugInlining, - DebugInfoForProfiling, (unsigned)NameTableKind)) + DebugInfoForProfiling, (unsigned)NameTableKind, RangesBaseAddress)) DEFINE_MDNODE_GET_DISTINCT_TEMPORARY( DICompileUnit, (unsigned SourceLanguage, Metadata *File, MDString *Producer, @@ -1273,11 +1275,11 @@ public: Metadata *RetainedTypes, Metadata *GlobalVariables, Metadata *ImportedEntities, Metadata *Macros, uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling, - unsigned NameTableKind), + unsigned NameTableKind, bool RangesBaseAddress), (SourceLanguage, File, Producer, IsOptimized, Flags, RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes, RetainedTypes, GlobalVariables, ImportedEntities, Macros, DWOId, SplitDebugInlining, - DebugInfoForProfiling, NameTableKind)) + DebugInfoForProfiling, NameTableKind, RangesBaseAddress)) TempDICompileUnit clone() const { return cloneImpl(); } @@ -1294,9 +1296,14 @@ public: DebugNameTableKind getNameTableKind() const { return (DebugNameTableKind)NameTableKind; } - StringRef getProducer() const { return getStringOperand(1); } - StringRef getFlags() const { return getStringOperand(2); } - StringRef getSplitDebugFilename() const { return getStringOperand(3); } + bool getRangesBaseAddress() const { + return RangesBaseAddress; } + StringRef getProducer() const { + return getStringOperand(1); } + StringRef getFlags() const { + return getStringOperand(2); } + StringRef getSplitDebugFilename() const { + return getStringOperand(3); } DICompositeTypeArray getEnumTypes() const { return cast_or_null(getRawEnumTypes()); } @@ -1607,102 +1614,118 @@ class DISubprogram : public DILocalScope { /// negative. int ThisAdjustment; - // Virtuality can only assume three values, so we can pack - // in 2 bits (none/pure/pure_virtual). - unsigned Virtuality : 2; +public: + /// Debug info subprogram flags. + enum DISPFlags : uint32_t { +#define HANDLE_DISP_FLAG(ID, NAME) SPFlag##NAME = ID, +#define DISP_FLAG_LARGEST_NEEDED +#include "llvm/IR/DebugInfoFlags.def" + SPFlagNonvirtual = SPFlagZero, + SPFlagVirtuality = SPFlagVirtual | SPFlagPureVirtual, + LLVM_MARK_AS_BITMASK_ENUM(SPFlagLargest) + }; - // These are boolean flags so one bit is enough. - // MSVC starts a new container field every time the base - // type changes so we can't use 'bool' to ensure these bits - // are packed. - unsigned IsLocalToUnit : 1; - unsigned IsDefinition : 1; - unsigned IsOptimized : 1; + static DISPFlags getFlag(StringRef Flag); + static StringRef getFlagString(DISPFlags Flag); - unsigned Padding : 3; + /// Split up a flags bitfield for easier printing. + /// + /// Split \c Flags into \c SplitFlags, a vector of its components. Returns + /// any remaining (unrecognized) bits. + static DISPFlags splitFlags(DISPFlags Flags, + SmallVectorImpl &SplitFlags); + + // Helper for converting old bitfields to new flags word. + static DISPFlags toSPFlags(bool IsLocalToUnit, bool IsDefinition, + bool IsOptimized, + unsigned Virtuality = SPFlagNonvirtual) { + // We're assuming virtuality is the low-order field. + static_assert( + int(SPFlagVirtual) == int(dwarf::DW_VIRTUALITY_virtual) && + int(SPFlagPureVirtual) == int(dwarf::DW_VIRTUALITY_pure_virtual), + "Virtuality constant mismatch"); + return static_cast( + (Virtuality & SPFlagVirtuality) | + (IsLocalToUnit ? SPFlagLocalToUnit : SPFlagZero) | + (IsDefinition ? SPFlagDefinition : SPFlagZero) | + (IsOptimized ? SPFlagOptimized : SPFlagZero)); + } +private: DIFlags Flags; + DISPFlags SPFlags; DISubprogram(LLVMContext &C, StorageType Storage, unsigned Line, - unsigned ScopeLine, unsigned Virtuality, unsigned VirtualIndex, - int ThisAdjustment, DIFlags Flags, bool IsLocalToUnit, - bool IsDefinition, bool IsOptimized, ArrayRef Ops) + unsigned ScopeLine, unsigned VirtualIndex, int ThisAdjustment, + DIFlags Flags, DISPFlags SPFlags, ArrayRef Ops) : DILocalScope(C, DISubprogramKind, Storage, dwarf::DW_TAG_subprogram, Ops), Line(Line), ScopeLine(ScopeLine), VirtualIndex(VirtualIndex), - ThisAdjustment(ThisAdjustment), Virtuality(Virtuality), - IsLocalToUnit(IsLocalToUnit), IsDefinition(IsDefinition), - IsOptimized(IsOptimized), Flags(Flags) { + ThisAdjustment(ThisAdjustment), Flags(Flags), SPFlags(SPFlags) { static_assert(dwarf::DW_VIRTUALITY_max < 4, "Virtuality out of range"); - assert(Virtuality < 4 && "Virtuality out of range"); } ~DISubprogram() = default; static DISubprogram * getImpl(LLVMContext &Context, DIScopeRef Scope, StringRef Name, StringRef LinkageName, DIFile *File, unsigned Line, - DISubroutineType *Type, bool IsLocalToUnit, bool IsDefinition, - unsigned ScopeLine, DITypeRef ContainingType, unsigned Virtuality, + DISubroutineType *Type, unsigned ScopeLine, DITypeRef ContainingType, unsigned VirtualIndex, int ThisAdjustment, DIFlags Flags, - bool IsOptimized, DICompileUnit *Unit, + DISPFlags SPFlags, DICompileUnit *Unit, DITemplateParameterArray TemplateParams, DISubprogram *Declaration, DINodeArray RetainedNodes, DITypeArray ThrownTypes, StorageType Storage, bool ShouldCreate = true) { return getImpl(Context, Scope, getCanonicalMDString(Context, Name), getCanonicalMDString(Context, LinkageName), File, Line, Type, - IsLocalToUnit, IsDefinition, ScopeLine, ContainingType, - Virtuality, VirtualIndex, ThisAdjustment, Flags, IsOptimized, - Unit, TemplateParams.get(), Declaration, RetainedNodes.get(), - ThrownTypes.get(), Storage, ShouldCreate); + ScopeLine, ContainingType, VirtualIndex, ThisAdjustment, + Flags, SPFlags, Unit, TemplateParams.get(), Declaration, + RetainedNodes.get(), ThrownTypes.get(), Storage, + ShouldCreate); } - static DISubprogram * - getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, - MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type, - bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine, - Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex, - int ThisAdjustment, DIFlags Flags, bool IsOptimized, Metadata *Unit, - Metadata *TemplateParams, Metadata *Declaration, Metadata *RetainedNodes, - Metadata *ThrownTypes, StorageType Storage, bool ShouldCreate = true); + static DISubprogram *getImpl(LLVMContext &Context, Metadata *Scope, + MDString *Name, MDString *LinkageName, + Metadata *File, unsigned Line, Metadata *Type, + unsigned ScopeLine, Metadata *ContainingType, + unsigned VirtualIndex, int ThisAdjustment, + DIFlags Flags, DISPFlags SPFlags, Metadata *Unit, + Metadata *TemplateParams, Metadata *Declaration, + Metadata *RetainedNodes, Metadata *ThrownTypes, + StorageType Storage, bool ShouldCreate = true); TempDISubprogram cloneImpl() const { return getTemporary(getContext(), getScope(), getName(), getLinkageName(), - getFile(), getLine(), getType(), isLocalToUnit(), - isDefinition(), getScopeLine(), getContainingType(), - getVirtuality(), getVirtualIndex(), getThisAdjustment(), - getFlags(), isOptimized(), getUnit(), - getTemplateParams(), getDeclaration(), getRetainedNodes(), - getThrownTypes()); + getFile(), getLine(), getType(), getScopeLine(), + getContainingType(), getVirtualIndex(), + getThisAdjustment(), getFlags(), getSPFlags(), + getUnit(), getTemplateParams(), getDeclaration(), + getRetainedNodes(), getThrownTypes()); } public: - DEFINE_MDNODE_GET(DISubprogram, - (DIScopeRef Scope, StringRef Name, StringRef LinkageName, - DIFile *File, unsigned Line, DISubroutineType *Type, - bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine, - DITypeRef ContainingType, unsigned Virtuality, - unsigned VirtualIndex, int ThisAdjustment, DIFlags Flags, - bool IsOptimized, DICompileUnit *Unit, - DITemplateParameterArray TemplateParams = nullptr, - DISubprogram *Declaration = nullptr, - DINodeArray RetainedNodes = nullptr, - DITypeArray ThrownTypes = nullptr), - (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, - IsDefinition, ScopeLine, ContainingType, Virtuality, - VirtualIndex, ThisAdjustment, Flags, IsOptimized, Unit, - TemplateParams, Declaration, RetainedNodes, ThrownTypes)) + DEFINE_MDNODE_GET( + DISubprogram, + (DIScopeRef Scope, StringRef Name, StringRef LinkageName, DIFile *File, + unsigned Line, DISubroutineType *Type, unsigned ScopeLine, + DITypeRef ContainingType, unsigned VirtualIndex, int ThisAdjustment, + DIFlags Flags, DISPFlags SPFlags, DICompileUnit *Unit, + DITemplateParameterArray TemplateParams = nullptr, + DISubprogram *Declaration = nullptr, DINodeArray RetainedNodes = nullptr, + DITypeArray ThrownTypes = nullptr), + (Scope, Name, LinkageName, File, Line, Type, ScopeLine, ContainingType, + VirtualIndex, ThisAdjustment, Flags, SPFlags, Unit, TemplateParams, + Declaration, RetainedNodes, ThrownTypes)) + DEFINE_MDNODE_GET( DISubprogram, (Metadata * Scope, MDString *Name, MDString *LinkageName, Metadata *File, - unsigned Line, Metadata *Type, bool IsLocalToUnit, bool IsDefinition, - unsigned ScopeLine, Metadata *ContainingType, unsigned Virtuality, - unsigned VirtualIndex, int ThisAdjustment, DIFlags Flags, - bool IsOptimized, Metadata *Unit, Metadata *TemplateParams = nullptr, - Metadata *Declaration = nullptr, Metadata *RetainedNodes = nullptr, - Metadata *ThrownTypes = nullptr), - (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition, - ScopeLine, ContainingType, Virtuality, VirtualIndex, ThisAdjustment, - Flags, IsOptimized, Unit, TemplateParams, Declaration, RetainedNodes, - ThrownTypes)) + unsigned Line, Metadata *Type, unsigned ScopeLine, + Metadata *ContainingType, unsigned VirtualIndex, int ThisAdjustment, + DIFlags Flags, DISPFlags SPFlags, Metadata *Unit, + Metadata *TemplateParams = nullptr, Metadata *Declaration = nullptr, + Metadata *RetainedNodes = nullptr, Metadata *ThrownTypes = nullptr), + (Scope, Name, LinkageName, File, Line, Type, ScopeLine, ContainingType, + VirtualIndex, ThisAdjustment, Flags, SPFlags, Unit, TemplateParams, + Declaration, RetainedNodes, ThrownTypes)) TempDISubprogram clone() const { return cloneImpl(); } @@ -1715,14 +1738,15 @@ public: public: unsigned getLine() const { return Line; } - unsigned getVirtuality() const { return Virtuality; } + unsigned getVirtuality() const { return getSPFlags() & SPFlagVirtuality; } unsigned getVirtualIndex() const { return VirtualIndex; } int getThisAdjustment() const { return ThisAdjustment; } unsigned getScopeLine() const { return ScopeLine; } DIFlags getFlags() const { return Flags; } - bool isLocalToUnit() const { return IsLocalToUnit; } - bool isDefinition() const { return IsDefinition; } - bool isOptimized() const { return IsOptimized; } + DISPFlags getSPFlags() const { return SPFlags; } + bool isLocalToUnit() const { return getSPFlags() & SPFlagLocalToUnit; } + bool isDefinition() const { return getSPFlags() & SPFlagDefinition; } + bool isOptimized() const { return getSPFlags() & SPFlagOptimized; } bool isArtificial() const { return getFlags() & FlagArtificial; } bool isPrivate() const { diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h index b8fdae2fcadbeb88b05f34dd6171deed3e2ab39c..b5ed2c7d5097812d4fc8e2640fcc96442f788624 100644 --- a/include/llvm/IR/DiagnosticInfo.h +++ b/include/llvm/IR/DiagnosticInfo.h @@ -340,7 +340,7 @@ private: }; class DiagnosticLocation { - StringRef Filename; + DIFile *File = nullptr; unsigned Line = 0; unsigned Column = 0; @@ -349,8 +349,11 @@ public: DiagnosticLocation(const DebugLoc &DL); DiagnosticLocation(const DISubprogram *SP); - bool isValid() const { return !Filename.empty(); } - StringRef getFilename() const { return Filename; } + bool isValid() const { return File; } + /// Return the full path to the file. + std::string getAbsolutePath() const; + /// Return the file name relative to the compilation directory. + StringRef getRelativePath() const; unsigned getLine() const { return Line; } unsigned getColumn() const { return Column; } }; @@ -375,9 +378,13 @@ public: const std::string getLocationStr() const; /// Return location information for this diagnostic in three parts: - /// the source file name, line number and column. - void getLocation(StringRef *Filename, unsigned *Line, unsigned *Column) const; + /// the relative source file path, line number and column. + void getLocation(StringRef &RelativePath, unsigned &Line, + unsigned &Column) const; + /// Return the absolute path tot the file. + std::string getAbsolutePath() const; + const Function &getFunction() const { return Fn; } DiagnosticLocation getLocation() const { return Loc; } diff --git a/include/llvm/IR/InstVisitor.h b/include/llvm/IR/InstVisitor.h index 554417f984aa5cc1e1d0d5b666a854fb346fd866..5e9b18d90a0cade0c32f7649472cfa21c73c91d6 100644 --- a/include/llvm/IR/InstVisitor.h +++ b/include/llvm/IR/InstVisitor.h @@ -263,6 +263,7 @@ public: // of instructions... // RetTy visitCastInst(CastInst &I) { DELEGATE(UnaryInstruction);} + RetTy visitUnaryOperator(UnaryOperator &I) { DELEGATE(UnaryInstruction);} RetTy visitBinaryOperator(BinaryOperator &I) { DELEGATE(Instruction);} RetTy visitCmpInst(CmpInst &I) { DELEGATE(Instruction);} RetTy visitUnaryInstruction(UnaryInstruction &I){ DELEGATE(Instruction);} diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h index f8d23c7f6140f2ef78278f94166cc17bb6e1ca2a..4611a61e0b034b5fc151c396a46f2a89880346c3 100644 --- a/include/llvm/IR/InstrTypes.h +++ b/include/llvm/IR/InstrTypes.h @@ -25,6 +25,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instruction.h" @@ -904,76 +905,6 @@ struct OperandTraits : public FixedNumOperandTraits { DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CmpInst, Value) -//===----------------------------------------------------------------------===// -// FuncletPadInst Class -//===----------------------------------------------------------------------===// -class FuncletPadInst : public Instruction { -private: - FuncletPadInst(const FuncletPadInst &CPI); - - explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, - ArrayRef Args, unsigned Values, - const Twine &NameStr, Instruction *InsertBefore); - explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, - ArrayRef Args, unsigned Values, - const Twine &NameStr, BasicBlock *InsertAtEnd); - - void init(Value *ParentPad, ArrayRef Args, const Twine &NameStr); - -protected: - // Note: Instruction needs to be a friend here to call cloneImpl. - friend class Instruction; - friend class CatchPadInst; - friend class CleanupPadInst; - - FuncletPadInst *cloneImpl() const; - -public: - /// Provide fast operand accessors - DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); - - /// getNumArgOperands - Return the number of funcletpad arguments. - /// - unsigned getNumArgOperands() const { return getNumOperands() - 1; } - - /// Convenience accessors - - /// Return the outer EH-pad this funclet is nested within. - /// - /// Note: This returns the associated CatchSwitchInst if this FuncletPadInst - /// is a CatchPadInst. - Value *getParentPad() const { return Op<-1>(); } - void setParentPad(Value *ParentPad) { - assert(ParentPad); - Op<-1>() = ParentPad; - } - - /// getArgOperand/setArgOperand - Return/set the i-th funcletpad argument. - /// - Value *getArgOperand(unsigned i) const { return getOperand(i); } - void setArgOperand(unsigned i, Value *v) { setOperand(i, v); } - - /// arg_operands - iteration adapter for range-for loops. - op_range arg_operands() { return op_range(op_begin(), op_end() - 1); } - - /// arg_operands - iteration adapter for range-for loops. - const_op_range arg_operands() const { - return const_op_range(op_begin(), op_end() - 1); - } - - // Methods for support type inquiry through isa, cast, and dyn_cast: - static bool classof(const Instruction *I) { return I->isFuncletPad(); } - static bool classof(const Value *V) { - return isa(V) && classof(cast(V)); - } -}; - -template <> -struct OperandTraits - : public VariadicOperandTraits {}; - -DEFINE_TRANSPARENT_OPERAND_ACCESSORS(FuncletPadInst, Value) - /// A lightweight accessor for an operand bundle meant to be passed /// around by value. struct OperandBundleUse { @@ -1058,54 +989,468 @@ public: using OperandBundleDef = OperandBundleDefT; using ConstOperandBundleDef = OperandBundleDefT; -/// A mixin to add operand bundle functionality to llvm instruction -/// classes. -/// -/// OperandBundleUser uses the descriptor area co-allocated with the host User -/// to store some meta information about which operands are "normal" operands, -/// and which ones belong to some operand bundle. -/// -/// The layout of an operand bundle user is -/// -/// +-----------uint32_t End-------------------------------------+ -/// | | -/// | +--------uint32_t Begin--------------------+ | -/// | | | | -/// ^ ^ v v -/// |------|------|----|----|----|----|----|---------|----|---------|----|----- -/// | BOI0 | BOI1 | .. | DU | U0 | U1 | .. | BOI0_U0 | .. | BOI1_U0 | .. | Un -/// |------|------|----|----|----|----|----|---------|----|---------|----|----- -/// v v ^ ^ -/// | | | | -/// | +--------uint32_t Begin------------+ | -/// | | -/// +-----------uint32_t End-----------------------------+ -/// -/// -/// BOI0, BOI1 ... are descriptions of operand bundles in this User's use list. -/// These descriptions are installed and managed by this class, and they're all -/// instances of OperandBundleUser::BundleOpInfo. -/// -/// DU is an additional descriptor installed by User's 'operator new' to keep -/// track of the 'BOI0 ... BOIN' co-allocation. OperandBundleUser does not -/// access or modify DU in any way, it's an implementation detail private to -/// User. -/// -/// The regular Use& vector for the User starts at U0. The operand bundle uses -/// are part of the Use& vector, just like normal uses. In the diagram above, -/// the operand bundle uses start at BOI0_U0. Each instance of BundleOpInfo has -/// information about a contiguous set of uses constituting an operand bundle, -/// and the total set of operand bundle uses themselves form a contiguous set of -/// uses (i.e. there are no gaps between uses corresponding to individual -/// operand bundles). +//===----------------------------------------------------------------------===// +// CallBase Class +//===----------------------------------------------------------------------===// + +/// Base class for all callable instructions (InvokeInst and CallInst) +/// Holds everything related to calling a function. /// -/// This class does not know the location of the set of operand bundle uses -/// within the use list -- that is decided by the User using this class via the -/// BeginIdx argument in populateBundleOperandInfos. +/// All call-like instructions are required to use a common operand layout: +/// - Zero or more arguments to the call, +/// - Zero or more operand bundles with zero or more operand inputs each +/// bundle, +/// - Zero or more subclass controlled operands +/// - The called function. /// -/// Currently operand bundle users with hung-off operands are not supported. -template class OperandBundleUser { +/// This allows this base class to easily access the called function and the +/// start of the arguments without knowing how many other operands a particular +/// subclass requires. Note that accessing the end of the argument list isn't +/// as cheap as most other operations on the base class. +class CallBase : public Instruction { +protected: + /// The last operand is the called operand. + static constexpr int CalledOperandOpEndIdx = -1; + + AttributeList Attrs; ///< parameter attributes for callable + FunctionType *FTy; + + template + CallBase(AttributeList const &A, FunctionType *FT, ArgsTy &&... Args) + : Instruction(std::forward(Args)...), Attrs(A), FTy(FT) {} + + using Instruction::Instruction; + + bool hasDescriptor() const { return Value::HasDescriptor; } + + unsigned getNumSubclassExtraOperands() const { + switch (getOpcode()) { + case Instruction::Call: + return 0; + case Instruction::Invoke: + return 2; + } + llvm_unreachable("Invalid opcode!"); + } + public: + using Instruction::getContext; + + static bool classof(const Instruction *I) { + return I->getOpcode() == Instruction::Call || + I->getOpcode() == Instruction::Invoke; + } + + FunctionType *getFunctionType() const { return FTy; } + + void mutateFunctionType(FunctionType *FTy) { + Value::mutateType(FTy->getReturnType()); + this->FTy = FTy; + } + + /// Return the iterator pointing to the beginning of the argument list. + User::op_iterator arg_begin() { return op_begin(); } + User::const_op_iterator arg_begin() const { + return const_cast(this)->arg_begin(); + } + + /// Return the iterator pointing to the end of the argument list. + User::op_iterator arg_end() { + // Walk from the end of the operands over the called operand, the subclass + // operands, and any operands for bundles to find the end of the argument + // operands. + return op_end() - getNumTotalBundleOperands() - + getNumSubclassExtraOperands() - 1; + } + User::const_op_iterator arg_end() const { + return const_cast(this)->arg_end(); + } + + /// Iteration adapter for range-for loops. + iterator_range arg_operands() { + return make_range(arg_begin(), arg_end()); + } + iterator_range arg_operands() const { + return make_range(arg_begin(), arg_end()); + } + + unsigned getNumArgOperands() const { return arg_end() - arg_begin(); } + + Value *getArgOperand(unsigned i) const { + assert(i < getNumArgOperands() && "Out of bounds!"); + return getOperand(i); + } + + void setArgOperand(unsigned i, Value *v) { + assert(i < getNumArgOperands() && "Out of bounds!"); + setOperand(i, v); + } + + /// Wrappers for getting the \c Use of a call argument. + const Use &getArgOperandUse(unsigned i) const { + assert(i < getNumArgOperands() && "Out of bounds!"); + return User::getOperandUse(i); + } + Use &getArgOperandUse(unsigned i) { + assert(i < getNumArgOperands() && "Out of bounds!"); + return User::getOperandUse(i); + } + + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + Value *getCalledOperand() const { return Op(); } + + // DEPRECATED: This routine will be removed in favor of `getCalledOperand` in + // the near future. + Value *getCalledValue() const { return getCalledOperand(); } + + const Use &getCalledOperandUse() const { return Op(); } + Use &getCalledOperandUse() { return Op(); } + + /// Returns the function called, or null if this is an + /// indirect function invocation. + Function *getCalledFunction() const { + return dyn_cast_or_null(getCalledOperand()); + } + + void setCalledOperand(Value *V) { Op() = V; } + + /// Sets the function called, including updating the function type. + void setCalledFunction(Value *Fn) { + setCalledFunction( + cast(cast(Fn->getType())->getElementType()), + Fn); + } + + /// Sets the function called, including updating to the specified function + /// type. + void setCalledFunction(FunctionType *FTy, Value *Fn) { + this->FTy = FTy; + assert(FTy == cast( + cast(Fn->getType())->getElementType())); + setCalledOperand(Fn); + } + + CallingConv::ID getCallingConv() const { + return static_cast(getSubclassDataFromInstruction() >> 2); + } + + void setCallingConv(CallingConv::ID CC) { + auto ID = static_cast(CC); + assert(!(ID & ~CallingConv::MaxID) && "Unsupported calling convention"); + setInstructionSubclassData((getSubclassDataFromInstruction() & 3) | + (ID << 2)); + } + + /// \name Attribute API + /// + /// These methods access and modify attributes on this call (including + /// looking through to the attributes on the called function when necessary). + ///@{ + + /// Return the parameter attributes for this call. + /// + AttributeList getAttributes() const { return Attrs; } + + /// Set the parameter attributes for this call. + /// + void setAttributes(AttributeList A) { Attrs = A; } + + /// Determine whether this call has the given attribute. + bool hasFnAttr(Attribute::AttrKind Kind) const { + assert(Kind != Attribute::NoBuiltin && + "Use CallBase::isNoBuiltin() to check for Attribute::NoBuiltin"); + return hasFnAttrImpl(Kind); + } + + /// Determine whether this call has the given attribute. + bool hasFnAttr(StringRef Kind) const { return hasFnAttrImpl(Kind); } + + /// adds the attribute to the list of attributes. + void addAttribute(unsigned i, Attribute::AttrKind Kind) { + AttributeList PAL = getAttributes(); + PAL = PAL.addAttribute(getContext(), i, Kind); + setAttributes(PAL); + } + + /// adds the attribute to the list of attributes. + void addAttribute(unsigned i, Attribute Attr) { + AttributeList PAL = getAttributes(); + PAL = PAL.addAttribute(getContext(), i, Attr); + setAttributes(PAL); + } + + /// Adds the attribute to the indicated argument + void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); + } + + /// Adds the attribute to the indicated argument + void addParamAttr(unsigned ArgNo, Attribute Attr) { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr); + setAttributes(PAL); + } + + /// removes the attribute from the list of attributes. + void removeAttribute(unsigned i, Attribute::AttrKind Kind) { + AttributeList PAL = getAttributes(); + PAL = PAL.removeAttribute(getContext(), i, Kind); + setAttributes(PAL); + } + + /// removes the attribute from the list of attributes. + void removeAttribute(unsigned i, StringRef Kind) { + AttributeList PAL = getAttributes(); + PAL = PAL.removeAttribute(getContext(), i, Kind); + setAttributes(PAL); + } + + /// Removes the attribute from the given argument + void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); + } + + /// Removes the attribute from the given argument + void removeParamAttr(unsigned ArgNo, StringRef Kind) { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); + } + + /// adds the dereferenceable attribute to the list of attributes. + void addDereferenceableAttr(unsigned i, uint64_t Bytes) { + AttributeList PAL = getAttributes(); + PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes); + setAttributes(PAL); + } + + /// adds the dereferenceable_or_null attribute to the list of + /// attributes. + void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) { + AttributeList PAL = getAttributes(); + PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes); + setAttributes(PAL); + } + + /// Determine whether the return value has the given attribute. + bool hasRetAttr(Attribute::AttrKind Kind) const; + + /// Determine whether the argument or parameter has the given attribute. + bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const; + + /// Get the attribute of a given kind at a position. + Attribute getAttribute(unsigned i, Attribute::AttrKind Kind) const { + return getAttributes().getAttribute(i, Kind); + } + + /// Get the attribute of a given kind at a position. + Attribute getAttribute(unsigned i, StringRef Kind) const { + return getAttributes().getAttribute(i, Kind); + } + + /// Get the attribute of a given kind from a given arg + Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + return getAttributes().getParamAttr(ArgNo, Kind); + } + + /// Get the attribute of a given kind from a given arg + Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + return getAttributes().getParamAttr(ArgNo, Kind); + } + + /// Return true if the data operand at index \p i has the attribute \p + /// A. + /// + /// Data operands include call arguments and values used in operand bundles, + /// but does not include the callee operand. This routine dispatches to the + /// underlying AttributeList or the OperandBundleUser as appropriate. + /// + /// The index \p i is interpreted as + /// + /// \p i == Attribute::ReturnIndex -> the return value + /// \p i in [1, arg_size + 1) -> argument number (\p i - 1) + /// \p i in [arg_size + 1, data_operand_size + 1) -> bundle operand at index + /// (\p i - 1) in the operand list. + bool dataOperandHasImpliedAttr(unsigned i, Attribute::AttrKind Kind) const { + // Note that we have to add one because `i` isn't zero-indexed. + assert(i < (getNumArgOperands() + getNumTotalBundleOperands() + 1) && + "Data operand index out of bounds!"); + + // The attribute A can either be directly specified, if the operand in + // question is a call argument; or be indirectly implied by the kind of its + // containing operand bundle, if the operand is a bundle operand. + + if (i == AttributeList::ReturnIndex) + return hasRetAttr(Kind); + + // FIXME: Avoid these i - 1 calculations and update the API to use + // zero-based indices. + if (i < (getNumArgOperands() + 1)) + return paramHasAttr(i - 1, Kind); + + assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) && + "Must be either a call argument or an operand bundle!"); + return bundleOperandHasAttr(i - 1, Kind); + } + + /// Extract the alignment of the return value. + unsigned getRetAlignment() const { return Attrs.getRetAlignment(); } + + /// Extract the alignment for a call or parameter (0=unknown). + unsigned getParamAlignment(unsigned ArgNo) const { + return Attrs.getParamAlignment(ArgNo); + } + + /// Extract the number of dereferenceable bytes for a call or + /// parameter (0=unknown). + uint64_t getDereferenceableBytes(unsigned i) const { + return Attrs.getDereferenceableBytes(i); + } + + /// Extract the number of dereferenceable_or_null bytes for a call or + /// parameter (0=unknown). + uint64_t getDereferenceableOrNullBytes(unsigned i) const { + return Attrs.getDereferenceableOrNullBytes(i); + } + + /// Determine if the return value is marked with NoAlias attribute. + bool returnDoesNotAlias() const { + return Attrs.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); + } + + /// If one of the arguments has the 'returned' attribute, returns its + /// operand value. Otherwise, return nullptr. + Value *getReturnedArgOperand() const; + + /// Return true if the call should not be treated as a call to a + /// builtin. + bool isNoBuiltin() const { + return hasFnAttrImpl(Attribute::NoBuiltin) && + !hasFnAttrImpl(Attribute::Builtin); + } + + /// Determine if the call requires strict floating point semantics. + bool isStrictFP() const { return hasFnAttr(Attribute::StrictFP); } + + /// Return true if the call should not be inlined. + bool isNoInline() const { return hasFnAttr(Attribute::NoInline); } + void setIsNoInline() { + addAttribute(AttributeList::FunctionIndex, Attribute::NoInline); + } + /// Determine if the call does not access memory. + bool doesNotAccessMemory() const { return hasFnAttr(Attribute::ReadNone); } + void setDoesNotAccessMemory() { + addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone); + } + + /// Determine if the call does not access or only reads memory. + bool onlyReadsMemory() const { + return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly); + } + void setOnlyReadsMemory() { + addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly); + } + + /// Determine if the call does not access or only writes memory. + bool doesNotReadMemory() const { + return doesNotAccessMemory() || hasFnAttr(Attribute::WriteOnly); + } + void setDoesNotReadMemory() { + addAttribute(AttributeList::FunctionIndex, Attribute::WriteOnly); + } + + /// Determine if the call can access memmory only using pointers based + /// on its arguments. + bool onlyAccessesArgMemory() const { + return hasFnAttr(Attribute::ArgMemOnly); + } + void setOnlyAccessesArgMemory() { + addAttribute(AttributeList::FunctionIndex, Attribute::ArgMemOnly); + } + + /// Determine if the function may only access memory that is + /// inaccessible from the IR. + bool onlyAccessesInaccessibleMemory() const { + return hasFnAttr(Attribute::InaccessibleMemOnly); + } + void setOnlyAccessesInaccessibleMemory() { + addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOnly); + } + + /// Determine if the function may only access memory that is + /// either inaccessible from the IR or pointed to by its arguments. + bool onlyAccessesInaccessibleMemOrArgMem() const { + return hasFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + } + void setOnlyAccessesInaccessibleMemOrArgMem() { + addAttribute(AttributeList::FunctionIndex, + Attribute::InaccessibleMemOrArgMemOnly); + } + /// Determine if the call cannot return. + bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); } + void setDoesNotReturn() { + addAttribute(AttributeList::FunctionIndex, Attribute::NoReturn); + } + + /// Determine if the call should not perform indirect branch tracking. + bool doesNoCfCheck() const { return hasFnAttr(Attribute::NoCfCheck); } + + /// Determine if the call cannot unwind. + bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); } + void setDoesNotThrow() { + addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind); + } + + /// Determine if the invoke cannot be duplicated. + bool cannotDuplicate() const { return hasFnAttr(Attribute::NoDuplicate); } + void setCannotDuplicate() { + addAttribute(AttributeList::FunctionIndex, Attribute::NoDuplicate); + } + + /// Determine if the invoke is convergent + bool isConvergent() const { return hasFnAttr(Attribute::Convergent); } + void setConvergent() { + addAttribute(AttributeList::FunctionIndex, Attribute::Convergent); + } + void setNotConvergent() { + removeAttribute(AttributeList::FunctionIndex, Attribute::Convergent); + } + + /// Determine if the call returns a structure through first + /// pointer argument. + bool hasStructRetAttr() const { + if (getNumArgOperands() == 0) + return false; + + // Be friendly and also check the callee. + return paramHasAttr(0, Attribute::StructRet); + } + + /// Determine if any call argument is an aggregate passed by value. + bool hasByValArgument() const { + return Attrs.hasAttrSomewhere(Attribute::ByVal); + } + + ///@{ + // End of attribute API. + + /// \name Operand Bundle API + /// + /// This group of methods provides the API to access and manipulate operand + /// bundles on this call. + /// @{ + /// Return the number of operand bundles associated with this User. unsigned getNumOperandBundles() const { return std::distance(bundle_op_info_begin(), bundle_op_info_end()); @@ -1261,8 +1606,7 @@ public: /// Return true if \p Other has the same sequence of operand bundle /// tags with the same number of operands on each one of them as this /// OperandBundleUser. - bool hasIdenticalOperandBundleSchema( - const OperandBundleUser &Other) const { + bool hasIdenticalOperandBundleSchema(const CallBase &Other) const { if (getNumOperandBundles() != Other.getNumOperandBundles()) return false; @@ -1281,7 +1625,6 @@ public: return false; } -protected: /// Is the function attribute S disallowed by some operand bundle on /// this operand bundle user? bool isFnAttrDisallowedByOpBundle(StringRef S) const { @@ -1340,8 +1683,8 @@ protected: /// OperandBundleUse. OperandBundleUse operandBundleFromBundleOpInfo(const BundleOpInfo &BOI) const { - auto op_begin = static_cast(this)->op_begin(); - ArrayRef Inputs(op_begin + BOI.Begin, op_begin + BOI.End); + auto begin = op_begin(); + ArrayRef Inputs(begin + BOI.Begin, begin + BOI.End); return OperandBundleUse(BOI.Tag, Inputs); } @@ -1350,37 +1693,79 @@ protected: /// Return the start of the list of BundleOpInfo instances associated /// with this OperandBundleUser. + /// + /// OperandBundleUser uses the descriptor area co-allocated with the host User + /// to store some meta information about which operands are "normal" operands, + /// and which ones belong to some operand bundle. + /// + /// The layout of an operand bundle user is + /// + /// +-----------uint32_t End-------------------------------------+ + /// | | + /// | +--------uint32_t Begin--------------------+ | + /// | | | | + /// ^ ^ v v + /// |------|------|----|----|----|----|----|---------|----|---------|----|----- + /// | BOI0 | BOI1 | .. | DU | U0 | U1 | .. | BOI0_U0 | .. | BOI1_U0 | .. | Un + /// |------|------|----|----|----|----|----|---------|----|---------|----|----- + /// v v ^ ^ + /// | | | | + /// | +--------uint32_t Begin------------+ | + /// | | + /// +-----------uint32_t End-----------------------------+ + /// + /// + /// BOI0, BOI1 ... are descriptions of operand bundles in this User's use + /// list. These descriptions are installed and managed by this class, and + /// they're all instances of OperandBundleUser::BundleOpInfo. + /// + /// DU is an additional descriptor installed by User's 'operator new' to keep + /// track of the 'BOI0 ... BOIN' co-allocation. OperandBundleUser does not + /// access or modify DU in any way, it's an implementation detail private to + /// User. + /// + /// The regular Use& vector for the User starts at U0. The operand bundle + /// uses are part of the Use& vector, just like normal uses. In the diagram + /// above, the operand bundle uses start at BOI0_U0. Each instance of + /// BundleOpInfo has information about a contiguous set of uses constituting + /// an operand bundle, and the total set of operand bundle uses themselves + /// form a contiguous set of uses (i.e. there are no gaps between uses + /// corresponding to individual operand bundles). + /// + /// This class does not know the location of the set of operand bundle uses + /// within the use list -- that is decided by the User using this class via + /// the BeginIdx argument in populateBundleOperandInfos. + /// + /// Currently operand bundle users with hung-off operands are not supported. bundle_op_iterator bundle_op_info_begin() { - if (!static_cast(this)->hasDescriptor()) + if (!hasDescriptor()) return nullptr; - uint8_t *BytesBegin = static_cast(this)->getDescriptor().begin(); + uint8_t *BytesBegin = getDescriptor().begin(); return reinterpret_cast(BytesBegin); } /// Return the start of the list of BundleOpInfo instances associated /// with this OperandBundleUser. const_bundle_op_iterator bundle_op_info_begin() const { - auto *NonConstThis = - const_cast *>(this); + auto *NonConstThis = const_cast(this); return NonConstThis->bundle_op_info_begin(); } /// Return the end of the list of BundleOpInfo instances associated /// with this OperandBundleUser. bundle_op_iterator bundle_op_info_end() { - if (!static_cast(this)->hasDescriptor()) + if (!hasDescriptor()) return nullptr; - uint8_t *BytesEnd = static_cast(this)->getDescriptor().end(); + uint8_t *BytesEnd = getDescriptor().end(); return reinterpret_cast(BytesEnd); } /// Return the end of the list of BundleOpInfo instances associated /// with this OperandBundleUser. const_bundle_op_iterator bundle_op_info_end() const { - auto *NonConstThis = - const_cast *>(this); + auto *NonConstThis = const_cast(this); return NonConstThis->bundle_op_info_end(); } @@ -1400,30 +1785,8 @@ protected: /// /// Each \p OperandBundleDef instance is tracked by a OperandBundleInfo /// instance allocated in this User's descriptor. - OpIteratorTy populateBundleOperandInfos(ArrayRef Bundles, - const unsigned BeginIndex) { - auto It = static_cast(this)->op_begin() + BeginIndex; - for (auto &B : Bundles) - It = std::copy(B.input_begin(), B.input_end(), It); - - auto *ContextImpl = static_cast(this)->getContext().pImpl; - auto BI = Bundles.begin(); - unsigned CurrentIndex = BeginIndex; - - for (auto &BOI : bundle_op_infos()) { - assert(BI != Bundles.end() && "Incorrect allocation?"); - - BOI.Tag = ContextImpl->getOrInsertBundleTag(BI->getTag()); - BOI.Begin = CurrentIndex; - BOI.End = CurrentIndex + BI->input_size(); - CurrentIndex = BOI.End; - BI++; - } - - assert(BI == Bundles.end() && "Incorrect allocation?"); - - return It; - } + op_iterator populateBundleOperandInfos(ArrayRef Bundles, + const unsigned BeginIndex); /// Return the BundleOpInfo for the operand at index OpIdx. /// @@ -1437,6 +1800,7 @@ protected: llvm_unreachable("Did not find operand bundle for operand!"); } +protected: /// Return the total number of values used in \p Bundles. static unsigned CountBundleInputs(ArrayRef Bundles) { unsigned Total = 0; @@ -1444,8 +1808,102 @@ protected: Total += B.input_size(); return Total; } + + /// @} + // End of operand bundle API. + +private: + bool hasFnAttrOnCalledFunction(Attribute::AttrKind Kind) const; + bool hasFnAttrOnCalledFunction(StringRef Kind) const; + + template bool hasFnAttrImpl(AttrKind Kind) const { + if (Attrs.hasAttribute(AttributeList::FunctionIndex, Kind)) + return true; + + // Operand bundles override attributes on the called function, but don't + // override attributes directly present on the call instruction. + if (isFnAttrDisallowedByOpBundle(Kind)) + return false; + + return hasFnAttrOnCalledFunction(Kind); + } +}; + +template <> +struct OperandTraits : public VariadicOperandTraits {}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CallBase, Value) + +//===----------------------------------------------------------------------===// +// FuncletPadInst Class +//===----------------------------------------------------------------------===// +class FuncletPadInst : public Instruction { +private: + FuncletPadInst(const FuncletPadInst &CPI); + + explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, + ArrayRef Args, unsigned Values, + const Twine &NameStr, Instruction *InsertBefore); + explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, + ArrayRef Args, unsigned Values, + const Twine &NameStr, BasicBlock *InsertAtEnd); + + void init(Value *ParentPad, ArrayRef Args, const Twine &NameStr); + +protected: + // Note: Instruction needs to be a friend here to call cloneImpl. + friend class Instruction; + friend class CatchPadInst; + friend class CleanupPadInst; + + FuncletPadInst *cloneImpl() const; + +public: + /// Provide fast operand accessors + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + /// getNumArgOperands - Return the number of funcletpad arguments. + /// + unsigned getNumArgOperands() const { return getNumOperands() - 1; } + + /// Convenience accessors + + /// Return the outer EH-pad this funclet is nested within. + /// + /// Note: This returns the associated CatchSwitchInst if this FuncletPadInst + /// is a CatchPadInst. + Value *getParentPad() const { return Op<-1>(); } + void setParentPad(Value *ParentPad) { + assert(ParentPad); + Op<-1>() = ParentPad; + } + + /// getArgOperand/setArgOperand - Return/set the i-th funcletpad argument. + /// + Value *getArgOperand(unsigned i) const { return getOperand(i); } + void setArgOperand(unsigned i, Value *v) { setOperand(i, v); } + + /// arg_operands - iteration adapter for range-for loops. + op_range arg_operands() { return op_range(op_begin(), op_end() - 1); } + + /// arg_operands - iteration adapter for range-for loops. + const_op_range arg_operands() const { + return const_op_range(op_begin(), op_end() - 1); + } + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const Instruction *I) { return I->isFuncletPad(); } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } }; +template <> +struct OperandTraits + : public VariadicOperandTraits {}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(FuncletPadInst, Value) + } // end namespace llvm #endif // LLVM_IR_INSTRTYPES_H diff --git a/include/llvm/IR/Instruction.def b/include/llvm/IR/Instruction.def index 86617299c44ac6a2713a4fc8a4be7e325eace047..58e4e2e1d6cc50b1564438e2d58c2642fcf073f9 100644 --- a/include/llvm/IR/Instruction.def +++ b/include/llvm/IR/Instruction.def @@ -32,6 +32,20 @@ #define LAST_TERM_INST(num) #endif +#ifndef FIRST_UNARY_INST +#define FIRST_UNARY_INST(num) +#endif +#ifndef HANDLE_UNARY_INST +#ifndef HANDLE_INST +#define HANDLE_UNARY_INST(num, opcode, instclass) +#else +#define HANDLE_UNARY_INST(num, opcode, Class) HANDLE_INST(num, opcode, Class) +#endif +#endif +#ifndef LAST_UNARY_INST +#define LAST_UNARY_INST(num) +#endif + #ifndef FIRST_BINARY_INST #define FIRST_BINARY_INST(num) #endif @@ -123,87 +137,96 @@ HANDLE_TERM_INST ( 9, CatchRet , CatchReturnInst) HANDLE_TERM_INST (10, CatchSwitch , CatchSwitchInst) LAST_TERM_INST (10) +// Standard unary operators... + FIRST_UNARY_INST(11) +HANDLE_UNARY_INST(11, FNeg , UnaryOperator) + LAST_UNARY_INST(11) + // Standard binary operators... - FIRST_BINARY_INST(11) -HANDLE_BINARY_INST(11, Add , BinaryOperator) -HANDLE_BINARY_INST(12, FAdd , BinaryOperator) -HANDLE_BINARY_INST(13, Sub , BinaryOperator) -HANDLE_BINARY_INST(14, FSub , BinaryOperator) -HANDLE_BINARY_INST(15, Mul , BinaryOperator) -HANDLE_BINARY_INST(16, FMul , BinaryOperator) -HANDLE_BINARY_INST(17, UDiv , BinaryOperator) -HANDLE_BINARY_INST(18, SDiv , BinaryOperator) -HANDLE_BINARY_INST(19, FDiv , BinaryOperator) -HANDLE_BINARY_INST(20, URem , BinaryOperator) -HANDLE_BINARY_INST(21, SRem , BinaryOperator) -HANDLE_BINARY_INST(22, FRem , BinaryOperator) + FIRST_BINARY_INST(12) +HANDLE_BINARY_INST(12, Add , BinaryOperator) +HANDLE_BINARY_INST(13, FAdd , BinaryOperator) +HANDLE_BINARY_INST(14, Sub , BinaryOperator) +HANDLE_BINARY_INST(15, FSub , BinaryOperator) +HANDLE_BINARY_INST(16, Mul , BinaryOperator) +HANDLE_BINARY_INST(17, FMul , BinaryOperator) +HANDLE_BINARY_INST(18, UDiv , BinaryOperator) +HANDLE_BINARY_INST(19, SDiv , BinaryOperator) +HANDLE_BINARY_INST(20, FDiv , BinaryOperator) +HANDLE_BINARY_INST(21, URem , BinaryOperator) +HANDLE_BINARY_INST(22, SRem , BinaryOperator) +HANDLE_BINARY_INST(23, FRem , BinaryOperator) // Logical operators (integer operands) -HANDLE_BINARY_INST(23, Shl , BinaryOperator) // Shift left (logical) -HANDLE_BINARY_INST(24, LShr , BinaryOperator) // Shift right (logical) -HANDLE_BINARY_INST(25, AShr , BinaryOperator) // Shift right (arithmetic) -HANDLE_BINARY_INST(26, And , BinaryOperator) -HANDLE_BINARY_INST(27, Or , BinaryOperator) -HANDLE_BINARY_INST(28, Xor , BinaryOperator) - LAST_BINARY_INST(28) +HANDLE_BINARY_INST(24, Shl , BinaryOperator) // Shift left (logical) +HANDLE_BINARY_INST(25, LShr , BinaryOperator) // Shift right (logical) +HANDLE_BINARY_INST(26, AShr , BinaryOperator) // Shift right (arithmetic) +HANDLE_BINARY_INST(27, And , BinaryOperator) +HANDLE_BINARY_INST(28, Or , BinaryOperator) +HANDLE_BINARY_INST(29, Xor , BinaryOperator) + LAST_BINARY_INST(29) // Memory operators... - FIRST_MEMORY_INST(29) -HANDLE_MEMORY_INST(29, Alloca, AllocaInst) // Stack management -HANDLE_MEMORY_INST(30, Load , LoadInst ) // Memory manipulation instrs -HANDLE_MEMORY_INST(31, Store , StoreInst ) -HANDLE_MEMORY_INST(32, GetElementPtr, GetElementPtrInst) -HANDLE_MEMORY_INST(33, Fence , FenceInst ) -HANDLE_MEMORY_INST(34, AtomicCmpXchg , AtomicCmpXchgInst ) -HANDLE_MEMORY_INST(35, AtomicRMW , AtomicRMWInst ) - LAST_MEMORY_INST(35) + FIRST_MEMORY_INST(30) +HANDLE_MEMORY_INST(30, Alloca, AllocaInst) // Stack management +HANDLE_MEMORY_INST(31, Load , LoadInst ) // Memory manipulation instrs +HANDLE_MEMORY_INST(32, Store , StoreInst ) +HANDLE_MEMORY_INST(33, GetElementPtr, GetElementPtrInst) +HANDLE_MEMORY_INST(34, Fence , FenceInst ) +HANDLE_MEMORY_INST(35, AtomicCmpXchg , AtomicCmpXchgInst ) +HANDLE_MEMORY_INST(36, AtomicRMW , AtomicRMWInst ) + LAST_MEMORY_INST(36) // Cast operators ... // NOTE: The order matters here because CastInst::isEliminableCastPair // NOTE: (see Instructions.cpp) encodes a table based on this ordering. - FIRST_CAST_INST(36) -HANDLE_CAST_INST(36, Trunc , TruncInst ) // Truncate integers -HANDLE_CAST_INST(37, ZExt , ZExtInst ) // Zero extend integers -HANDLE_CAST_INST(38, SExt , SExtInst ) // Sign extend integers -HANDLE_CAST_INST(39, FPToUI , FPToUIInst ) // floating point -> UInt -HANDLE_CAST_INST(40, FPToSI , FPToSIInst ) // floating point -> SInt -HANDLE_CAST_INST(41, UIToFP , UIToFPInst ) // UInt -> floating point -HANDLE_CAST_INST(42, SIToFP , SIToFPInst ) // SInt -> floating point -HANDLE_CAST_INST(43, FPTrunc , FPTruncInst ) // Truncate floating point -HANDLE_CAST_INST(44, FPExt , FPExtInst ) // Extend floating point -HANDLE_CAST_INST(45, PtrToInt, PtrToIntInst) // Pointer -> Integer -HANDLE_CAST_INST(46, IntToPtr, IntToPtrInst) // Integer -> Pointer -HANDLE_CAST_INST(47, BitCast , BitCastInst ) // Type cast -HANDLE_CAST_INST(48, AddrSpaceCast, AddrSpaceCastInst) // addrspace cast - LAST_CAST_INST(48) - - FIRST_FUNCLETPAD_INST(49) -HANDLE_FUNCLETPAD_INST(49, CleanupPad, CleanupPadInst) -HANDLE_FUNCLETPAD_INST(50, CatchPad , CatchPadInst) - LAST_FUNCLETPAD_INST(50) + FIRST_CAST_INST(37) +HANDLE_CAST_INST(37, Trunc , TruncInst ) // Truncate integers +HANDLE_CAST_INST(38, ZExt , ZExtInst ) // Zero extend integers +HANDLE_CAST_INST(39, SExt , SExtInst ) // Sign extend integers +HANDLE_CAST_INST(40, FPToUI , FPToUIInst ) // floating point -> UInt +HANDLE_CAST_INST(41, FPToSI , FPToSIInst ) // floating point -> SInt +HANDLE_CAST_INST(42, UIToFP , UIToFPInst ) // UInt -> floating point +HANDLE_CAST_INST(43, SIToFP , SIToFPInst ) // SInt -> floating point +HANDLE_CAST_INST(44, FPTrunc , FPTruncInst ) // Truncate floating point +HANDLE_CAST_INST(45, FPExt , FPExtInst ) // Extend floating point +HANDLE_CAST_INST(46, PtrToInt, PtrToIntInst) // Pointer -> Integer +HANDLE_CAST_INST(47, IntToPtr, IntToPtrInst) // Integer -> Pointer +HANDLE_CAST_INST(48, BitCast , BitCastInst ) // Type cast +HANDLE_CAST_INST(49, AddrSpaceCast, AddrSpaceCastInst) // addrspace cast + LAST_CAST_INST(49) + + FIRST_FUNCLETPAD_INST(50) +HANDLE_FUNCLETPAD_INST(50, CleanupPad, CleanupPadInst) +HANDLE_FUNCLETPAD_INST(51, CatchPad , CatchPadInst) + LAST_FUNCLETPAD_INST(51) // Other operators... - FIRST_OTHER_INST(51) -HANDLE_OTHER_INST(51, ICmp , ICmpInst ) // Integer comparison instruction -HANDLE_OTHER_INST(52, FCmp , FCmpInst ) // Floating point comparison instr. -HANDLE_OTHER_INST(53, PHI , PHINode ) // PHI node instruction -HANDLE_OTHER_INST(54, Call , CallInst ) // Call a function -HANDLE_OTHER_INST(55, Select , SelectInst ) // select instruction -HANDLE_USER_INST (56, UserOp1, Instruction) // May be used internally in a pass -HANDLE_USER_INST (57, UserOp2, Instruction) // Internal to passes only -HANDLE_OTHER_INST(58, VAArg , VAArgInst ) // vaarg instruction -HANDLE_OTHER_INST(59, ExtractElement, ExtractElementInst)// extract from vector -HANDLE_OTHER_INST(60, InsertElement, InsertElementInst) // insert into vector -HANDLE_OTHER_INST(61, ShuffleVector, ShuffleVectorInst) // shuffle two vectors. -HANDLE_OTHER_INST(62, ExtractValue, ExtractValueInst)// extract from aggregate -HANDLE_OTHER_INST(63, InsertValue, InsertValueInst) // insert into aggregate -HANDLE_OTHER_INST(64, LandingPad, LandingPadInst) // Landing pad instruction. - LAST_OTHER_INST(64) + FIRST_OTHER_INST(52) +HANDLE_OTHER_INST(52, ICmp , ICmpInst ) // Integer comparison instruction +HANDLE_OTHER_INST(53, FCmp , FCmpInst ) // Floating point comparison instr. +HANDLE_OTHER_INST(54, PHI , PHINode ) // PHI node instruction +HANDLE_OTHER_INST(55, Call , CallInst ) // Call a function +HANDLE_OTHER_INST(56, Select , SelectInst ) // select instruction +HANDLE_USER_INST (57, UserOp1, Instruction) // May be used internally in a pass +HANDLE_USER_INST (58, UserOp2, Instruction) // Internal to passes only +HANDLE_OTHER_INST(59, VAArg , VAArgInst ) // vaarg instruction +HANDLE_OTHER_INST(60, ExtractElement, ExtractElementInst)// extract from vector +HANDLE_OTHER_INST(61, InsertElement, InsertElementInst) // insert into vector +HANDLE_OTHER_INST(62, ShuffleVector, ShuffleVectorInst) // shuffle two vectors. +HANDLE_OTHER_INST(63, ExtractValue, ExtractValueInst)// extract from aggregate +HANDLE_OTHER_INST(64, InsertValue, InsertValueInst) // insert into aggregate +HANDLE_OTHER_INST(65, LandingPad, LandingPadInst) // Landing pad instruction. + LAST_OTHER_INST(65) #undef FIRST_TERM_INST #undef HANDLE_TERM_INST #undef LAST_TERM_INST +#undef FIRST_UNARY_INST +#undef HANDLE_UNARY_INST +#undef LAST_UNARY_INST + #undef FIRST_BINARY_INST #undef HANDLE_BINARY_INST #undef LAST_BINARY_INST diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h index 2a75801c649ec513f7dc98980a14d3b30b6351f6..7a81f700ee4ecccd6ca45652776c421a3295b287 100644 --- a/include/llvm/IR/Instruction.h +++ b/include/llvm/IR/Instruction.h @@ -127,6 +127,7 @@ public: const char *getOpcodeName() const { return getOpcodeName(getOpcode()); } bool isTerminator() const { return isTerminator(getOpcode()); } + bool isUnaryOp() const { return isUnaryOp(getOpcode()); } bool isBinaryOp() const { return isBinaryOp(getOpcode()); } bool isIntDivRem() const { return isIntDivRem(getOpcode()); } bool isShift() { return isShift(getOpcode()); } @@ -142,6 +143,9 @@ public: return OpCode >= TermOpsBegin && OpCode < TermOpsEnd; } + static inline bool isUnaryOp(unsigned Opcode) { + return Opcode >= UnaryOpsBegin && Opcode < UnaryOpsEnd; + } static inline bool isBinaryOp(unsigned Opcode) { return Opcode >= BinaryOpsBegin && Opcode < BinaryOpsEnd; } @@ -586,6 +590,14 @@ public: static_cast(this)->getNextNonDebugInstruction()); } + /// Return a pointer to the previous non-debug instruction in the same basic + /// block as 'this', or nullptr if no such instruction exists. + const Instruction *getPrevNonDebugInstruction() const; + Instruction *getPrevNonDebugInstruction() { + return const_cast( + static_cast(this)->getPrevNonDebugInstruction()); + } + /// Create a copy of 'this' instruction that is identical in all ways except /// the following: /// * The instruction has no parent @@ -654,6 +666,13 @@ public: #include "llvm/IR/Instruction.def" }; + enum UnaryOps { +#define FIRST_UNARY_INST(N) UnaryOpsBegin = N, +#define HANDLE_UNARY_INST(N, OPC, CLASS) OPC = N, +#define LAST_UNARY_INST(N) UnaryOpsEnd = N+1 +#include "llvm/IR/Instruction.def" + }; + enum BinaryOps { #define FIRST_BINARY_INST(N) BinaryOpsBegin = N, #define HANDLE_BINARY_INST(N, OPC, CLASS) OPC = N, diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h index 7b2c13c5328239fbf03877f632fc946694376de3..7700e7caa9cd5572b8164582cf0f9ddcac30bc93 100644 --- a/include/llvm/IR/Instructions.h +++ b/include/llvm/IR/Instructions.h @@ -1103,6 +1103,71 @@ GetElementPtrInst::GetElementPtrInst(Type *PointeeType, Value *Ptr, DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GetElementPtrInst, Value) +//===----------------------------------------------------------------------===// +// UnaryOperator Class +//===----------------------------------------------------------------------===// + +/// a unary instruction +class UnaryOperator : public UnaryInstruction { + void AssertOK(); + +protected: + UnaryOperator(UnaryOps iType, Value *S, Type *Ty, + const Twine &Name, Instruction *InsertBefore); + UnaryOperator(UnaryOps iType, Value *S, Type *Ty, + const Twine &Name, BasicBlock *InsertAtEnd); + + // Note: Instruction needs to be a friend here to call cloneImpl. + friend class Instruction; + + UnaryOperator *cloneImpl() const; + +public: + + /// Construct a unary instruction, given the opcode and an operand. + /// Optionally (if InstBefore is specified) insert the instruction + /// into a BasicBlock right before the specified instruction. The specified + /// Instruction is allowed to be a dereferenced end iterator. + /// + static UnaryOperator *Create(UnaryOps Op, Value *S, + const Twine &Name = Twine(), + Instruction *InsertBefore = nullptr); + + /// Construct a unary instruction, given the opcode and an operand. + /// Also automatically insert this instruction to the end of the + /// BasicBlock specified. + /// + static UnaryOperator *Create(UnaryOps Op, Value *S, + const Twine &Name, + BasicBlock *InsertAtEnd); + + /// These methods just forward to Create, and are useful when you + /// statically know what type of instruction you're going to create. These + /// helpers just save some typing. +#define HANDLE_UNARY_INST(N, OPC, CLASS) \ + static UnaryInstruction *Create##OPC(Value *V, \ + const Twine &Name = "") {\ + return Create(Instruction::OPC, V, Name);\ + } +#include "llvm/IR/Instruction.def" +#define HANDLE_UNARY_INST(N, OPC, CLASS) \ + static UnaryInstruction *Create##OPC(Value *V, \ + const Twine &Name, BasicBlock *BB) {\ + return Create(Instruction::OPC, V, Name, BB);\ + } +#include "llvm/IR/Instruction.def" +#define HANDLE_UNARY_INST(N, OPC, CLASS) \ + static UnaryInstruction *Create##OPC(Value *V, \ + const Twine &Name, Instruction *I) {\ + return Create(Instruction::OPC, V, Name, I);\ + } +#include "llvm/IR/Instruction.def" + + UnaryOps getOpcode() const { + return static_cast(Instruction::getOpcode()); + } +}; + //===----------------------------------------------------------------------===// // ICmpInst Class //===----------------------------------------------------------------------===// @@ -1353,535 +1418,13 @@ public: } }; -class CallInst; -class InvokeInst; - -template struct CallBaseParent { using type = Instruction; }; - -//===----------------------------------------------------------------------===// -/// Base class for all callable instructions (InvokeInst and CallInst) -/// Holds everything related to calling a function, abstracting from the base -/// type @p BaseInstTy and the concrete instruction @p InstTy -/// -template -class CallBase : public CallBaseParent::type, - public OperandBundleUser { -protected: - AttributeList Attrs; ///< parameter attributes for callable - FunctionType *FTy; - using BaseInstTy = typename CallBaseParent::type; - - template - CallBase(AttributeList const &A, FunctionType *FT, ArgsTy &&... Args) - : BaseInstTy(std::forward(Args)...), Attrs(A), FTy(FT) {} - bool hasDescriptor() const { return Value::HasDescriptor; } - - using BaseInstTy::BaseInstTy; - - using OperandBundleUser::isFnAttrDisallowedByOpBundle; - using OperandBundleUser::getNumTotalBundleOperands; - using OperandBundleUser::bundleOperandHasAttr; - using Instruction::getSubclassDataFromInstruction; - using Instruction::setInstructionSubclassData; - -public: - using Instruction::getContext; - using OperandBundleUser::hasOperandBundles; - using OperandBundleUser::getBundleOperandsStartIndex; - - static bool classof(const Instruction *I) { - llvm_unreachable( - "CallBase is not meant to be used as part of the classof hierarchy"); - } - -public: - /// Return the parameter attributes for this call. - /// - AttributeList getAttributes() const { return Attrs; } - - /// Set the parameter attributes for this call. - /// - void setAttributes(AttributeList A) { Attrs = A; } - - FunctionType *getFunctionType() const { return FTy; } - - void mutateFunctionType(FunctionType *FTy) { - Value::mutateType(FTy->getReturnType()); - this->FTy = FTy; - } - - /// Return the number of call arguments. - /// - unsigned getNumArgOperands() const { - return getNumOperands() - getNumTotalBundleOperands() - InstTy::ArgOffset; - } - - /// getArgOperand/setArgOperand - Return/set the i-th call argument. - /// - Value *getArgOperand(unsigned i) const { - assert(i < getNumArgOperands() && "Out of bounds!"); - return getOperand(i); - } - void setArgOperand(unsigned i, Value *v) { - assert(i < getNumArgOperands() && "Out of bounds!"); - setOperand(i, v); - } - - /// Return the iterator pointing to the beginning of the argument list. - User::op_iterator arg_begin() { return op_begin(); } - - /// Return the iterator pointing to the end of the argument list. - User::op_iterator arg_end() { - // [ call args ], [ operand bundles ], callee - return op_end() - getNumTotalBundleOperands() - InstTy::ArgOffset; - } - - /// Iteration adapter for range-for loops. - iterator_range arg_operands() { - return make_range(arg_begin(), arg_end()); - } - - /// Return the iterator pointing to the beginning of the argument list. - User::const_op_iterator arg_begin() const { return op_begin(); } - - /// Return the iterator pointing to the end of the argument list. - User::const_op_iterator arg_end() const { - // [ call args ], [ operand bundles ], callee - return op_end() - getNumTotalBundleOperands() - InstTy::ArgOffset; - } - - /// Iteration adapter for range-for loops. - iterator_range arg_operands() const { - return make_range(arg_begin(), arg_end()); - } - - /// Wrappers for getting the \c Use of a call argument. - const Use &getArgOperandUse(unsigned i) const { - assert(i < getNumArgOperands() && "Out of bounds!"); - return User::getOperandUse(i); - } - Use &getArgOperandUse(unsigned i) { - assert(i < getNumArgOperands() && "Out of bounds!"); - return User::getOperandUse(i); - } - - /// If one of the arguments has the 'returned' attribute, return its - /// operand value. Otherwise, return nullptr. - Value *getReturnedArgOperand() const { - unsigned Index; - - if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index) - return getArgOperand(Index - AttributeList::FirstArgIndex); - if (const Function *F = getCalledFunction()) - if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) && - Index) - return getArgOperand(Index - AttributeList::FirstArgIndex); - - return nullptr; - } - - User::op_iterator op_begin() { - return OperandTraits::op_begin(this); - } - - User::const_op_iterator op_begin() const { - return OperandTraits::op_begin(const_cast(this)); - } - - User::op_iterator op_end() { return OperandTraits::op_end(this); } - - User::const_op_iterator op_end() const { - return OperandTraits::op_end(const_cast(this)); - } - - Value *getOperand(unsigned i_nocapture) const { - assert(i_nocapture < OperandTraits::operands(this) && - "getOperand() out of range!"); - return cast_or_null(OperandTraits::op_begin( - const_cast(this))[i_nocapture] - .get()); - } - - void setOperand(unsigned i_nocapture, Value *Val_nocapture) { - assert(i_nocapture < OperandTraits::operands(this) && - "setOperand() out of range!"); - OperandTraits::op_begin(this)[i_nocapture] = Val_nocapture; - } - - unsigned getNumOperands() const { - return OperandTraits::operands(this); - } - template Use &Op() { - return User::OpFrom(this); - } - template const Use &Op() const { - return User::OpFrom(this); - } - - /// Return the function called, or null if this is an - /// indirect function invocation. - /// - Function *getCalledFunction() const { - return dyn_cast_or_null(Op<-InstTy::ArgOffset>()); - } - - /// Determine whether this call has the given attribute. - bool hasFnAttr(Attribute::AttrKind Kind) const { - assert(Kind != Attribute::NoBuiltin && - "Use CallBase::isNoBuiltin() to check for Attribute::NoBuiltin"); - return hasFnAttrImpl(Kind); - } - - /// Determine whether this call has the given attribute. - bool hasFnAttr(StringRef Kind) const { return hasFnAttrImpl(Kind); } - - /// getCallingConv/setCallingConv - Get or set the calling convention of this - /// function call. - CallingConv::ID getCallingConv() const { - return static_cast(getSubclassDataFromInstruction() >> 2); - } - void setCallingConv(CallingConv::ID CC) { - auto ID = static_cast(CC); - assert(!(ID & ~CallingConv::MaxID) && "Unsupported calling convention"); - setInstructionSubclassData((getSubclassDataFromInstruction() & 3) | - (ID << 2)); - } - - - /// adds the attribute to the list of attributes. - void addAttribute(unsigned i, Attribute::AttrKind Kind) { - AttributeList PAL = getAttributes(); - PAL = PAL.addAttribute(getContext(), i, Kind); - setAttributes(PAL); - } - - /// adds the attribute to the list of attributes. - void addAttribute(unsigned i, Attribute Attr) { - AttributeList PAL = getAttributes(); - PAL = PAL.addAttribute(getContext(), i, Attr); - setAttributes(PAL); - } - - /// Adds the attribute to the indicated argument - void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - assert(ArgNo < getNumArgOperands() && "Out of bounds"); - AttributeList PAL = getAttributes(); - PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind); - setAttributes(PAL); - } - - /// Adds the attribute to the indicated argument - void addParamAttr(unsigned ArgNo, Attribute Attr) { - assert(ArgNo < getNumArgOperands() && "Out of bounds"); - AttributeList PAL = getAttributes(); - PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr); - setAttributes(PAL); - } - - /// removes the attribute from the list of attributes. - void removeAttribute(unsigned i, Attribute::AttrKind Kind) { - AttributeList PAL = getAttributes(); - PAL = PAL.removeAttribute(getContext(), i, Kind); - setAttributes(PAL); - } - - /// removes the attribute from the list of attributes. - void removeAttribute(unsigned i, StringRef Kind) { - AttributeList PAL = getAttributes(); - PAL = PAL.removeAttribute(getContext(), i, Kind); - setAttributes(PAL); - } - - /// Removes the attribute from the given argument - void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - assert(ArgNo < getNumArgOperands() && "Out of bounds"); - AttributeList PAL = getAttributes(); - PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); - setAttributes(PAL); - } - - /// Removes the attribute from the given argument - void removeParamAttr(unsigned ArgNo, StringRef Kind) { - assert(ArgNo < getNumArgOperands() && "Out of bounds"); - AttributeList PAL = getAttributes(); - PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); - setAttributes(PAL); - } - - /// adds the dereferenceable attribute to the list of attributes. - void addDereferenceableAttr(unsigned i, uint64_t Bytes) { - AttributeList PAL = getAttributes(); - PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes); - setAttributes(PAL); - } - - /// adds the dereferenceable_or_null attribute to the list of - /// attributes. - void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) { - AttributeList PAL = getAttributes(); - PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes); - setAttributes(PAL); - } - - /// Determine whether the return value has the given attribute. - bool hasRetAttr(Attribute::AttrKind Kind) const { - if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind)) - return true; - - // Look at the callee, if available. - if (const Function *F = getCalledFunction()) - return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind); - return false; - } - - /// Determine whether the argument or parameter has the given attribute. - bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const { - assert(ArgNo < getNumArgOperands() && "Param index out of bounds!"); - - if (Attrs.hasParamAttribute(ArgNo, Kind)) - return true; - if (const Function *F = getCalledFunction()) - return F->getAttributes().hasParamAttribute(ArgNo, Kind); - return false; - } - - /// Get the attribute of a given kind at a position. - Attribute getAttribute(unsigned i, Attribute::AttrKind Kind) const { - return getAttributes().getAttribute(i, Kind); - } - - /// Get the attribute of a given kind at a position. - Attribute getAttribute(unsigned i, StringRef Kind) const { - return getAttributes().getAttribute(i, Kind); - } - - /// Get the attribute of a given kind from a given arg - Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const { - assert(ArgNo < getNumArgOperands() && "Out of bounds"); - return getAttributes().getParamAttr(ArgNo, Kind); - } - - /// Get the attribute of a given kind from a given arg - Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const { - assert(ArgNo < getNumArgOperands() && "Out of bounds"); - return getAttributes().getParamAttr(ArgNo, Kind); - } - /// Return true if the data operand at index \p i has the attribute \p - /// A. - /// - /// Data operands include call arguments and values used in operand bundles, - /// but does not include the callee operand. This routine dispatches to the - /// underlying AttributeList or the OperandBundleUser as appropriate. - /// - /// The index \p i is interpreted as - /// - /// \p i == Attribute::ReturnIndex -> the return value - /// \p i in [1, arg_size + 1) -> argument number (\p i - 1) - /// \p i in [arg_size + 1, data_operand_size + 1) -> bundle operand at index - /// (\p i - 1) in the operand list. - bool dataOperandHasImpliedAttr(unsigned i, Attribute::AttrKind Kind) const { - // There are getNumOperands() - (InstTy::ArgOffset - 1) data operands. - // The last operand is the callee. - assert(i < (getNumOperands() - InstTy::ArgOffset + 1) && - "Data operand index out of bounds!"); - - // The attribute A can either be directly specified, if the operand in - // question is a call argument; or be indirectly implied by the kind of its - // containing operand bundle, if the operand is a bundle operand. - - if (i == AttributeList::ReturnIndex) - return hasRetAttr(Kind); - - // FIXME: Avoid these i - 1 calculations and update the API to use - // zero-based indices. - if (i < (getNumArgOperands() + 1)) - return paramHasAttr(i - 1, Kind); - - assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) && - "Must be either a call argument or an operand bundle!"); - return bundleOperandHasAttr(i - 1, Kind); - } - - /// Extract the alignment of the return value. - unsigned getRetAlignment() const { return Attrs.getRetAlignment(); } - - /// Extract the alignment for a call or parameter (0=unknown). - unsigned getParamAlignment(unsigned ArgNo) const { - return Attrs.getParamAlignment(ArgNo); - } - - /// Extract the number of dereferenceable bytes for a call or - /// parameter (0=unknown). - uint64_t getDereferenceableBytes(unsigned i) const { - return Attrs.getDereferenceableBytes(i); - } - - /// Extract the number of dereferenceable_or_null bytes for a call or - /// parameter (0=unknown). - uint64_t getDereferenceableOrNullBytes(unsigned i) const { - return Attrs.getDereferenceableOrNullBytes(i); - } - - /// Determine if the return value is marked with NoAlias attribute. - bool returnDoesNotAlias() const { - return Attrs.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); - } - - /// Return true if the call should not be treated as a call to a - /// builtin. - bool isNoBuiltin() const { - return hasFnAttrImpl(Attribute::NoBuiltin) && - !hasFnAttrImpl(Attribute::Builtin); - } - - /// Determine if the call requires strict floating point semantics. - bool isStrictFP() const { return hasFnAttr(Attribute::StrictFP); } - - /// Return true if the call should not be inlined. - bool isNoInline() const { return hasFnAttr(Attribute::NoInline); } - void setIsNoInline() { - addAttribute(AttributeList::FunctionIndex, Attribute::NoInline); - } - /// Determine if the call does not access memory. - bool doesNotAccessMemory() const { - return hasFnAttr(Attribute::ReadNone); - } - void setDoesNotAccessMemory() { - addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone); - } - - /// Determine if the call does not access or only reads memory. - bool onlyReadsMemory() const { - return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly); - } - void setOnlyReadsMemory() { - addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly); - } - - /// Determine if the call does not access or only writes memory. - bool doesNotReadMemory() const { - return doesNotAccessMemory() || hasFnAttr(Attribute::WriteOnly); - } - void setDoesNotReadMemory() { - addAttribute(AttributeList::FunctionIndex, Attribute::WriteOnly); - } - - /// Determine if the call can access memmory only using pointers based - /// on its arguments. - bool onlyAccessesArgMemory() const { - return hasFnAttr(Attribute::ArgMemOnly); - } - void setOnlyAccessesArgMemory() { - addAttribute(AttributeList::FunctionIndex, Attribute::ArgMemOnly); - } - - /// Determine if the function may only access memory that is - /// inaccessible from the IR. - bool onlyAccessesInaccessibleMemory() const { - return hasFnAttr(Attribute::InaccessibleMemOnly); - } - void setOnlyAccessesInaccessibleMemory() { - addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOnly); - } - - /// Determine if the function may only access memory that is - /// either inaccessible from the IR or pointed to by its arguments. - bool onlyAccessesInaccessibleMemOrArgMem() const { - return hasFnAttr(Attribute::InaccessibleMemOrArgMemOnly); - } - void setOnlyAccessesInaccessibleMemOrArgMem() { - addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOrArgMemOnly); - } - /// Determine if the call cannot return. - bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); } - void setDoesNotReturn() { - addAttribute(AttributeList::FunctionIndex, Attribute::NoReturn); - } - - /// Determine if the call should not perform indirect branch tracking. - bool doesNoCfCheck() const { return hasFnAttr(Attribute::NoCfCheck); } - - /// Determine if the call cannot unwind. - bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); } - void setDoesNotThrow() { - addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind); - } - - /// Determine if the invoke cannot be duplicated. - bool cannotDuplicate() const {return hasFnAttr(Attribute::NoDuplicate); } - void setCannotDuplicate() { - addAttribute(AttributeList::FunctionIndex, Attribute::NoDuplicate); - } - - /// Determine if the invoke is convergent - bool isConvergent() const { return hasFnAttr(Attribute::Convergent); } - void setConvergent() { - addAttribute(AttributeList::FunctionIndex, Attribute::Convergent); - } - void setNotConvergent() { - removeAttribute(AttributeList::FunctionIndex, Attribute::Convergent); - } - - /// Determine if the call returns a structure through first - /// pointer argument. - bool hasStructRetAttr() const { - if (getNumArgOperands() == 0) - return false; - - // Be friendly and also check the callee. - return paramHasAttr(0, Attribute::StructRet); - } - - /// Determine if any call argument is an aggregate passed by value. - bool hasByValArgument() const { - return Attrs.hasAttrSomewhere(Attribute::ByVal); - } - /// Get a pointer to the function that is invoked by this - /// instruction. - const Value *getCalledValue() const { return Op<-InstTy::ArgOffset>(); } - Value *getCalledValue() { return Op<-InstTy::ArgOffset>(); } - - /// Set the function called. - void setCalledFunction(Value* Fn) { - setCalledFunction( - cast(cast(Fn->getType())->getElementType()), - Fn); - } - void setCalledFunction(FunctionType *FTy, Value *Fn) { - this->FTy = FTy; - assert(FTy == cast( - cast(Fn->getType())->getElementType())); - Op<-InstTy::ArgOffset>() = Fn; - } - -protected: - template bool hasFnAttrImpl(AttrKind Kind) const { - if (Attrs.hasAttribute(AttributeList::FunctionIndex, Kind)) - return true; - - // Operand bundles override attributes on the called function, but don't - // override attributes directly present on the call instruction. - if (isFnAttrDisallowedByOpBundle(Kind)) - return false; - - if (const Function *F = getCalledFunction()) - return F->getAttributes().hasAttribute(AttributeList::FunctionIndex, - Kind); - return false; - } -}; - //===----------------------------------------------------------------------===// /// This class represents a function call, abstracting a target /// machine's calling convention. This class uses low bit of the SubClassData /// field to indicate whether or not this is a tail call. The rest of the bits /// hold the calling convention of the call. /// -class CallInst : public CallBase { - friend class OperandBundleUser; - +class CallInst : public CallBase { CallInst(const CallInst &CI); /// Construct a CallInst given a range of arguments. @@ -1921,6 +1464,13 @@ class CallInst : public CallBase { ArrayRef Bundles, const Twine &NameStr); void init(Value *Func, const Twine &NameStr); + /// Compute the number of operands to allocate. + static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) { + // We need one operand for the called function, plus the input operand + // counts provided. + return 1 + NumArgs + NumBundleInputs; + } + protected: // Note: Instruction needs to be a friend here to call cloneImpl. friend class Instruction; @@ -1928,8 +1478,6 @@ protected: CallInst *cloneImpl() const; public: - static constexpr int ArgOffset = 1; - static CallInst *Create(Value *Func, ArrayRef Args, ArrayRef Bundles = None, const Twine &NameStr = "", @@ -1950,7 +1498,7 @@ public: static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef Args, const Twine &NameStr, Instruction *InsertBefore = nullptr) { - return new (unsigned(Args.size() + 1)) + return new (ComputeNumOperands(Args.size())) CallInst(Ty, Func, Args, None, NameStr, InsertBefore); } @@ -1958,39 +1506,39 @@ public: ArrayRef Bundles = None, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) { - const unsigned TotalOps = - unsigned(Args.size()) + CountBundleInputs(Bundles) + 1; + const int NumOperands = + ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)); const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); - return new (TotalOps, DescriptorBytes) + return new (NumOperands, DescriptorBytes) CallInst(Ty, Func, Args, Bundles, NameStr, InsertBefore); } static CallInst *Create(Value *Func, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr, BasicBlock *InsertAtEnd) { - const unsigned TotalOps = - unsigned(Args.size()) + CountBundleInputs(Bundles) + 1; + const int NumOperands = + ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)); const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); - return new (TotalOps, DescriptorBytes) + return new (NumOperands, DescriptorBytes) CallInst(Func, Args, Bundles, NameStr, InsertAtEnd); } static CallInst *Create(Value *Func, ArrayRef Args, const Twine &NameStr, BasicBlock *InsertAtEnd) { - return new (unsigned(Args.size() + 1)) + return new (ComputeNumOperands(Args.size())) CallInst(Func, Args, None, NameStr, InsertAtEnd); } static CallInst *Create(Value *F, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) { - return new (1) CallInst(F, NameStr, InsertBefore); + return new (ComputeNumOperands(0)) CallInst(F, NameStr, InsertBefore); } static CallInst *Create(Value *F, const Twine &NameStr, BasicBlock *InsertAtEnd) { - return new (1) CallInst(F, NameStr, InsertAtEnd); + return new (ComputeNumOperands(0)) CallInst(F, NameStr, InsertAtEnd); } /// Create a clone of \p CI with a different set of operand bundles and @@ -2081,7 +1629,7 @@ public: } /// Check if this call is an inline asm statement. - bool isInlineAsm() const { return isa(Op<-1>()); } + bool isInlineAsm() const { return isa(getCalledOperand()); } // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Instruction *I) { @@ -2099,32 +1647,28 @@ private: } }; -template <> -struct OperandTraits> - : public VariadicOperandTraits, 1> {}; - CallInst::CallInst(Value *Func, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr, BasicBlock *InsertAtEnd) - : CallBase( - cast( - cast(Func->getType())->getElementType()) - ->getReturnType(), - Instruction::Call, - OperandTraits>::op_end(this) - - (Args.size() + CountBundleInputs(Bundles) + 1), - unsigned(Args.size() + CountBundleInputs(Bundles) + 1), InsertAtEnd) { + : CallBase(cast( + cast(Func->getType())->getElementType()) + ->getReturnType(), + Instruction::Call, + OperandTraits::op_end(this) - + (Args.size() + CountBundleInputs(Bundles) + 1), + unsigned(Args.size() + CountBundleInputs(Bundles) + 1), + InsertAtEnd) { init(Func, Args, Bundles, NameStr); } CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr, Instruction *InsertBefore) - : CallBase(Ty->getReturnType(), Instruction::Call, - OperandTraits>::op_end(this) - - (Args.size() + CountBundleInputs(Bundles) + 1), - unsigned(Args.size() + CountBundleInputs(Bundles) + 1), - InsertBefore) { + : CallBase(Ty->getReturnType(), Instruction::Call, + OperandTraits::op_end(this) - + (Args.size() + CountBundleInputs(Bundles) + 1), + unsigned(Args.size() + CountBundleInputs(Bundles) + 1), + InsertBefore) { init(Ty, Func, Args, Bundles, NameStr); } @@ -2648,6 +2192,25 @@ public: return !changesLength() && isTransposeMask(getMask()); } + /// Return true if this shuffle mask is an extract subvector mask. + /// A valid extract subvector mask returns a smaller vector from a single + /// source operand. The base extraction index is returned as well. + static bool isExtractSubvectorMask(ArrayRef Mask, int NumSrcElts, + int &Index); + static bool isExtractSubvectorMask(const Constant *Mask, int NumSrcElts, + int &Index) { + assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant."); + SmallVector MaskAsInts; + getShuffleMask(Mask, MaskAsInts); + return isExtractSubvectorMask(MaskAsInts, NumSrcElts, Index); + } + + /// Return true if this shuffle mask is an extract subvector mask. + bool isExtractSubvectorMask(int &Index) const { + int NumSrcElts = Op<0>()->getType()->getVectorNumElements(); + return isExtractSubvectorMask(getMask(), NumSrcElts, Index); + } + /// Change values in a shuffle permute mask assuming the two vector operands /// of length InVecNumElts have swapped position. static void commuteShuffleMask(MutableArrayRef Mask, @@ -3982,8 +3545,16 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(IndirectBrInst, Value) /// Invoke instruction. The SubclassData field is used to hold the /// calling convention of the call. /// -class InvokeInst : public CallBase { - friend class OperandBundleUser; +class InvokeInst : public CallBase { + /// The number of operands for this call beyond the called function, + /// arguments, and operand bundles. + static constexpr int NumExtraOperands = 2; + + /// The index from the end of the operand array to the normal destination. + static constexpr int NormalDestOpEndIdx = -3; + + /// The index from the end of the operand array to the unwind destination. + static constexpr int UnwindDestOpEndIdx = -2; InvokeInst(const InvokeInst &BI); @@ -3992,26 +3563,25 @@ class InvokeInst : public CallBase { /// Construct an InvokeInst from a range of arguments inline InvokeInst(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, ArrayRef Bundles, - unsigned Values, const Twine &NameStr, + int NumOperands, const Twine &NameStr, Instruction *InsertBefore) : InvokeInst(cast( cast(Func->getType())->getElementType()), - Func, IfNormal, IfException, Args, Bundles, Values, NameStr, - InsertBefore) {} + Func, IfNormal, IfException, Args, Bundles, NumOperands, + NameStr, InsertBefore) {} inline InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, - ArrayRef Bundles, unsigned Values, + ArrayRef Bundles, int NumOperands, const Twine &NameStr, Instruction *InsertBefore); /// Construct an InvokeInst given a range of arguments. /// /// Construct an InvokeInst from a range of arguments inline InvokeInst(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, ArrayRef Bundles, - unsigned Values, const Twine &NameStr, + int NumOperands, const Twine &NameStr, BasicBlock *InsertAtEnd); - void init(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr) { @@ -4024,6 +3594,13 @@ class InvokeInst : public CallBase { BasicBlock *IfException, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr); + /// Compute the number of operands to allocate. + static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) { + // We need one operand for the called function, plus our extra operands and + // the input operand counts provided. + return 1 + NumExtraOperands + NumArgs + NumBundleInputs; + } + protected: // Note: Instruction needs to be a friend here to call cloneImpl. friend class Instruction; @@ -4031,7 +3608,6 @@ protected: InvokeInst *cloneImpl() const; public: - static constexpr int ArgOffset = 3; static InvokeInst *Create(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, const Twine &NameStr, @@ -4057,9 +3633,10 @@ public: BasicBlock *IfException, ArrayRef Args, const Twine &NameStr, Instruction *InsertBefore = nullptr) { - unsigned Values = unsigned(Args.size()) + 3; - return new (Values) InvokeInst(Ty, Func, IfNormal, IfException, Args, None, - Values, NameStr, InsertBefore); + int NumOperands = ComputeNumOperands(Args.size()); + return new (NumOperands) + InvokeInst(Ty, Func, IfNormal, IfException, Args, None, NumOperands, + NameStr, InsertBefore); } static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, @@ -4067,11 +3644,12 @@ public: ArrayRef Bundles = None, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) { - unsigned Values = unsigned(Args.size()) + CountBundleInputs(Bundles) + 3; + int NumOperands = + ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)); unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); - return new (Values, DescriptorBytes) - InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, Values, + return new (NumOperands, DescriptorBytes) + InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, NumOperands, NameStr, InsertBefore); } @@ -4079,21 +3657,22 @@ public: BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, const Twine &NameStr, BasicBlock *InsertAtEnd) { - unsigned Values = unsigned(Args.size()) + 3; - return new (Values) InvokeInst(Func, IfNormal, IfException, Args, None, - Values, NameStr, InsertAtEnd); + int NumOperands = ComputeNumOperands(Args.size()); + return new (NumOperands) InvokeInst(Func, IfNormal, IfException, Args, None, + NumOperands, NameStr, InsertAtEnd); } static InvokeInst *Create(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr, BasicBlock *InsertAtEnd) { - unsigned Values = unsigned(Args.size()) + CountBundleInputs(Bundles) + 3; + int NumOperands = + ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)); unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); - return new (Values, DescriptorBytes) - InvokeInst(Func, IfNormal, IfException, Args, Bundles, Values, NameStr, - InsertAtEnd); + return new (NumOperands, DescriptorBytes) + InvokeInst(Func, IfNormal, IfException, Args, Bundles, NumOperands, + NameStr, InsertAtEnd); } /// Create a clone of \p II with a different set of operand bundles and @@ -4114,43 +3693,18 @@ public: addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind); } - /// Return the function called, or null if this is an - /// indirect function invocation. - /// - Function *getCalledFunction() const { - return dyn_cast(Op<-3>()); - } - - /// Get a pointer to the function that is invoked by this - /// instruction - const Value *getCalledValue() const { return Op<-3>(); } - Value *getCalledValue() { return Op<-3>(); } - - /// Set the function called. - void setCalledFunction(Value* Fn) { - setCalledFunction( - cast(cast(Fn->getType())->getElementType()), - Fn); - } - void setCalledFunction(FunctionType *FTy, Value *Fn) { - this->FTy = FTy; - assert(FTy == cast( - cast(Fn->getType())->getElementType())); - Op<-3>() = Fn; - } - // get*Dest - Return the destination basic blocks... BasicBlock *getNormalDest() const { - return cast(Op<-2>()); + return cast(Op()); } BasicBlock *getUnwindDest() const { - return cast(Op<-1>()); + return cast(Op()); } void setNormalDest(BasicBlock *B) { - Op<-2>() = reinterpret_cast(B); + Op() = reinterpret_cast(B); } void setUnwindDest(BasicBlock *B) { - Op<-1>() = reinterpret_cast(B); + Op() = reinterpret_cast(B); } /// Get the landingpad instruction from the landing pad @@ -4162,9 +3716,12 @@ public: return i == 0 ? getNormalDest() : getUnwindDest(); } - void setSuccessor(unsigned idx, BasicBlock *NewSucc) { - assert(idx < 2 && "Successor # out of range for invoke!"); - *(&Op<-2>() + idx) = reinterpret_cast(NewSucc); + void setSuccessor(unsigned i, BasicBlock *NewSucc) { + assert(i < 2 && "Successor # out of range for invoke!"); + if (i == 0) + setNormalDest(NewSucc); + else + setUnwindDest(NewSucc); } unsigned getNumSuccessors() const { return 2; } @@ -4186,36 +3743,29 @@ private: } }; -template <> -struct OperandTraits> - : public VariadicOperandTraits, 3> {}; - InvokeInst::InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, - ArrayRef Bundles, unsigned Values, + ArrayRef Bundles, int NumOperands, const Twine &NameStr, Instruction *InsertBefore) - : CallBase(Ty->getReturnType(), Instruction::Invoke, - OperandTraits>::op_end(this) - - Values, - Values, InsertBefore) { + : CallBase(Ty->getReturnType(), Instruction::Invoke, + OperandTraits::op_end(this) - NumOperands, NumOperands, + InsertBefore) { init(Ty, Func, IfNormal, IfException, Args, Bundles, NameStr); } InvokeInst::InvokeInst(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, - ArrayRef Bundles, unsigned Values, + ArrayRef Bundles, int NumOperands, const Twine &NameStr, BasicBlock *InsertAtEnd) - : CallBase( - cast( - cast(Func->getType())->getElementType()) - ->getReturnType(), - Instruction::Invoke, - OperandTraits>::op_end(this) - Values, Values, - InsertAtEnd) { + : CallBase(cast( + cast(Func->getType())->getElementType()) + ->getReturnType(), + Instruction::Invoke, + OperandTraits::op_end(this) - NumOperands, NumOperands, + InsertAtEnd) { init(Func, IfNormal, IfException, Args, Bundles, NameStr); } - //===----------------------------------------------------------------------===// // ResumeInst Class //===----------------------------------------------------------------------===// diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 04de1ca63a26e877df3ad76b1966e71f9a9e9519..f503d3ebbdfebd496b1d5ca63253075e3b383849 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -90,6 +90,10 @@ class ReadNone : IntrinsicProperty { def IntrNoReturn : IntrinsicProperty; +// IntrCold - Calls to this intrinsic are cold. +// Parallels the cold attribute on LLVM IR functions. +def IntrCold : IntrinsicProperty; + // IntrNoduplicate - Calls to this intrinsic cannot be duplicated. // Parallels the noduplicate attribute on LLVM IR functions. def IntrNoDuplicate : IntrinsicProperty; @@ -315,6 +319,78 @@ def int_gcwrite : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_ptrptr_ty], [IntrArgMemOnly, NoCapture<1>, NoCapture<2>]>; +//===------------------- ObjC ARC runtime Intrinsics --------------------===// +// +// Note these are to support the Objective-C ARC optimizer which wants to +// eliminate retain and releases where possible. + +def int_objc_autorelease : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_autoreleasePoolPop : Intrinsic<[], [llvm_ptr_ty]>; +def int_objc_autoreleasePoolPush : Intrinsic<[llvm_ptr_ty], []>; +def int_objc_autoreleaseReturnValue : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_copyWeak : Intrinsic<[], + [llvm_ptrptr_ty, + llvm_ptrptr_ty]>; +def int_objc_destroyWeak : Intrinsic<[], [llvm_ptrptr_ty]>; +def int_objc_initWeak : Intrinsic<[llvm_ptr_ty], + [llvm_ptrptr_ty, + llvm_ptr_ty]>; +def int_objc_loadWeak : Intrinsic<[llvm_ptr_ty], + [llvm_ptrptr_ty]>; +def int_objc_loadWeakRetained : Intrinsic<[llvm_ptr_ty], + [llvm_ptrptr_ty]>; +def int_objc_moveWeak : Intrinsic<[], + [llvm_ptrptr_ty, + llvm_ptrptr_ty]>; +def int_objc_release : Intrinsic<[], [llvm_ptr_ty]>; +def int_objc_retain : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_retainAutorelease : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_retainAutoreleaseReturnValue : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_retainAutoreleasedReturnValue : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_retainBlock : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_storeStrong : Intrinsic<[], + [llvm_ptrptr_ty, + llvm_ptr_ty]>; +def int_objc_storeWeak : Intrinsic<[llvm_ptr_ty], + [llvm_ptrptr_ty, + llvm_ptr_ty]>; +def int_objc_clang_arc_use : Intrinsic<[], + [llvm_vararg_ty]>; +def int_objc_unsafeClaimAutoreleasedReturnValue : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_retainedObject : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_unretainedObject : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_unretainedPointer : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_retain_autorelease : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty]>; +def int_objc_sync_enter : Intrinsic<[llvm_i32_ty], + [llvm_ptr_ty]>; +def int_objc_sync_exit : Intrinsic<[llvm_i32_ty], + [llvm_ptr_ty]>; +def int_objc_arc_annotation_topdown_bbstart : Intrinsic<[], + [llvm_ptrptr_ty, + llvm_ptrptr_ty]>; +def int_objc_arc_annotation_topdown_bbend : Intrinsic<[], + [llvm_ptrptr_ty, + llvm_ptrptr_ty]>; +def int_objc_arc_annotation_bottomup_bbstart : Intrinsic<[], + [llvm_ptrptr_ty, + llvm_ptrptr_ty]>; +def int_objc_arc_annotation_bottomup_bbend : Intrinsic<[], + [llvm_ptrptr_ty, + llvm_ptrptr_ty]>; + + //===--------------------- Code Generator Intrinsics ----------------------===// // def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>; @@ -735,7 +811,7 @@ def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; -//===------------------------- Fixed Point Intrinsics ---------------------===// +//===------------------------- Saturation Arithmetic Intrinsics ---------------------===// // def int_sadd_sat : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], @@ -750,6 +826,12 @@ def int_usub_sat : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; +//===------------------------- Fixed Point Arithmetic Intrinsics ---------------------===// +// +def int_smul_fix : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + //===------------------------- Memory Use Markers -------------------------===// // def int_lifetime_start : Intrinsic<[], @@ -867,7 +949,7 @@ def int_coro_subfn_addr : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i8_ty], // def int_flt_rounds : Intrinsic<[llvm_i32_ty]>, GCCBuiltin<"__builtin_flt_rounds">; -def int_trap : Intrinsic<[], [], [IntrNoReturn]>, +def int_trap : Intrinsic<[], [], [IntrNoReturn, IntrCold]>, GCCBuiltin<"__builtin_trap">; def int_debugtrap : Intrinsic<[]>, GCCBuiltin<"__builtin_debugtrap">; @@ -880,6 +962,10 @@ def int_experimental_deoptimize : Intrinsic<[llvm_any_ty], [llvm_vararg_ty], def int_experimental_guard : Intrinsic<[], [llvm_i1_ty, llvm_vararg_ty], [Throws]>; +// Supports widenable conditions for guards represented as explicit branches. +def int_experimental_widenable_condition : Intrinsic<[llvm_i1_ty], [], + [IntrInaccessibleMemOnly]>; + // NOP: calls/invokes to this intrinsic are removed by codegen def int_donothing : Intrinsic<[], [], [IntrNoMem]>; diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td index 67e7da7797a4b908e8c0143aa85e8d4d23474afa..3ea364c78457e70e507e8b0cc8860a12f779585c 100644 --- a/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/include/llvm/IR/IntrinsicsAMDGPU.td @@ -803,7 +803,7 @@ def int_amdgcn_buffer_load_format : AMDGPUBufferLoad; def int_amdgcn_buffer_load : AMDGPUBufferLoad; def int_amdgcn_s_buffer_load : Intrinsic < - [llvm_anyint_ty], + [llvm_any_ty], [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // byte offset(SGPR/VGPR/imm) llvm_i32_ty], // cachepolicy(imm; bit 0 = glc) @@ -835,7 +835,7 @@ class AMDGPURawBufferLoad : Intrinsic < [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrReadMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad; @@ -847,7 +847,7 @@ class AMDGPUStructBufferLoad : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrReadMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; @@ -859,7 +859,7 @@ class AMDGPURawBufferStore : Intrinsic < llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrWriteMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore; @@ -872,7 +872,7 @@ class AMDGPUStructBufferStore : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrWriteMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; @@ -884,7 +884,7 @@ class AMDGPURawBufferAtomic : Intrinsic < llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) [], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic; @@ -904,7 +904,7 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) [], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; @@ -915,7 +915,7 @@ class AMDGPUStructBufferAtomic : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) [], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic; @@ -936,7 +936,7 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) [], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; @@ -980,7 +980,7 @@ def int_amdgcn_raw_tbuffer_load : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrReadMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -991,7 +991,7 @@ def int_amdgcn_raw_tbuffer_store : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrWriteMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1002,7 +1002,7 @@ def int_amdgcn_struct_tbuffer_load : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrReadMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1014,7 +1014,7 @@ def int_amdgcn_struct_tbuffer_store : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrWriteMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; diff --git a/include/llvm/IR/IntrinsicsHexagon.td b/include/llvm/IR/IntrinsicsHexagon.td index 25f4215d68a848c073d38a9e85e4cba18287551d..ecc69a679553ffe99cb35047021c3a41e587547a 100644 --- a/include/llvm/IR/IntrinsicsHexagon.td +++ b/include/llvm/IR/IntrinsicsHexagon.td @@ -15,7 +15,7 @@ // // All Hexagon intrinsics start with "llvm.hexagon.". let TargetPrefix = "hexagon" in { - /// Hexagon_Intrinsic - Base class for all Hexagon intrinsics. + /// Hexagon_Intrinsic - Base class for the majority of Hexagon intrinsics. class Hexagon_Intrinsic ret_types, list param_types, list properties> @@ -30,397 +30,6 @@ let TargetPrefix = "hexagon" in { : Intrinsic; } -//===----------------------------------------------------------------------===// -// -// DEF_FUNCTION_TYPE_1(QI_ftype_MEM,BT_BOOL,BT_PTR) -> -// Hexagon_qi_mem_Intrinsic -// -class Hexagon_qi_mem_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_1(HI_ftype_SI,BT_I16,BT_INT) -> -// Hexagon_hi_si_Intrinsic -// -class Hexagon_hi_si_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_1(SI_ftype_SI,BT_INT,BT_INT) -> -// Hexagon_si_si_Intrinsic -// -class Hexagon_si_si_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_1(DI_ftype_SI,BT_LONGLONG,BT_INT) -> -// Hexagon_di_si_Intrinsic -// -class Hexagon_di_si_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_1(SI_ftype_DI,BT_INT,BT_LONGLONG) -> -// Hexagon_si_di_Intrinsic -// -class Hexagon_si_di_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_1(DI_ftype_DI,BT_LONGLONG,BT_LONGLONG) -> -// Hexagon_di_di_Intrinsic -// -class Hexagon_di_di_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_1(QI_ftype_QI,BT_BOOL,BT_BOOL) -> -// Hexagon_qi_qi_Intrinsic -// -class Hexagon_qi_qi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_1(QI_ftype_SI,BT_BOOL,BT_INT) -> -// Hexagon_qi_si_Intrinsic -// -class Hexagon_qi_si_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_1(DI_ftype_QI,BT_LONGLONG,BT_BOOL) -> -// Hexagon_di_qi_Intrinsic -// -class Hexagon_di_qi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_1(SI_ftype_QI,BT_INT,BT_BOOL) -> -// Hexagon_si_qi_Intrinsic -// -class Hexagon_si_qi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(QI_ftype_SISI,BT_BOOL,BT_INT,BT_INT) -> -// Hexagon_qi_sisi_Intrinsic -// -class Hexagon_qi_sisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(void_ftype_SISI,BT_VOID,BT_INT,BT_INT) -> -// Hexagon_void_sisi_Intrinsic -// -class Hexagon_void_sisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(SI_ftype_SISI,BT_INT,BT_INT,BT_INT) -> -// Hexagon_si_sisi_Intrinsic -// -class Hexagon_si_sisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(USI_ftype_SISI,BT_UINT,BT_INT,BT_INT) -> -// Hexagon_usi_sisi_Intrinsic -// -class Hexagon_usi_sisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(DI_ftype_SISI,BT_LONGLONG,BT_INT,BT_INT) -> -// Hexagon_di_sisi_Intrinsic -// -class Hexagon_di_sisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(UDI_ftype_SISI,BT_ULONGLONG,BT_INT,BT_INT) -> -// Hexagon_udi_sisi_Intrinsic -// -class Hexagon_udi_sisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(DI_ftype_SIDI,BT_LONGLONG,BT_INT,BT_LONGLONG) -> -// Hexagon_di_sidi_Intrinsic -// -class Hexagon_di_sidi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(DI_ftype_DISI,BT_LONGLONG,BT_LONGLONG,BT_INT) -> -// Hexagon_di_disi_Intrinsic -// -class Hexagon_di_disi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(SI_ftype_SIDI,BT_INT,BT_INT,BT_LONGLONG) -> -// Hexagon_si_sidi_Intrinsic -// -class Hexagon_si_sidi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(SI_ftype_DIDI,BT_INT,BT_LONGLONG,BT_LONGLONG) -> -// Hexagon_si_didi_Intrinsic -// -class Hexagon_si_didi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(DI_ftype_DIDI,BT_LONGLONG,BT_LONGLONG,BT_LONGLONG) -> -// Hexagon_di_didi_Intrinsic -// -class Hexagon_di_didi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(UDI_ftype_DIDI,BT_ULONGLONG,BT_LONGLONG,BT_LONGLONG) -> -// Hexagon_udi_didi_Intrinsic -// -class Hexagon_udi_didi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(SI_ftype_DISI,BT_INT,BT_LONGLONG,BT_INT) -> -// Hexagon_si_disi_Intrinsic -// -class Hexagon_si_disi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(QI_ftype_DIDI,BT_BOOL,BT_LONGLONG,BT_LONGLONG) -> -// Hexagon_qi_didi_Intrinsic -// -class Hexagon_qi_didi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(QI_ftype_SIDI,BT_BOOL,BT_INT,BT_LONGLONG) -> -// Hexagon_qi_didi_Intrinsic -// -class Hexagon_qi_sidi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(QI_ftype_DISI,BT_BOOL,BT_LONGLONG,BT_INT) -> -// Hexagon_qi_disi_Intrinsic -// -class Hexagon_qi_disi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(QI_ftype_QIQI,BT_BOOL,BT_BOOL,BT_BOOL) -> -// Hexagon_qi_qiqi_Intrinsic -// -class Hexagon_qi_qiqi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(QI_ftype_QIQIQI,BT_BOOL,BT_BOOL,BT_BOOL) -> -// Hexagon_qi_qiqiqi_Intrinsic -// -class Hexagon_qi_qiqiqi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(SI_ftype_QIQI,BT_INT,BT_BOOL,BT_BOOL) -> -// Hexagon_si_qiqi_Intrinsic -// -class Hexagon_si_qiqi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_2(SI_ftype_QISI,BT_INT,BT_BOOL,BT_INT) -> -// Hexagon_si_qisi_Intrinsic -// -class Hexagon_si_qisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(void_ftype_SISISI,BT_VOID,BT_INT,BT_INT,BT_INT) -> -// Hexagon_void_sisisi_Intrinsic -// -class Hexagon_void_sisisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(SI_ftype_SISISI,BT_INT,BT_INT,BT_INT,BT_INT) -> -// Hexagon_si_sisisi_Intrinsic -// -class Hexagon_si_sisisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(DI_ftype_SISISI,BT_LONGLONG,BT_INT,BT_INT,BT_INT) -> -// Hexagon_di_sisisi_Intrinsic -// -class Hexagon_di_sisisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(SI_ftype_DISISI,BT_INT,BT_LONGLONG,BT_INT,BT_INT) -> -// Hexagon_si_disisi_Intrinsic -// -class Hexagon_si_disisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(DI_ftype_DISISI,BT_LONGLONG,BT_LONGLONG,BT_INT,BT_INT) -> -// Hexagon_di_disisi_Intrinsic -// -class Hexagon_di_disisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(SI_ftype_SIDISI,BT_INT,BT_INT,BT_LONGLONG,BT_INT) -> -// Hexagon_si_sidisi_Intrinsic -// -class Hexagon_si_sidisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(DI_ftype_DIDISI,BT_LONGLONG,BT_LONGLONG, -// BT_LONGLONG,BT_INT) -> -// Hexagon_di_didisi_Intrinsic -// -class Hexagon_di_didisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(SI_ftype_SIDIDI,BT_INT,BT_INT,BT_LONGLONG,BT_LONGLONG) -> -// Hexagon_si_sididi_Intrinsic -// -class Hexagon_si_sididi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(DI_ftype_DIDIDI,BT_LONGLONG,BT_LONGLONG,BT_LONGLONG, -// BT_LONGLONG) -> -// Hexagon_di_dididi_Intrinsic -// -class Hexagon_di_dididi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(SI_ftype_SISIDI,BT_INT,BT_INT,BT_INT,BT_LONGLONG) -> -// Hexagon_si_sisidi_Intrinsic -// -class Hexagon_si_sisidi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(SI_ftype_QISISI,BT_INT,BT_BOOL,BT_INT,BT_INT) -> -// Hexagon_si_qisisi_Intrinsic -// -class Hexagon_si_qisisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(DI_ftype_QISISI,BT_LONGLONG,BT_BOOL,BT_INT,BT_INT) -> -// Hexagon_di_qisisi_Intrinsic -// -class Hexagon_di_qisisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(DI_ftype_QIDIDI,BT_LONGLONG,BT_BOOL,BT_LONGLONG, -// BT_LONGLONG) -> -// Hexagon_di_qididi_Intrinsic -// -class Hexagon_di_qididi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_3(DI_ftype_DIDIQI,BT_LONGLONG,BT_LONGLONG,BT_LONGLONG, -// BT_BOOL) -> -// Hexagon_di_didiqi_Intrinsic -// -class Hexagon_di_didiqi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_4(SI_ftype_SISISISI,BT_INT,BT_INT,BT_INT,BT_INT,BT_INT) -> -// Hexagon_si_sisisisi_Intrinsic -// -class Hexagon_si_sisisisi_Intrinsic - : Hexagon_Intrinsic; -// -// DEF_FUNCTION_TYPE_4(DI_ftype_DIDISISI,BT_LONGLONG,BT_LONGLONG, -// BT_LONGLONG,BT_INT,BT_INT) -> -// Hexagon_di_didisisi_Intrinsic -// -class Hexagon_di_didisisi_Intrinsic - : Hexagon_Intrinsic; - class Hexagon_mem_memmemsi_Intrinsic : Hexagon_Intrinsic llvm_i32_ty, llvm_i32_ty], [IntrWriteMem]>; -class Hexagon_v256_v256v256_Intrinsic - : Hexagon_Intrinsic; - // -// Hexagon_sf_df_Intrinsic +// BUILTIN_INFO_NONCONST(circ_ldd,PTR_ftype_PTRPTRSISI,4) // -class Hexagon_sf_si_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_circ_ldd : +Hexagon_mem_memmemsisi_Intrinsic<"circ_ldd">; // -// Hexagon_sf_df_Intrinsic +// BUILTIN_INFO_NONCONST(circ_ldw,PTR_ftype_PTRPTRSISI,4) // -class Hexagon_sf_df_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_circ_ldw : +Hexagon_mem_memmemsisi_Intrinsic<"circ_ldw">; // -// Hexagon_sf_di_Intrinsic +// BUILTIN_INFO_NONCONST(circ_ldh,PTR_ftype_PTRPTRSISI,4) // -class Hexagon_sf_di_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_circ_ldh : +Hexagon_mem_memmemsisi_Intrinsic<"circ_ldh">; // -// Hexagon_df_sf_Intrinsic +// BUILTIN_INFO_NONCONST(circ_lduh,PTR_ftype_PTRPTRSISI,4) // -class Hexagon_df_sf_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_circ_lduh : +Hexagon_mem_memmemsisi_Intrinsic<"circ_lduh">; // -// Hexagon_di_sf_Intrinsic +// BUILTIN_INFO_NONCONST(circ_ldb,PTR_ftype_PTRPTRSISI,4) // -class Hexagon_di_sf_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_circ_ldb : +Hexagon_mem_memmemsisi_Intrinsic<"circ_ldb">; // -// Hexagon_sf_sf_Intrinsic +// BUILTIN_INFO_NONCONST(circ_ldub,PTR_ftype_PTRPTRSISI,4) // -class Hexagon_sf_sf_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_circ_ldub : +Hexagon_mem_memmemsisi_Intrinsic<"circ_ldub">; + // -// Hexagon_si_sf_Intrinsic +// BUILTIN_INFO_NONCONST(circ_std,PTR_ftype_PTRDISISI,4) // -class Hexagon_si_sf_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_circ_std : +Hexagon_mem_memdisisi_Intrinsic<"circ_std">; // -// Hexagon_si_df_Intrinsic +// BUILTIN_INFO_NONCONST(circ_stw,PTR_ftype_PTRSISISI,4) // -class Hexagon_si_df_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_circ_stw : +Hexagon_mem_memsisisi_Intrinsic<"circ_stw">; // -// Hexagon_sf_sfsf_Intrinsic +// BUILTIN_INFO_NONCONST(circ_sth,PTR_ftype_PTRSISISI,4) // -class Hexagon_sf_sfsf_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_circ_sth : +Hexagon_mem_memsisisi_Intrinsic<"circ_sth">; // -// Hexagon_si_sfsf_Intrinsic +// BUILTIN_INFO_NONCONST(circ_sthhi,PTR_ftype_PTRSISISI,4) // -class Hexagon_si_sfsf_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_circ_sthhi : +Hexagon_mem_memsisisi_Intrinsic<"circ_sthhi">; // -// Hexagon_si_sfsi_Intrinsic +// BUILTIN_INFO_NONCONST(circ_stb,PTR_ftype_PTRSISISI,4) // -class Hexagon_si_sfsi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_circ_stb : +Hexagon_mem_memsisisi_Intrinsic<"circ_stb">; + // -// Hexagon_qi_sfqi_Intrinsic +// BUILTIN_INFO(HEXAGON.dcfetch_A,v_ftype_DI*,1) // -class Hexagon_qi_sfqi_Intrinsic - : Hexagon_Intrinsic; -// -// Hexagon_sf_sfsfsf_Intrinsic -// -class Hexagon_sf_sfsfsf_Intrinsic - : Hexagon_Intrinsic; -// -// Hexagon_sf_sfsfsfqi_Intrinsic -// -class Hexagon_sf_sfsfsfqi_Intrinsic - : Hexagon_Intrinsic; -// -// Hexagon_di_dididi_Intrinsic -// -class Hexagon_di_dididisi_Intrinsic - : Hexagon_Intrinsic; -// -// Hexagon_df_si_Intrinsic -// -class Hexagon_df_si_Intrinsic - : Hexagon_Intrinsic; -// -// Hexagon_df_di_Intrinsic -// -class Hexagon_df_di_Intrinsic - : Hexagon_Intrinsic; -// -// Hexagon_di_df_Intrinsic -// -class Hexagon_di_df_Intrinsic - : Hexagon_Intrinsic; -// -// Hexagon_df_df_Intrinsic -// -class Hexagon_df_df_Intrinsic - : Hexagon_Intrinsic; -// -// Hexagon_df_dfdf_Intrinsic -// -class Hexagon_df_dfdf_Intrinsic - : Hexagon_Intrinsic; -// -// Hexagon_si_dfdf_Intrinsic -// -class Hexagon_si_dfdf_Intrinsic - : Hexagon_Intrinsic; -// -// Hexagon_si_dfsi_Intrinsic -// -class Hexagon_si_dfsi_Intrinsic - : Hexagon_Intrinsic; -// -// -// Hexagon_df_dfdfdf_Intrinsic -// -class Hexagon_df_dfdfdf_Intrinsic - : Hexagon_Intrinsic; -// -// Hexagon_df_dfdfdf_Intrinsic -// -class Hexagon_df_dfdfdfqi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_prefetch : +Hexagon_Intrinsic<"HEXAGON_prefetch", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dccleana : +Hexagon_Intrinsic<"HEXAGON_Y2_dccleana", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dccleaninva : +Hexagon_Intrinsic<"HEXAGON_Y2_dccleaninva", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dcinva : +Hexagon_Intrinsic<"HEXAGON_Y2_dcinva", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dczeroa : +Hexagon_Intrinsic<"HEXAGON_Y2_dczeroa", [], [llvm_ptr_ty], + [IntrWriteMem, IntrArgMemOnly, IntrHasSideEffects]>; +def int_hexagon_Y4_l2fetch : +Hexagon_Intrinsic<"HEXAGON_Y4_l2fetch", [], [llvm_ptr_ty, llvm_i32_ty], []>; +def int_hexagon_Y5_l2fetch : +Hexagon_Intrinsic<"HEXAGON_Y5_l2fetch", [], [llvm_ptr_ty, llvm_i64_ty], []>; +def llvm_ptr32_ty : LLVMPointerType; +def llvm_ptr64_ty : LLVMPointerType; -// This one below will not be auto-generated, -// so make sure, you don't overwrite this one. -// -// BUILTIN_INFO_NONCONST(circ_ldd,PTR_ftype_PTRPTRSISI,4) -// -def int_hexagon_circ_ldd : -Hexagon_mem_memmemsisi_Intrinsic<"circ_ldd">; -// -// BUILTIN_INFO_NONCONST(circ_ldw,PTR_ftype_PTRPTRSISI,4) -// -def int_hexagon_circ_ldw : -Hexagon_mem_memmemsisi_Intrinsic<"circ_ldw">; -// -// BUILTIN_INFO_NONCONST(circ_ldh,PTR_ftype_PTRPTRSISI,4) -// -def int_hexagon_circ_ldh : -Hexagon_mem_memmemsisi_Intrinsic<"circ_ldh">; -// -// BUILTIN_INFO_NONCONST(circ_lduh,PTR_ftype_PTRPTRSISI,4) -// -def int_hexagon_circ_lduh : -Hexagon_mem_memmemsisi_Intrinsic<"circ_lduh">; -// -// BUILTIN_INFO_NONCONST(circ_ldb,PTR_ftype_PTRPTRSISI,4) -// -def int_hexagon_circ_ldb : -Hexagon_mem_memmemsisi_Intrinsic<"circ_ldb">; -// -// BUILTIN_INFO_NONCONST(circ_ldub,PTR_ftype_PTRPTRSISI,4) -// -def int_hexagon_circ_ldub : -Hexagon_mem_memmemsisi_Intrinsic<"circ_ldub">; +// Mark locked loads as read/write to prevent any accidental reordering. +def int_hexagon_L2_loadw_locked : +Hexagon_Intrinsic<"HEXAGON_L2_loadw_locked", [llvm_i32_ty], [llvm_ptr32_ty], + [IntrArgMemOnly, NoCapture<0>]>; +def int_hexagon_L4_loadd_locked : +Hexagon_Intrinsic<"HEXAGON_L4_loadd_locked", [llvm_i64_ty], [llvm_ptr64_ty], + [IntrArgMemOnly, NoCapture<0>]>; -// -// BUILTIN_INFO_NONCONST(circ_std,PTR_ftype_PTRDISISI,4) -// -def int_hexagon_circ_std : -Hexagon_mem_memdisisi_Intrinsic<"circ_std">; -// -// BUILTIN_INFO_NONCONST(circ_stw,PTR_ftype_PTRSISISI,4) -// -def int_hexagon_circ_stw : -Hexagon_mem_memsisisi_Intrinsic<"circ_stw">; -// -// BUILTIN_INFO_NONCONST(circ_sth,PTR_ftype_PTRSISISI,4) -// -def int_hexagon_circ_sth : -Hexagon_mem_memsisisi_Intrinsic<"circ_sth">; -// -// BUILTIN_INFO_NONCONST(circ_sthhi,PTR_ftype_PTRSISISI,4) -// -def int_hexagon_circ_sthhi : -Hexagon_mem_memsisisi_Intrinsic<"circ_sthhi">; -// -// BUILTIN_INFO_NONCONST(circ_stb,PTR_ftype_PTRSISISI,4) -// -def int_hexagon_circ_stb : -Hexagon_mem_memsisisi_Intrinsic<"circ_stb">; +def int_hexagon_S2_storew_locked : +Hexagon_Intrinsic<"HEXAGON_S2_storew_locked", [llvm_i32_ty], + [llvm_ptr32_ty, llvm_i32_ty], [IntrArgMemOnly, NoCapture<0>]>; +def int_hexagon_S4_stored_locked : +Hexagon_Intrinsic<"HEXAGON_S4_stored_locked", [llvm_i32_ty], + [llvm_ptr64_ty, llvm_i64_ty], [IntrArgMemOnly, NoCapture<0>]>; +def int_hexagon_vmemcpy : Hexagon_Intrinsic<"hexagon_vmemcpy", + [], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty], + [IntrArgMemOnly, NoCapture<0>, NoCapture<1>, WriteOnly<0>, ReadOnly<1>]>; -def int_hexagon_mm256i_vaddw : -Hexagon_v256_v256v256_Intrinsic<"_mm256i_vaddw">; +def int_hexagon_vmemset : Hexagon_Intrinsic<"hexagon_vmemset", + [], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], + [IntrArgMemOnly, NoCapture<0>, WriteOnly<0>]>; +multiclass Hexagon_custom_circ_ld_Intrinsic { + def NAME#_pci : Hexagon_NonGCC_Intrinsic< + [ElTy, llvm_ptr_ty], + [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], + [IntrArgMemOnly, NoCapture<3>]>; + def NAME#_pcr : Hexagon_NonGCC_Intrinsic< + [ElTy, llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_ptr_ty], + [IntrArgMemOnly, NoCapture<2>]>; +} + +defm int_hexagon_L2_loadrub : Hexagon_custom_circ_ld_Intrinsic; +defm int_hexagon_L2_loadrb : Hexagon_custom_circ_ld_Intrinsic; +defm int_hexagon_L2_loadruh : Hexagon_custom_circ_ld_Intrinsic; +defm int_hexagon_L2_loadrh : Hexagon_custom_circ_ld_Intrinsic; +defm int_hexagon_L2_loadri : Hexagon_custom_circ_ld_Intrinsic; +defm int_hexagon_L2_loadrd : Hexagon_custom_circ_ld_Intrinsic; + +multiclass Hexagon_custom_circ_st_Intrinsic { + def NAME#_pci : Hexagon_NonGCC_Intrinsic< + [llvm_ptr_ty], + [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ElTy, llvm_ptr_ty], + [IntrArgMemOnly, NoCapture<4>]>; + def NAME#_pcr : Hexagon_NonGCC_Intrinsic< + [llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, ElTy, llvm_ptr_ty], + [IntrArgMemOnly, NoCapture<3>]>; +} + +defm int_hexagon_S2_storerb : Hexagon_custom_circ_st_Intrinsic; +defm int_hexagon_S2_storerh : Hexagon_custom_circ_st_Intrinsic; +defm int_hexagon_S2_storerf : Hexagon_custom_circ_st_Intrinsic; +defm int_hexagon_S2_storeri : Hexagon_custom_circ_st_Intrinsic; +defm int_hexagon_S2_storerd : Hexagon_custom_circ_st_Intrinsic; + +// The front-end emits the intrinsic call with only two arguments. The third +// argument from the builtin is already used by front-end to write to memory +// by generating a store. +class Hexagon_custom_brev_ld_Intrinsic + : Hexagon_NonGCC_Intrinsic< + [ElTy, llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty], + [IntrReadMem]>; + +def int_hexagon_L2_loadrub_pbr : Hexagon_custom_brev_ld_Intrinsic; +def int_hexagon_L2_loadrb_pbr : Hexagon_custom_brev_ld_Intrinsic; +def int_hexagon_L2_loadruh_pbr : Hexagon_custom_brev_ld_Intrinsic; +def int_hexagon_L2_loadrh_pbr : Hexagon_custom_brev_ld_Intrinsic; +def int_hexagon_L2_loadri_pbr : Hexagon_custom_brev_ld_Intrinsic; +def int_hexagon_L2_loadrd_pbr : Hexagon_custom_brev_ld_Intrinsic; + +def int_hexagon_S2_storerb_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_stb">; +def int_hexagon_S2_storerh_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_sth">; +def int_hexagon_S2_storerf_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_sthhi">; +def int_hexagon_S2_storeri_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_stw">; +def int_hexagon_S2_storerd_pbr : Hexagon_mem_memdisi_Intrinsic<"brev_std">; -// This one above will not be auto-generated, -// so make sure, you don't overwrite this one. -// -// BUILTIN_INFO(HEXAGON.C2_cmpeq,QI_ftype_SISI,2) -// -def int_hexagon_C2_cmpeq : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpeq">; -// -// BUILTIN_INFO(HEXAGON.C2_cmpgt,QI_ftype_SISI,2) -// -def int_hexagon_C2_cmpgt : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgt">; -// -// BUILTIN_INFO(HEXAGON.C2_cmpgtu,QI_ftype_SISI,2) -// -def int_hexagon_C2_cmpgtu : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgtu">; -// -// BUILTIN_INFO(HEXAGON.C2_cmpeqp,QI_ftype_DIDI,2) -// -def int_hexagon_C2_cmpeqp : -Hexagon_si_didi_Intrinsic<"HEXAGON_C2_cmpeqp">; -// -// BUILTIN_INFO(HEXAGON.C2_cmpgtp,QI_ftype_DIDI,2) -// -def int_hexagon_C2_cmpgtp : -Hexagon_si_didi_Intrinsic<"HEXAGON_C2_cmpgtp">; -// -// BUILTIN_INFO(HEXAGON.C2_cmpgtup,QI_ftype_DIDI,2) -// -def int_hexagon_C2_cmpgtup : -Hexagon_si_didi_Intrinsic<"HEXAGON_C2_cmpgtup">; -// -// BUILTIN_INFO(HEXAGON.A4_rcmpeqi,SI_ftype_SISI,2) -// -def int_hexagon_A4_rcmpeqi : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpeqi">; -// -// BUILTIN_INFO(HEXAGON.A4_rcmpneqi,SI_ftype_SISI,2) -// -def int_hexagon_A4_rcmpneqi : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpneqi">; -// -// BUILTIN_INFO(HEXAGON.A4_rcmpeq,SI_ftype_SISI,2) -// -def int_hexagon_A4_rcmpeq : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpeq">; -// -// BUILTIN_INFO(HEXAGON.A4_rcmpneq,SI_ftype_SISI,2) -// -def int_hexagon_A4_rcmpneq : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_rcmpneq">; -// -// BUILTIN_INFO(HEXAGON.C2_bitsset,QI_ftype_SISI,2) -// -def int_hexagon_C2_bitsset : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_bitsset">; -// -// BUILTIN_INFO(HEXAGON.C2_bitsclr,QI_ftype_SISI,2) -// -def int_hexagon_C2_bitsclr : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_bitsclr">; -// -// BUILTIN_INFO(HEXAGON.C4_nbitsset,QI_ftype_SISI,2) -// -def int_hexagon_C4_nbitsset : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_nbitsset">; -// -// BUILTIN_INFO(HEXAGON.C4_nbitsclr,QI_ftype_SISI,2) -// -def int_hexagon_C4_nbitsclr : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_nbitsclr">; -// -// BUILTIN_INFO(HEXAGON.C2_cmpeqi,QI_ftype_SISI,2) -// -def int_hexagon_C2_cmpeqi : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpeqi">; -// -// BUILTIN_INFO(HEXAGON.C2_cmpgti,QI_ftype_SISI,2) -// -def int_hexagon_C2_cmpgti : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgti">; -// -// BUILTIN_INFO(HEXAGON.C2_cmpgtui,QI_ftype_SISI,2) -// -def int_hexagon_C2_cmpgtui : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgtui">; -// -// BUILTIN_INFO(HEXAGON.C2_cmpgei,QI_ftype_SISI,2) -// -def int_hexagon_C2_cmpgei : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgei">; -// -// BUILTIN_INFO(HEXAGON.C2_cmpgeui,QI_ftype_SISI,2) -// -def int_hexagon_C2_cmpgeui : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpgeui">; -// -// BUILTIN_INFO(HEXAGON.C2_cmplt,QI_ftype_SISI,2) -// -def int_hexagon_C2_cmplt : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmplt">; -// -// BUILTIN_INFO(HEXAGON.C2_cmpltu,QI_ftype_SISI,2) -// -def int_hexagon_C2_cmpltu : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_cmpltu">; -// -// BUILTIN_INFO(HEXAGON.C2_bitsclri,QI_ftype_SISI,2) // -def int_hexagon_C2_bitsclri : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_bitsclri">; +// Masked vector stores // -// BUILTIN_INFO(HEXAGON.C4_nbitsclri,QI_ftype_SISI,2) + // -def int_hexagon_C4_nbitsclri : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_nbitsclri">; +// Hexagon_vv64ivmemv512_Intrinsic +// tag: V6_vS32b_qpred_ai +class Hexagon_vv64ivmemv512_Intrinsic + : Hexagon_Intrinsic; + // -// BUILTIN_INFO(HEXAGON.C4_cmpneqi,QI_ftype_SISI,2) +// Hexagon_vv128ivmemv1024_Intrinsic +// tag: V6_vS32b_qpred_ai_128B +class Hexagon_vv128ivmemv1024_Intrinsic + : Hexagon_Intrinsic; + +def int_hexagon_V6_vS32b_qpred_ai : +Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_qpred_ai">; + +def int_hexagon_V6_vS32b_nqpred_ai : +Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nqpred_ai">; + +def int_hexagon_V6_vS32b_nt_qpred_ai : +Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nt_qpred_ai">; + +def int_hexagon_V6_vS32b_nt_nqpred_ai : +Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nt_nqpred_ai">; + +def int_hexagon_V6_vS32b_qpred_ai_128B : +Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_qpred_ai_128B">; + +def int_hexagon_V6_vS32b_nqpred_ai_128B : +Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nqpred_ai_128B">; + +def int_hexagon_V6_vS32b_nt_qpred_ai_128B : +Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nt_qpred_ai_128B">; + +def int_hexagon_V6_vS32b_nt_nqpred_ai_128B : +Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nt_nqpred_ai_128B">; + +def int_hexagon_V6_vmaskedstoreq : +Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstoreq">; + +def int_hexagon_V6_vmaskedstorenq : +Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorenq">; + +def int_hexagon_V6_vmaskedstorentq : +Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorentq">; + +def int_hexagon_V6_vmaskedstorentnq : +Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorentnq">; + +def int_hexagon_V6_vmaskedstoreq_128B : +Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstoreq_128B">; + +def int_hexagon_V6_vmaskedstorenq_128B : +Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorenq_128B">; + +def int_hexagon_V6_vmaskedstorentq_128B : +Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentq_128B">; + +def int_hexagon_V6_vmaskedstorentnq_128B : +Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentnq_128B">; + +class Hexagon_V65_vvmemiiv512_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_vvmemiiv1024_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_vvmemiiv2048_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_vvmemv64iiiv512_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_vvmemv128iiiv1024_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_vvmemv64iiiv1024_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_vvmemv128iiiv2048_Intrinsic + : Hexagon_Intrinsic; + +def int_hexagon_V6_vgathermw : +Hexagon_V65_vvmemiiv512_Intrinsic<"HEXAGON_V6_vgathermw">; + +def int_hexagon_V6_vgathermw_128B : +Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermw_128B">; + +def int_hexagon_V6_vgathermh : +Hexagon_V65_vvmemiiv512_Intrinsic<"HEXAGON_V6_vgathermh">; + +def int_hexagon_V6_vgathermh_128B : +Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermh_128B">; + +def int_hexagon_V6_vgathermhw : +Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermhw">; + +def int_hexagon_V6_vgathermhw_128B : +Hexagon_V65_vvmemiiv2048_Intrinsic<"HEXAGON_V6_vgathermhw_128B">; + +def int_hexagon_V6_vgathermwq : +Hexagon_V65_vvmemv64iiiv512_Intrinsic<"HEXAGON_V6_vgathermwq">; + +def int_hexagon_V6_vgathermwq_128B : +Hexagon_V65_vvmemv128iiiv1024_Intrinsic<"HEXAGON_V6_vgathermwq_128B">; + +def int_hexagon_V6_vgathermhq : +Hexagon_V65_vvmemv64iiiv512_Intrinsic<"HEXAGON_V6_vgathermhq">; + +def int_hexagon_V6_vgathermhq_128B : +Hexagon_V65_vvmemv128iiiv1024_Intrinsic<"HEXAGON_V6_vgathermhq_128B">; + +def int_hexagon_V6_vgathermhwq : +Hexagon_V65_vvmemv64iiiv1024_Intrinsic<"HEXAGON_V6_vgathermhwq">; + +def int_hexagon_V6_vgathermhwq_128B : +Hexagon_V65_vvmemv128iiiv2048_Intrinsic<"HEXAGON_V6_vgathermhwq_128B">; + +class Hexagon_V65_viiv512v512_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_viiv1024v1024_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_vv64iiiv512v512_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_vv128iiiv1024v1024_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_viiv1024v512_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_viiv2048v1024_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_vv64iiiv1024v512_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_vv128iiiv2048v1024_Intrinsic + : Hexagon_Intrinsic; + +class Hexagon_V65_v2048_Intrinsic + : Hexagon_Intrinsic; + // -def int_hexagon_C4_cmpneqi : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmpneqi">; +// BUILTIN_INFO(HEXAGON.V6_vscattermw,v_ftype_SISIVIVI,4) +// tag : V6_vscattermw +def int_hexagon_V6_vscattermw : +Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermw">; + // -// BUILTIN_INFO(HEXAGON.C4_cmpltei,QI_ftype_SISI,2) +// BUILTIN_INFO(HEXAGON.V6_vscattermw_128B,v_ftype_SISIVIVI,4) +// tag : V6_vscattermw_128B +def int_hexagon_V6_vscattermw_128B : +Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermw_128B">; + // -def int_hexagon_C4_cmpltei : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmpltei">; +// BUILTIN_INFO(HEXAGON.V6_vscattermh,v_ftype_SISIVIVI,4) +// tag : V6_vscattermh +def int_hexagon_V6_vscattermh : +Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermh">; + // -// BUILTIN_INFO(HEXAGON.C4_cmplteui,QI_ftype_SISI,2) +// BUILTIN_INFO(HEXAGON.V6_vscattermh_128B,v_ftype_SISIVIVI,4) +// tag : V6_vscattermh_128B +def int_hexagon_V6_vscattermh_128B : +Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermh_128B">; + // -def int_hexagon_C4_cmplteui : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmplteui">; +// BUILTIN_INFO(HEXAGON.V6_vscattermw_add,v_ftype_SISIVIVI,4) +// tag : V6_vscattermw_add +def int_hexagon_V6_vscattermw_add : +Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermw_add">; + // -// BUILTIN_INFO(HEXAGON.C4_cmpneq,QI_ftype_SISI,2) +// BUILTIN_INFO(HEXAGON.V6_vscattermw_add_128B,v_ftype_SISIVIVI,4) +// tag : V6_vscattermw_add_128B +def int_hexagon_V6_vscattermw_add_128B : +Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermw_add_128B">; + // -def int_hexagon_C4_cmpneq : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmpneq">; +// BUILTIN_INFO(HEXAGON.V6_vscattermh_add,v_ftype_SISIVIVI,4) +// tag : V6_vscattermh_add +def int_hexagon_V6_vscattermh_add : +Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermh_add">; + // -// BUILTIN_INFO(HEXAGON.C4_cmplte,QI_ftype_SISI,2) +// BUILTIN_INFO(HEXAGON.V6_vscattermh_add_128B,v_ftype_SISIVIVI,4) +// tag : V6_vscattermh_add_128B +def int_hexagon_V6_vscattermh_add_128B : +Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermh_add_128B">; + // -def int_hexagon_C4_cmplte : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmplte">; +// BUILTIN_INFO(HEXAGON.V6_vscattermwq,v_ftype_QVSISIVIVI,5) +// tag : V6_vscattermwq +def int_hexagon_V6_vscattermwq : +Hexagon_V65_vv64iiiv512v512_Intrinsic<"HEXAGON_V6_vscattermwq">; + // -// BUILTIN_INFO(HEXAGON.C4_cmplteu,QI_ftype_SISI,2) +// BUILTIN_INFO(HEXAGON.V6_vscattermwq_128B,v_ftype_QVSISIVIVI,5) +// tag : V6_vscattermwq_128B +def int_hexagon_V6_vscattermwq_128B : +Hexagon_V65_vv128iiiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermwq_128B">; + // -def int_hexagon_C4_cmplteu : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C4_cmplteu">; +// BUILTIN_INFO(HEXAGON.V6_vscattermhq,v_ftype_QVSISIVIVI,5) +// tag : V6_vscattermhq +def int_hexagon_V6_vscattermhq : +Hexagon_V65_vv64iiiv512v512_Intrinsic<"HEXAGON_V6_vscattermhq">; + // -// BUILTIN_INFO(HEXAGON.C2_and,QI_ftype_QIQI,2) +// BUILTIN_INFO(HEXAGON.V6_vscattermhq_128B,v_ftype_QVSISIVIVI,5) +// tag : V6_vscattermhq_128B +def int_hexagon_V6_vscattermhq_128B : +Hexagon_V65_vv128iiiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermhq_128B">; + // -def int_hexagon_C2_and : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_and">; -// -// BUILTIN_INFO(HEXAGON.C2_or,QI_ftype_QIQI,2) -// -def int_hexagon_C2_or : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_or">; -// -// BUILTIN_INFO(HEXAGON.C2_xor,QI_ftype_QIQI,2) -// -def int_hexagon_C2_xor : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_xor">; -// -// BUILTIN_INFO(HEXAGON.C2_andn,QI_ftype_QIQI,2) -// -def int_hexagon_C2_andn : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_andn">; -// -// BUILTIN_INFO(HEXAGON.C2_not,QI_ftype_QI,1) -// -def int_hexagon_C2_not : -Hexagon_si_si_Intrinsic<"HEXAGON_C2_not">; -// -// BUILTIN_INFO(HEXAGON.C2_orn,QI_ftype_QIQI,2) -// -def int_hexagon_C2_orn : -Hexagon_si_sisi_Intrinsic<"HEXAGON_C2_orn">; -// -// BUILTIN_INFO(HEXAGON.C4_and_and,QI_ftype_QIQIQI,3) -// -def int_hexagon_C4_and_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_and">; -// -// BUILTIN_INFO(HEXAGON.C4_and_or,QI_ftype_QIQIQI,3) -// -def int_hexagon_C4_and_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_or">; -// -// BUILTIN_INFO(HEXAGON.C4_or_and,QI_ftype_QIQIQI,3) -// -def int_hexagon_C4_or_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_and">; -// -// BUILTIN_INFO(HEXAGON.C4_or_or,QI_ftype_QIQIQI,3) -// -def int_hexagon_C4_or_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_or">; -// -// BUILTIN_INFO(HEXAGON.C4_and_andn,QI_ftype_QIQIQI,3) -// -def int_hexagon_C4_and_andn : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_andn">; -// -// BUILTIN_INFO(HEXAGON.C4_and_orn,QI_ftype_QIQIQI,3) -// -def int_hexagon_C4_and_orn : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_and_orn">; -// -// BUILTIN_INFO(HEXAGON.C4_or_andn,QI_ftype_QIQIQI,3) -// -def int_hexagon_C4_or_andn : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_andn">; -// -// BUILTIN_INFO(HEXAGON.C4_or_orn,QI_ftype_QIQIQI,3) -// -def int_hexagon_C4_or_orn : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_C4_or_orn">; -// -// BUILTIN_INFO(HEXAGON.C2_pxfer_map,QI_ftype_QI,1) -// -def int_hexagon_C2_pxfer_map : -Hexagon_si_qi_Intrinsic<"HEXAGON_C2_pxfer_map">; -// -// BUILTIN_INFO(HEXAGON.C2_any8,QI_ftype_QI,1) -// -def int_hexagon_C2_any8 : -Hexagon_si_qi_Intrinsic<"HEXAGON_C2_any8">; -// -// BUILTIN_INFO(HEXAGON.C2_all8,QI_ftype_QI,1) -// -def int_hexagon_C2_all8 : -Hexagon_si_qi_Intrinsic<"HEXAGON_C2_all8">; -// -// BUILTIN_INFO(HEXAGON.C2_vitpack,SI_ftype_QIQI,2) -// -def int_hexagon_C2_vitpack : -Hexagon_si_qiqi_Intrinsic<"HEXAGON_C2_vitpack">; -// -// BUILTIN_INFO(HEXAGON.C2_mux,SI_ftype_QISISI,3) -// -def int_hexagon_C2_mux : -Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_mux">; -// -// BUILTIN_INFO(HEXAGON.C2_muxii,SI_ftype_QISISI,3) -// -def int_hexagon_C2_muxii : -Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_muxii">; -// -// BUILTIN_INFO(HEXAGON.C2_muxir,SI_ftype_QISISI,3) -// -def int_hexagon_C2_muxir : -Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_muxir">; -// -// BUILTIN_INFO(HEXAGON.C2_muxri,SI_ftype_QISISI,3) -// -def int_hexagon_C2_muxri : -Hexagon_si_qisisi_Intrinsic<"HEXAGON_C2_muxri">; -// -// BUILTIN_INFO(HEXAGON.C2_vmux,DI_ftype_QIDIDI,3) -// -def int_hexagon_C2_vmux : -Hexagon_di_qididi_Intrinsic<"HEXAGON_C2_vmux">; -// -// BUILTIN_INFO(HEXAGON.C2_mask,DI_ftype_QI,1) -// -def int_hexagon_C2_mask : -Hexagon_di_qi_Intrinsic<"HEXAGON_C2_mask">; -// -// BUILTIN_INFO(HEXAGON.A2_vcmpbeq,QI_ftype_DIDI,2) -// -def int_hexagon_A2_vcmpbeq : -Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpbeq">; -// -// BUILTIN_INFO(HEXAGON.A4_vcmpbeqi,QI_ftype_DISI,2) -// -def int_hexagon_A4_vcmpbeqi : -Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpbeqi">; -// -// BUILTIN_INFO(HEXAGON.A4_vcmpbeq_any,QI_ftype_DIDI,2) -// -def int_hexagon_A4_vcmpbeq_any : -Hexagon_si_didi_Intrinsic<"HEXAGON_A4_vcmpbeq_any">; -// -// BUILTIN_INFO(HEXAGON.A2_vcmpbgtu,QI_ftype_DIDI,2) -// -def int_hexagon_A2_vcmpbgtu : -Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpbgtu">; -// -// BUILTIN_INFO(HEXAGON.A4_vcmpbgtui,QI_ftype_DISI,2) -// -def int_hexagon_A4_vcmpbgtui : -Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpbgtui">; -// -// BUILTIN_INFO(HEXAGON.A4_vcmpbgt,QI_ftype_DIDI,2) -// -def int_hexagon_A4_vcmpbgt : -Hexagon_si_didi_Intrinsic<"HEXAGON_A4_vcmpbgt">; -// -// BUILTIN_INFO(HEXAGON.A4_vcmpbgti,QI_ftype_DISI,2) -// -def int_hexagon_A4_vcmpbgti : -Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpbgti">; -// -// BUILTIN_INFO(HEXAGON.A4_cmpbeq,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmpbeq : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbeq">; -// -// BUILTIN_INFO(HEXAGON.A4_cmpbeqi,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmpbeqi : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbeqi">; -// -// BUILTIN_INFO(HEXAGON.A4_cmpbgtu,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmpbgtu : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgtu">; -// -// BUILTIN_INFO(HEXAGON.A4_cmpbgtui,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmpbgtui : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgtui">; -// -// BUILTIN_INFO(HEXAGON.A4_cmpbgt,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmpbgt : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgt">; -// -// BUILTIN_INFO(HEXAGON.A4_cmpbgti,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmpbgti : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpbgti">; -// -// BUILTIN_INFO(HEXAGON.A2_vcmpheq,QI_ftype_DIDI,2) -// -def int_hexagon_A2_vcmpheq : -Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpheq">; -// -// BUILTIN_INFO(HEXAGON.A2_vcmphgt,QI_ftype_DIDI,2) -// -def int_hexagon_A2_vcmphgt : -Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmphgt">; -// -// BUILTIN_INFO(HEXAGON.A2_vcmphgtu,QI_ftype_DIDI,2) -// -def int_hexagon_A2_vcmphgtu : -Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmphgtu">; -// -// BUILTIN_INFO(HEXAGON.A4_vcmpheqi,QI_ftype_DISI,2) -// -def int_hexagon_A4_vcmpheqi : -Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpheqi">; -// -// BUILTIN_INFO(HEXAGON.A4_vcmphgti,QI_ftype_DISI,2) -// -def int_hexagon_A4_vcmphgti : -Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmphgti">; -// -// BUILTIN_INFO(HEXAGON.A4_vcmphgtui,QI_ftype_DISI,2) -// -def int_hexagon_A4_vcmphgtui : -Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmphgtui">; -// -// BUILTIN_INFO(HEXAGON.A4_cmpheq,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmpheq : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpheq">; -// -// BUILTIN_INFO(HEXAGON.A4_cmphgt,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmphgt : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgt">; -// -// BUILTIN_INFO(HEXAGON.A4_cmphgtu,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmphgtu : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgtu">; -// -// BUILTIN_INFO(HEXAGON.A4_cmpheqi,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmpheqi : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmpheqi">; -// -// BUILTIN_INFO(HEXAGON.A4_cmphgti,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmphgti : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgti">; -// -// BUILTIN_INFO(HEXAGON.A4_cmphgtui,QI_ftype_SISI,2) -// -def int_hexagon_A4_cmphgtui : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cmphgtui">; -// -// BUILTIN_INFO(HEXAGON.A2_vcmpweq,QI_ftype_DIDI,2) -// -def int_hexagon_A2_vcmpweq : -Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpweq">; -// -// BUILTIN_INFO(HEXAGON.A2_vcmpwgt,QI_ftype_DIDI,2) -// -def int_hexagon_A2_vcmpwgt : -Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpwgt">; -// -// BUILTIN_INFO(HEXAGON.A2_vcmpwgtu,QI_ftype_DIDI,2) -// -def int_hexagon_A2_vcmpwgtu : -Hexagon_si_didi_Intrinsic<"HEXAGON_A2_vcmpwgtu">; -// -// BUILTIN_INFO(HEXAGON.A4_vcmpweqi,QI_ftype_DISI,2) -// -def int_hexagon_A4_vcmpweqi : -Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpweqi">; -// -// BUILTIN_INFO(HEXAGON.A4_vcmpwgti,QI_ftype_DISI,2) -// -def int_hexagon_A4_vcmpwgti : -Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpwgti">; -// -// BUILTIN_INFO(HEXAGON.A4_vcmpwgtui,QI_ftype_DISI,2) -// -def int_hexagon_A4_vcmpwgtui : -Hexagon_si_disi_Intrinsic<"HEXAGON_A4_vcmpwgtui">; -// -// BUILTIN_INFO(HEXAGON.A4_boundscheck,QI_ftype_SIDI,2) -// -def int_hexagon_A4_boundscheck : -Hexagon_si_sidi_Intrinsic<"HEXAGON_A4_boundscheck">; -// -// BUILTIN_INFO(HEXAGON.A4_tlbmatch,QI_ftype_DISI,2) -// -def int_hexagon_A4_tlbmatch : -Hexagon_si_disi_Intrinsic<"HEXAGON_A4_tlbmatch">; -// -// BUILTIN_INFO(HEXAGON.C2_tfrpr,SI_ftype_QI,1) -// -def int_hexagon_C2_tfrpr : -Hexagon_si_qi_Intrinsic<"HEXAGON_C2_tfrpr">; -// -// BUILTIN_INFO(HEXAGON.C2_tfrrp,QI_ftype_SI,1) -// -def int_hexagon_C2_tfrrp : -Hexagon_si_si_Intrinsic<"HEXAGON_C2_tfrrp">; -// -// BUILTIN_INFO(HEXAGON.C4_fastcorner9,QI_ftype_QIQI,2) -// -def int_hexagon_C4_fastcorner9 : -Hexagon_si_qiqi_Intrinsic<"HEXAGON_C4_fastcorner9">; -// -// BUILTIN_INFO(HEXAGON.C4_fastcorner9_not,QI_ftype_QIQI,2) -// -def int_hexagon_C4_fastcorner9_not : -Hexagon_si_qiqi_Intrinsic<"HEXAGON_C4_fastcorner9_not">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hh_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_hh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hh_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_hh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hl_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_hl_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_hl_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_hl_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_lh_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_lh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_lh_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_lh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_ll_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_ll_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_ll_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_ll_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hh_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_hh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hh_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_hh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hl_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_hl_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_hl_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_hl_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_lh_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_lh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_lh_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_lh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_ll_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_ll_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_ll_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_ll_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_ll_s1">; +// BUILTIN_INFO(HEXAGON.V6_vscattermhw,v_ftype_SISIVDVI,4) +// tag : V6_vscattermhw +def int_hexagon_V6_vscattermhw : +Hexagon_V65_viiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhw">; + // -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hh_s0,SI_ftype_SISISI,3) +// BUILTIN_INFO(HEXAGON.V6_vscattermhw_128B,v_ftype_SISIVDVI,4) +// tag : V6_vscattermhw_128B +def int_hexagon_V6_vscattermhw_128B : +Hexagon_V65_viiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhw_128B">; + // -def int_hexagon_M2_mpy_acc_sat_hh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hh_s0">; +// BUILTIN_INFO(HEXAGON.V6_vscattermhwq,v_ftype_QVSISIVDVI,5) +// tag : V6_vscattermhwq +def int_hexagon_V6_vscattermhwq : +Hexagon_V65_vv64iiiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhwq">; + // -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hh_s1,SI_ftype_SISISI,3) +// BUILTIN_INFO(HEXAGON.V6_vscattermhwq_128B,v_ftype_QVSISIVDVI,5) +// tag : V6_vscattermhwq_128B +def int_hexagon_V6_vscattermhwq_128B : +Hexagon_V65_vv128iiiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhwq_128B">; + // -def int_hexagon_M2_mpy_acc_sat_hh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hl_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_sat_hl_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_hl_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_sat_hl_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_lh_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_sat_lh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_lh_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_sat_lh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_ll_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_sat_ll_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_acc_sat_ll_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_acc_sat_ll_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_acc_sat_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hh_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_sat_hh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hh_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_sat_hh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hl_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_sat_hl_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_hl_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_sat_hl_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_lh_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_sat_lh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_lh_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_sat_lh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_ll_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_sat_ll_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_nac_sat_ll_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpy_nac_sat_ll_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpy_nac_sat_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_hh_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_hh_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_hh_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_hh_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_hl_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_hl_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_hl_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_hl_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_lh_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_lh_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_lh_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_lh_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_ll_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_ll_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_ll_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_ll_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hh_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_hh_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hh_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_hh_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hl_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_hl_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_hl_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_hl_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_lh_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_lh_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_lh_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_lh_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_ll_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_ll_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_ll_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_ll_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hh_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_rnd_hh_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hh_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_rnd_hh_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hl_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_rnd_hl_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_hl_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_rnd_hl_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_lh_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_rnd_lh_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_lh_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_rnd_lh_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_ll_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_rnd_ll_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_rnd_ll_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_rnd_ll_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_rnd_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hh_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_rnd_hh_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hh_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_rnd_hh_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hl_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_rnd_hl_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_hl_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_rnd_hl_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_lh_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_rnd_lh_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_lh_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_rnd_lh_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_ll_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_rnd_ll_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_sat_rnd_ll_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_sat_rnd_ll_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hh_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_acc_hh_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hh_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_acc_hh_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hl_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_acc_hl_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_hl_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_acc_hl_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_lh_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_acc_lh_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_lh_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_acc_lh_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_ll_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_acc_ll_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_acc_ll_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_acc_ll_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_acc_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hh_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_nac_hh_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hh_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_nac_hh_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hl_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_nac_hl_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_hl_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_nac_hl_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_lh_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_nac_lh_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_lh_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_nac_lh_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_ll_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_nac_ll_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_nac_ll_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyd_nac_ll_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyd_nac_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_hh_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_hh_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_hh_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_hh_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_hl_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_hl_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_hl_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_hl_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_lh_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_lh_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_lh_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_lh_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_ll_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_ll_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_ll_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_ll_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hh_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_rnd_hh_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hh_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_rnd_hh_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hl_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_rnd_hl_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_hl_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_rnd_hl_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_lh_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_rnd_lh_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_lh_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_rnd_lh_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_ll_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_rnd_ll_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyd_rnd_ll_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_mpyd_rnd_ll_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyd_rnd_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hh_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_acc_hh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hh_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_acc_hh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hl_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_acc_hl_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_hl_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_acc_hl_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_lh_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_acc_lh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_lh_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_acc_lh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_ll_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_acc_ll_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_acc_ll_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_acc_ll_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_acc_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hh_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_nac_hh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hh_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_nac_hh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hl_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_nac_hl_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_hl_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_nac_hl_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_lh_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_nac_lh_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_lh_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_nac_lh_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_ll_s0,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_nac_ll_s0 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_nac_ll_s1,SI_ftype_SISISI,3) -// -def int_hexagon_M2_mpyu_nac_ll_s1 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_mpyu_nac_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_hh_s0,USI_ftype_SISI,2) -// -def int_hexagon_M2_mpyu_hh_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_hh_s1,USI_ftype_SISI,2) -// -def int_hexagon_M2_mpyu_hh_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_hl_s0,USI_ftype_SISI,2) -// -def int_hexagon_M2_mpyu_hl_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_hl_s1,USI_ftype_SISI,2) -// -def int_hexagon_M2_mpyu_hl_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_lh_s0,USI_ftype_SISI,2) -// -def int_hexagon_M2_mpyu_lh_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_lh_s1,USI_ftype_SISI,2) -// -def int_hexagon_M2_mpyu_lh_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_ll_s0,USI_ftype_SISI,2) -// -def int_hexagon_M2_mpyu_ll_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_ll_s1,USI_ftype_SISI,2) -// -def int_hexagon_M2_mpyu_ll_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hh_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_acc_hh_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hh_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_acc_hh_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hl_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_acc_hl_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_hl_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_acc_hl_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_lh_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_acc_lh_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_lh_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_acc_lh_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_ll_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_acc_ll_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_acc_ll_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_acc_ll_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_acc_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hh_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_nac_hh_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hh_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_nac_hh_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hl_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_nac_hl_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_hl_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_nac_hl_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_lh_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_nac_lh_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_lh_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_nac_lh_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_ll_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_nac_ll_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_nac_ll_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_mpyud_nac_ll_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_mpyud_nac_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_hh_s0,UDI_ftype_SISI,2) -// -def int_hexagon_M2_mpyud_hh_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_hh_s1,UDI_ftype_SISI,2) -// -def int_hexagon_M2_mpyud_hh_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_hl_s0,UDI_ftype_SISI,2) -// -def int_hexagon_M2_mpyud_hl_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_hl_s1,UDI_ftype_SISI,2) -// -def int_hexagon_M2_mpyud_hl_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_lh_s0,UDI_ftype_SISI,2) -// -def int_hexagon_M2_mpyud_lh_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_lh_s1,UDI_ftype_SISI,2) -// -def int_hexagon_M2_mpyud_lh_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_ll_s0,UDI_ftype_SISI,2) -// -def int_hexagon_M2_mpyud_ll_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyud_ll_s1,UDI_ftype_SISI,2) -// -def int_hexagon_M2_mpyud_ll_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_mpyud_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpysmi,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpysmi : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpysmi">; -// -// BUILTIN_INFO(HEXAGON.M2_macsip,SI_ftype_SISISI,3) -// -def int_hexagon_M2_macsip : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_macsip">; -// -// BUILTIN_INFO(HEXAGON.M2_macsin,SI_ftype_SISISI,3) -// -def int_hexagon_M2_macsin : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_macsin">; -// -// BUILTIN_INFO(HEXAGON.M2_dpmpyss_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_dpmpyss_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_dpmpyss_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_dpmpyss_acc_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_dpmpyss_acc_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyss_acc_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_dpmpyss_nac_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_dpmpyss_nac_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyss_nac_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_dpmpyuu_s0,UDI_ftype_SISI,2) -// -def int_hexagon_M2_dpmpyuu_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_dpmpyuu_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_dpmpyuu_acc_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_dpmpyuu_acc_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyuu_acc_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_dpmpyuu_nac_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_dpmpyuu_nac_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_dpmpyuu_nac_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_up,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_up : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_up">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_up_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_up_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_up_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mpy_up_s1_sat,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpy_up_s1_sat : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpy_up_s1_sat">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyu_up,USI_ftype_SISI,2) -// -def int_hexagon_M2_mpyu_up : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyu_up">; -// -// BUILTIN_INFO(HEXAGON.M2_mpysu_up,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpysu_up : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpysu_up">; -// -// BUILTIN_INFO(HEXAGON.M2_dpmpyss_rnd_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_dpmpyss_rnd_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_dpmpyss_rnd_s0">; -// -// BUILTIN_INFO(HEXAGON.M4_mac_up_s1_sat,SI_ftype_SISISI,3) -// -def int_hexagon_M4_mac_up_s1_sat : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mac_up_s1_sat">; -// -// BUILTIN_INFO(HEXAGON.M4_nac_up_s1_sat,SI_ftype_SISISI,3) -// -def int_hexagon_M4_nac_up_s1_sat : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_nac_up_s1_sat">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyi,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpyi : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyi">; -// -// BUILTIN_INFO(HEXAGON.M2_mpyui,SI_ftype_SISI,2) -// -def int_hexagon_M2_mpyui : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_mpyui">; -// -// BUILTIN_INFO(HEXAGON.M2_maci,SI_ftype_SISISI,3) -// -def int_hexagon_M2_maci : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_maci">; -// -// BUILTIN_INFO(HEXAGON.M2_acci,SI_ftype_SISISI,3) -// -def int_hexagon_M2_acci : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_acci">; -// -// BUILTIN_INFO(HEXAGON.M2_accii,SI_ftype_SISISI,3) -// -def int_hexagon_M2_accii : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_accii">; -// -// BUILTIN_INFO(HEXAGON.M2_nacci,SI_ftype_SISISI,3) -// -def int_hexagon_M2_nacci : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_nacci">; -// -// BUILTIN_INFO(HEXAGON.M2_naccii,SI_ftype_SISISI,3) -// -def int_hexagon_M2_naccii : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_naccii">; -// -// BUILTIN_INFO(HEXAGON.M2_subacc,SI_ftype_SISISI,3) -// -def int_hexagon_M2_subacc : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_subacc">; -// -// BUILTIN_INFO(HEXAGON.M4_mpyrr_addr,SI_ftype_SISISI,3) -// -def int_hexagon_M4_mpyrr_addr : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyrr_addr">; -// -// BUILTIN_INFO(HEXAGON.M4_mpyri_addr_u2,SI_ftype_SISISI,3) -// -def int_hexagon_M4_mpyri_addr_u2 : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyri_addr_u2">; -// -// BUILTIN_INFO(HEXAGON.M4_mpyri_addr,SI_ftype_SISISI,3) -// -def int_hexagon_M4_mpyri_addr : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyri_addr">; -// -// BUILTIN_INFO(HEXAGON.M4_mpyri_addi,SI_ftype_SISISI,3) -// -def int_hexagon_M4_mpyri_addi : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyri_addi">; -// -// BUILTIN_INFO(HEXAGON.M4_mpyrr_addi,SI_ftype_SISISI,3) -// -def int_hexagon_M4_mpyrr_addi : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_mpyrr_addi">; -// -// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_vmpy2s_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_vmpy2s_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_vmac2s_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_vmac2s_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2s_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vmac2s_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_vmac2s_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2s_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_vmpy2su_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_vmpy2su_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2su_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vmpy2su_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_vmpy2su_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_vmpy2su_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_vmac2su_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_vmac2su_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2su_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vmac2su_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_vmac2su_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2su_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s0pack,SI_ftype_SISI,2) -// -def int_hexagon_M2_vmpy2s_s0pack : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s0pack">; -// -// BUILTIN_INFO(HEXAGON.M2_vmpy2s_s1pack,SI_ftype_SISI,2) -// -def int_hexagon_M2_vmpy2s_s1pack : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_vmpy2s_s1pack">; -// -// BUILTIN_INFO(HEXAGON.M2_vmac2,DI_ftype_DISISI,3) -// -def int_hexagon_M2_vmac2 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_vmac2">; -// -// BUILTIN_INFO(HEXAGON.M2_vmpy2es_s0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vmpy2es_s0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vmpy2es_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vmpy2es_s1,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vmpy2es_s1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vmpy2es_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_vmac2es_s0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vmac2es_s0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vmac2es_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vmac2es_s1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vmac2es_s1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vmac2es_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_vmac2es,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vmac2es : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vmac2es">; -// -// BUILTIN_INFO(HEXAGON.M2_vrmac_s0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vrmac_s0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrmac_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vrmpy_s0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vrmpy_s0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrmpy_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vdmpyrs_s0,SI_ftype_DIDI,2) -// -def int_hexagon_M2_vdmpyrs_s0 : -Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vdmpyrs_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vdmpyrs_s1,SI_ftype_DIDI,2) -// -def int_hexagon_M2_vdmpyrs_s1 : -Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vdmpyrs_s1">; -// -// BUILTIN_INFO(HEXAGON.M5_vrmpybuu,DI_ftype_DIDI,2) -// -def int_hexagon_M5_vrmpybuu : -Hexagon_di_didi_Intrinsic<"HEXAGON_M5_vrmpybuu">; -// -// BUILTIN_INFO(HEXAGON.M5_vrmacbuu,DI_ftype_DIDIDI,3) -// -def int_hexagon_M5_vrmacbuu : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M5_vrmacbuu">; -// -// BUILTIN_INFO(HEXAGON.M5_vrmpybsu,DI_ftype_DIDI,2) -// -def int_hexagon_M5_vrmpybsu : -Hexagon_di_didi_Intrinsic<"HEXAGON_M5_vrmpybsu">; -// -// BUILTIN_INFO(HEXAGON.M5_vrmacbsu,DI_ftype_DIDIDI,3) -// -def int_hexagon_M5_vrmacbsu : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M5_vrmacbsu">; -// -// BUILTIN_INFO(HEXAGON.M5_vmpybuu,DI_ftype_SISI,2) -// -def int_hexagon_M5_vmpybuu : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M5_vmpybuu">; -// -// BUILTIN_INFO(HEXAGON.M5_vmpybsu,DI_ftype_SISI,2) -// -def int_hexagon_M5_vmpybsu : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M5_vmpybsu">; -// -// BUILTIN_INFO(HEXAGON.M5_vmacbuu,DI_ftype_DISISI,3) -// -def int_hexagon_M5_vmacbuu : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M5_vmacbuu">; -// -// BUILTIN_INFO(HEXAGON.M5_vmacbsu,DI_ftype_DISISI,3) -// -def int_hexagon_M5_vmacbsu : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M5_vmacbsu">; -// -// BUILTIN_INFO(HEXAGON.M5_vdmpybsu,DI_ftype_DIDI,2) -// -def int_hexagon_M5_vdmpybsu : -Hexagon_di_didi_Intrinsic<"HEXAGON_M5_vdmpybsu">; -// -// BUILTIN_INFO(HEXAGON.M5_vdmacbsu,DI_ftype_DIDIDI,3) -// -def int_hexagon_M5_vdmacbsu : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M5_vdmacbsu">; -// -// BUILTIN_INFO(HEXAGON.M2_vdmacs_s0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vdmacs_s0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vdmacs_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vdmacs_s1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vdmacs_s1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vdmacs_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_vdmpys_s0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vdmpys_s0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vdmpys_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vdmpys_s1,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vdmpys_s1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vdmpys_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_cmpyrs_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_cmpyrs_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrs_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_cmpyrs_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_cmpyrs_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrs_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_cmpyrsc_s0,SI_ftype_SISI,2) -// -def int_hexagon_M2_cmpyrsc_s0 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrsc_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_cmpyrsc_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_cmpyrsc_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_cmpyrsc_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_cmacs_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_cmacs_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacs_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_cmacs_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_cmacs_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacs_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_cmacsc_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_cmacsc_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacsc_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_cmacsc_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_cmacsc_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacsc_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_cmpys_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_cmpys_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpys_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_cmpys_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_cmpys_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpys_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_cmpysc_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_cmpysc_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpysc_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_cmpysc_s1,DI_ftype_SISI,2) -// -def int_hexagon_M2_cmpysc_s1 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpysc_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_cnacs_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_cnacs_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacs_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_cnacs_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_cnacs_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacs_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_cnacsc_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_cnacsc_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacsc_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_cnacsc_s1,DI_ftype_DISISI,3) -// -def int_hexagon_M2_cnacsc_s1 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cnacsc_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_vrcmpys_s1,DI_ftype_DISI,2) -// -def int_hexagon_M2_vrcmpys_s1 : -Hexagon_di_disi_Intrinsic<"HEXAGON_M2_vrcmpys_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_vrcmpys_acc_s1,DI_ftype_DIDISI,3) -// -def int_hexagon_M2_vrcmpys_acc_s1 : -Hexagon_di_didisi_Intrinsic<"HEXAGON_M2_vrcmpys_acc_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_vrcmpys_s1rp,SI_ftype_DISI,2) -// -def int_hexagon_M2_vrcmpys_s1rp : -Hexagon_si_disi_Intrinsic<"HEXAGON_M2_vrcmpys_s1rp">; -// -// BUILTIN_INFO(HEXAGON.M2_mmacls_s0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmacls_s0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmacls_s1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmacls_s1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmachs_s0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmachs_s0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmachs_s1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmachs_s1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyl_s0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyl_s0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyl_s1,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyl_s1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyh_s0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyh_s0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyh_s1,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyh_s1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmacls_rs0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmacls_rs0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_rs0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmacls_rs1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmacls_rs1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacls_rs1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmachs_rs0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmachs_rs0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_rs0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmachs_rs1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmachs_rs1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmachs_rs1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyl_rs0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyl_rs0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_rs0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyl_rs1,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyl_rs1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyl_rs1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyh_rs0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyh_rs0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_rs0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyh_rs1,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyh_rs1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyh_rs1">; -// -// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_s0,DI_ftype_DIDI,2) -// -def int_hexagon_M4_vrmpyeh_s0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyeh_s0">; -// -// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_s1,DI_ftype_DIDI,2) -// -def int_hexagon_M4_vrmpyeh_s1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyeh_s1">; -// -// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_acc_s0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M4_vrmpyeh_acc_s0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyeh_acc_s0">; -// -// BUILTIN_INFO(HEXAGON.M4_vrmpyeh_acc_s1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M4_vrmpyeh_acc_s1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyeh_acc_s1">; -// -// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_s0,DI_ftype_DIDI,2) -// -def int_hexagon_M4_vrmpyoh_s0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyoh_s0">; -// -// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_s1,DI_ftype_DIDI,2) -// -def int_hexagon_M4_vrmpyoh_s1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M4_vrmpyoh_s1">; -// -// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_acc_s0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M4_vrmpyoh_acc_s0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyoh_acc_s0">; -// -// BUILTIN_INFO(HEXAGON.M4_vrmpyoh_acc_s1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M4_vrmpyoh_acc_s1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_vrmpyoh_acc_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_hmmpyl_rs1,SI_ftype_SISI,2) -// -def int_hexagon_M2_hmmpyl_rs1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyl_rs1">; -// -// BUILTIN_INFO(HEXAGON.M2_hmmpyh_rs1,SI_ftype_SISI,2) -// -def int_hexagon_M2_hmmpyh_rs1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyh_rs1">; -// -// BUILTIN_INFO(HEXAGON.M2_hmmpyl_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_hmmpyl_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyl_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_hmmpyh_s1,SI_ftype_SISI,2) -// -def int_hexagon_M2_hmmpyh_s1 : -Hexagon_si_sisi_Intrinsic<"HEXAGON_M2_hmmpyh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmaculs_s0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmaculs_s0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmaculs_s1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmaculs_s1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmacuhs_s0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmacuhs_s0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmacuhs_s1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmacuhs_s1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyul_s0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyul_s0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyul_s1,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyul_s1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyuh_s0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyuh_s0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyuh_s1,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyuh_s1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_s1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmaculs_rs0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmaculs_rs0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_rs0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmaculs_rs1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmaculs_rs1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmaculs_rs1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmacuhs_rs0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmacuhs_rs0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_rs0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmacuhs_rs1,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_mmacuhs_rs1 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_mmacuhs_rs1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyul_rs0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyul_rs0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_rs0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyul_rs1,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyul_rs1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyul_rs1">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyuh_rs0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyuh_rs0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_rs0">; -// -// BUILTIN_INFO(HEXAGON.M2_mmpyuh_rs1,DI_ftype_DIDI,2) -// -def int_hexagon_M2_mmpyuh_rs1 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_mmpyuh_rs1">; -// -// BUILTIN_INFO(HEXAGON.M2_vrcmaci_s0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vrcmaci_s0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmaci_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vrcmacr_s0,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vrcmacr_s0 : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmacr_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vrcmaci_s0c,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vrcmaci_s0c : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmaci_s0c">; -// -// BUILTIN_INFO(HEXAGON.M2_vrcmacr_s0c,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vrcmacr_s0c : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vrcmacr_s0c">; -// -// BUILTIN_INFO(HEXAGON.M2_cmaci_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_cmaci_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmaci_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_cmacr_s0,DI_ftype_DISISI,3) -// -def int_hexagon_M2_cmacr_s0 : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M2_cmacr_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vrcmpyi_s0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vrcmpyi_s0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyi_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vrcmpyr_s0,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vrcmpyr_s0 : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyr_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_vrcmpyi_s0c,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vrcmpyi_s0c : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyi_s0c">; -// -// BUILTIN_INFO(HEXAGON.M2_vrcmpyr_s0c,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vrcmpyr_s0c : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vrcmpyr_s0c">; -// -// BUILTIN_INFO(HEXAGON.M2_cmpyi_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_cmpyi_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpyi_s0">; -// -// BUILTIN_INFO(HEXAGON.M2_cmpyr_s0,DI_ftype_SISI,2) -// -def int_hexagon_M2_cmpyr_s0 : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M2_cmpyr_s0">; -// -// BUILTIN_INFO(HEXAGON.M4_cmpyi_wh,SI_ftype_DISI,2) -// -def int_hexagon_M4_cmpyi_wh : -Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyi_wh">; -// -// BUILTIN_INFO(HEXAGON.M4_cmpyr_wh,SI_ftype_DISI,2) -// -def int_hexagon_M4_cmpyr_wh : -Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyr_wh">; -// -// BUILTIN_INFO(HEXAGON.M4_cmpyi_whc,SI_ftype_DISI,2) +// BUILTIN_INFO(HEXAGON.V6_vscattermhw_add,v_ftype_SISIVDVI,4) +// tag : V6_vscattermhw_add +def int_hexagon_V6_vscattermhw_add : +Hexagon_V65_viiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhw_add">; + // +// BUILTIN_INFO(HEXAGON.V6_vscattermhw_add_128B,v_ftype_SISIVDVI,4) +// tag : V6_vscattermhw_add_128B +def int_hexagon_V6_vscattermhw_add_128B : +Hexagon_V65_viiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhw_add_128B">; + +// Auto-generated intrinsics + +// tag : S2_vsatwh +class Hexagon_i32_i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vrmpybusv +class Hexagon_v16i32_v16i32v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vrmpybusv +class Hexagon_v32i32_v32i32v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vaslw_acc +class Hexagon_v16i32_v16i32v16i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vaslw_acc +class Hexagon_v32i32_v32i32v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vmux +class Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vmux +class Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : S2_tableidxd_goodsyntax +class Hexagon_i32_i32i32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vandnqrt_acc +class Hexagon_v16i32_v16i32v512i1i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vandnqrt_acc +class Hexagon_v32i32_v32i32v1024i1i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vrmpybusi +class Hexagon_v32i32_v32i32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vrmpybusi +class Hexagon_v64i32_v64i32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vsubb_dv +class Hexagon_v64i32_v64i32v64i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : M2_mpysu_up +class Hexagon_i32_i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : M2_mpyud_acc_ll_s0 +class Hexagon_i64_i64i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : S2_lsr_i_r_nac +class Hexagon_i32_i32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : M2_cmpysc_s0 +class Hexagon_i64_i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_lo +class Hexagon_v16i32_v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_lo +class Hexagon_v32i32_v64i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : S2_shuffoh +class Hexagon_i64_i64i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_sfmax +class Hexagon_float_floatfloat_Intrinsic + : Hexagon_Intrinsic; + +// tag : A2_vabswsat +class Hexagon_i64_i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : +class Hexagon_v32i32_v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_ldnp0 +class Hexagon_v16i32_i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_ldnp0 +class Hexagon_v32i32_i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vdmpyhb +class Hexagon_v16i32_v16i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vdmpyhb +class Hexagon_v32i32_v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : A4_vcmphgti +class Hexagon_i32_i64i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : +class Hexagon_v32i32_v16i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : S6_rol_i_p_or +class Hexagon_i64_i64i64i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vgtuh_and +class Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vgtuh_and +class Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : A2_abssat +class Hexagon_i32_i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : A2_vcmpwgtu +class Hexagon_i32_i64i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vtmpybus_acc +class Hexagon_v64i32_v64i32v64i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_conv_df2uw_chop +class Hexagon_i32_double_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_pred_or +class Hexagon_v512i1_v512i1v512i1_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_pred_or +class Hexagon_v1024i1_v1024i1v1024i1_Intrinsic + : Hexagon_Intrinsic; + +// tag : S2_asr_i_p_rnd_goodsyntax +class Hexagon_i64_i64i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_conv_w2df +class Hexagon_double_i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vunpackuh +class Hexagon_v32i32_v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vunpackuh +class Hexagon_v64i32_v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vadduhw_acc +class Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vadduhw_acc +class Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : M2_vdmacs_s0 +class Hexagon_i64_i64i64i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vrmpybub_rtt_acc +class Hexagon_v32i32_v32i32v16i32i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vrmpybub_rtt_acc +class Hexagon_v64i32_v64i32v32i32i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_ldu0 +class Hexagon_v16i32_i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_ldu0 +class Hexagon_v32i32_i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : S4_extract_rp +class Hexagon_i32_i32i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vdmpyhsuisat +class Hexagon_v16i32_v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vdmpyhsuisat +class Hexagon_v32i32_v64i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : A2_addsp +class Hexagon_i64_i32i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_extractw +class Hexagon_i32_v16i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_extractw +class Hexagon_i32_v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vlutvwhi +class Hexagon_v32i32_v16i32v16i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vlutvwhi +class Hexagon_v64i32_v32i32v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vgtuh +class Hexagon_v512i1_v16i32v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vgtuh +class Hexagon_v1024i1_v32i32v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_sffma_lib +class Hexagon_float_floatfloatfloat_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_conv_ud2df +class Hexagon_double_i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : S2_vzxthw +class Hexagon_i64_i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vtmpyhb +class Hexagon_v64i32_v64i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vshufoeh +class Hexagon_v32i32_v16i32v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vshufoeh +class Hexagon_v64i32_v32i32v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vlut4 +class Hexagon_v16i32_v16i32i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vlut4 +class Hexagon_v32i32_v32i32i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : +class Hexagon_v16i32_v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_conv_uw2sf +class Hexagon_float_i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vswap +class Hexagon_v32i32_v512i1v16i32v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vswap +class Hexagon_v64i32_v1024i1v32i32v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vandnqrt +class Hexagon_v16i32_v512i1i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vandnqrt +class Hexagon_v32i32_v1024i1i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vmpyub +class Hexagon_v64i32_v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : A5_ACS +class Hexagon_i64i32_i64i64i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vunpackob +class Hexagon_v32i32_v32i32v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vunpackob +class Hexagon_v64i32_v64i32v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vmpyhsat_acc +class Hexagon_v32i32_v32i32v16i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vmpyhsat_acc +class Hexagon_v64i32_v64i32v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vaddcarrysat +class Hexagon_v16i32_v16i32v16i32v512i1_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vaddcarrysat +class Hexagon_v32i32_v32i32v32i32v1024i1_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vlutvvb_oracc +class Hexagon_v16i32_v16i32v16i32v16i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vlutvvb_oracc +class Hexagon_v32i32_v32i32v32i32v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vrmpybub_rtt +class Hexagon_v32i32_v16i32i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vrmpybub_rtt +class Hexagon_v64i32_v32i32i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : A4_addp_c +class Hexagon_i64i32_i64i64i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vrsadubi_acc +class Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vrsadubi_acc +class Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_conv_df2sf +class Hexagon_float_double_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vandvqv +class Hexagon_v16i32_v512i1v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vandvqv +class Hexagon_v32i32_v1024i1v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : C2_vmux +class Hexagon_i64_i32i64i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_sfcmpeq +class Hexagon_i32_floatfloat_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vmpahhsat +class Hexagon_v16i32_v16i32v16i32i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vmpahhsat +class Hexagon_v32i32_v32i32v32i32i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vandvrt +class Hexagon_v512i1_v16i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vandvrt +class Hexagon_v1024i1_v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vsubcarry +class Hexagon_custom_v16i32v512i1_v16i32v16i32v512i1_Intrinsic + : Hexagon_NonGCC_Intrinsic< + [llvm_v16i32_ty,llvm_v512i1_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_v512i1_ty], + [IntrNoMem]>; + +// tag : V6_vsubcarry +class Hexagon_custom_v32i32v1024i1_v32i32v32i32v1024i1_Intrinsic_128B + : Hexagon_NonGCC_Intrinsic< + [llvm_v32i32_ty,llvm_v1024i1_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_v1024i1_ty], + [IntrNoMem]>; + +// tag : F2_sffixupr +class Hexagon_float_float_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vandvrt_acc +class Hexagon_v512i1_v512i1v16i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vandvrt_acc +class Hexagon_v1024i1_v1024i1v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_dfsub +class Hexagon_double_doubledouble_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vmpyowh_sacc +class Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vmpyowh_sacc +class Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : S2_insertp +class Hexagon_i64_i64i64i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_sfinvsqrta +class Hexagon_floati32_float_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vtran2x2_map +class Hexagon_v16i32v16i32_v16i32v16i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vtran2x2_map +class Hexagon_v32i32v32i32_v32i32v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vlutvwh_oracc +class Hexagon_v32i32_v32i32v16i32v16i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vlutvwh_oracc +class Hexagon_v64i32_v64i32v32i32v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_dfcmpge +class Hexagon_i32_doubledouble_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_conv_df2d_chop +class Hexagon_i64_double_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_conv_sf2w +class Hexagon_i32_float_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_sfclass +class Hexagon_i32_floati32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_conv_sf2ud_chop +class Hexagon_i64_float_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_pred_scalar2v2 +class Hexagon_v512i1_i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_pred_scalar2v2 +class Hexagon_v1024i1_i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_sfrecipa +class Hexagon_floati32_floatfloat_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vprefixqh +class Hexagon_v16i32_v512i1_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vprefixqh +class Hexagon_v32i32_v1024i1_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vdmpyhisat_acc +class Hexagon_v16i32_v16i32v32i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vdmpyhisat_acc +class Hexagon_v32i32_v32i32v64i32i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_conv_ud2sf +class Hexagon_float_i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_conv_sf2df +class Hexagon_double_float_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_sffma_sc +class Hexagon_float_floatfloatfloati32_Intrinsic + : Hexagon_Intrinsic; + +// tag : F2_dfclass +class Hexagon_i32_doublei32_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vd0 +class Hexagon_v16i32__Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vd0 +class Hexagon_v32i32__Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vdd0 +class Hexagon_v64i32__Intrinsic + : Hexagon_Intrinsic; + +// tag : S2_insert_rp +class Hexagon_i32_i32i32i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_vassignp +class Hexagon_v64i32_v64i32_Intrinsic + : Hexagon_Intrinsic; + +// tag : A6_vminub_RdP +class Hexagon_i64i32_i64i64_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_pred_not +class Hexagon_v512i1_v512i1_Intrinsic + : Hexagon_Intrinsic; + +// tag : V6_pred_not +class Hexagon_v1024i1_v1024i1_Intrinsic + : Hexagon_Intrinsic; + +// V5 Scalar Instructions. + +def int_hexagon_S2_asr_r_p_or : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_r_p_or">; + +def int_hexagon_S2_vsatwh : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vsatwh">; + +def int_hexagon_S2_tableidxd_goodsyntax : +Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxd_goodsyntax">; + +def int_hexagon_M2_mpysu_up : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpysu_up">; + +def int_hexagon_M2_mpyud_acc_ll_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_ll_s0">; + +def int_hexagon_M2_mpyud_acc_ll_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_ll_s1">; + +def int_hexagon_M2_cmpysc_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpysc_s1">; + +def int_hexagon_M2_cmpysc_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpysc_s0">; + def int_hexagon_M4_cmpyi_whc : -Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyi_whc">; -// -// BUILTIN_INFO(HEXAGON.M4_cmpyr_whc,SI_ftype_DISI,2) -// -def int_hexagon_M4_cmpyr_whc : -Hexagon_si_disi_Intrinsic<"HEXAGON_M4_cmpyr_whc">; -// -// BUILTIN_INFO(HEXAGON.M2_vcmpy_s0_sat_i,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vcmpy_s0_sat_i : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s0_sat_i">; -// -// BUILTIN_INFO(HEXAGON.M2_vcmpy_s0_sat_r,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vcmpy_s0_sat_r : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s0_sat_r">; -// -// BUILTIN_INFO(HEXAGON.M2_vcmpy_s1_sat_i,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vcmpy_s1_sat_i : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s1_sat_i">; -// -// BUILTIN_INFO(HEXAGON.M2_vcmpy_s1_sat_r,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vcmpy_s1_sat_r : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vcmpy_s1_sat_r">; -// -// BUILTIN_INFO(HEXAGON.M2_vcmac_s0_sat_i,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vcmac_s0_sat_i : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vcmac_s0_sat_i">; -// -// BUILTIN_INFO(HEXAGON.M2_vcmac_s0_sat_r,DI_ftype_DIDIDI,3) -// -def int_hexagon_M2_vcmac_s0_sat_r : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M2_vcmac_s0_sat_r">; -// -// BUILTIN_INFO(HEXAGON.S2_vcrotate,DI_ftype_DISI,2) -// -def int_hexagon_S2_vcrotate : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_vcrotate">; -// -// BUILTIN_INFO(HEXAGON.S4_vrcrotate_acc,DI_ftype_DIDISISI,4) -// -def int_hexagon_S4_vrcrotate_acc : -Hexagon_di_didisisi_Intrinsic<"HEXAGON_S4_vrcrotate_acc">; -// -// BUILTIN_INFO(HEXAGON.S4_vrcrotate,DI_ftype_DISISI,3) -// -def int_hexagon_S4_vrcrotate : -Hexagon_di_disisi_Intrinsic<"HEXAGON_S4_vrcrotate">; -// -// BUILTIN_INFO(HEXAGON.S2_vcnegh,DI_ftype_DISI,2) -// -def int_hexagon_S2_vcnegh : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_vcnegh">; -// -// BUILTIN_INFO(HEXAGON.S2_vrcnegh,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_vrcnegh : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_vrcnegh">; -// -// BUILTIN_INFO(HEXAGON.M4_pmpyw,DI_ftype_SISI,2) -// -def int_hexagon_M4_pmpyw : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M4_pmpyw">; -// -// BUILTIN_INFO(HEXAGON.M4_vpmpyh,DI_ftype_SISI,2) -// -def int_hexagon_M4_vpmpyh : -Hexagon_di_sisi_Intrinsic<"HEXAGON_M4_vpmpyh">; -// -// BUILTIN_INFO(HEXAGON.M4_pmpyw_acc,DI_ftype_DISISI,3) -// -def int_hexagon_M4_pmpyw_acc : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M4_pmpyw_acc">; -// -// BUILTIN_INFO(HEXAGON.M4_vpmpyh_acc,DI_ftype_DISISI,3) -// -def int_hexagon_M4_vpmpyh_acc : -Hexagon_di_disisi_Intrinsic<"HEXAGON_M4_vpmpyh_acc">; -// -// BUILTIN_INFO(HEXAGON.A2_add,SI_ftype_SISI,2) -// -def int_hexagon_A2_add : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_add">; -// -// BUILTIN_INFO(HEXAGON.A2_sub,SI_ftype_SISI,2) -// -def int_hexagon_A2_sub : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_sub">; -// -// BUILTIN_INFO(HEXAGON.A2_addsat,SI_ftype_SISI,2) -// -def int_hexagon_A2_addsat : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addsat">; -// -// BUILTIN_INFO(HEXAGON.A2_subsat,SI_ftype_SISI,2) -// -def int_hexagon_A2_subsat : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subsat">; -// -// BUILTIN_INFO(HEXAGON.A2_addi,SI_ftype_SISI,2) -// -def int_hexagon_A2_addi : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addi">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_l16_ll,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_l16_ll : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_ll">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_l16_hl,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_l16_hl : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_hl">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_l16_sat_ll,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_l16_sat_ll : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_sat_ll">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_l16_sat_hl,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_l16_sat_hl : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_l16_sat_hl">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_l16_ll,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_l16_ll : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_ll">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_l16_hl,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_l16_hl : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_hl">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_l16_sat_ll,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_l16_sat_ll : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_sat_ll">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_l16_sat_hl,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_l16_sat_hl : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_l16_sat_hl">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_h16_ll,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_h16_ll : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_ll">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_h16_lh,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_h16_lh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_lh">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_h16_hl,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_h16_hl : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_hl">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_h16_hh,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_h16_hh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_hh">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_ll,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_h16_sat_ll : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_ll">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_lh,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_h16_sat_lh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_lh">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_hl,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_h16_sat_hl : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_hl">; -// -// BUILTIN_INFO(HEXAGON.A2_addh_h16_sat_hh,SI_ftype_SISI,2) -// -def int_hexagon_A2_addh_h16_sat_hh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_addh_h16_sat_hh">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_h16_ll,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_h16_ll : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_ll">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_h16_lh,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_h16_lh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_lh">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_h16_hl,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_h16_hl : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_hl">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_h16_hh,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_h16_hh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_hh">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_ll,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_h16_sat_ll : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_ll">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_lh,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_h16_sat_lh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_lh">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_hl,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_h16_sat_hl : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_hl">; -// -// BUILTIN_INFO(HEXAGON.A2_subh_h16_sat_hh,SI_ftype_SISI,2) -// -def int_hexagon_A2_subh_h16_sat_hh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subh_h16_sat_hh">; -// -// BUILTIN_INFO(HEXAGON.A2_aslh,SI_ftype_SI,1) -// -def int_hexagon_A2_aslh : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_aslh">; -// -// BUILTIN_INFO(HEXAGON.A2_asrh,SI_ftype_SI,1) -// -def int_hexagon_A2_asrh : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_asrh">; -// -// BUILTIN_INFO(HEXAGON.A2_addp,DI_ftype_DIDI,2) -// -def int_hexagon_A2_addp : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_addp">; -// -// BUILTIN_INFO(HEXAGON.A2_addpsat,DI_ftype_DIDI,2) -// -def int_hexagon_A2_addpsat : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_addpsat">; -// -// BUILTIN_INFO(HEXAGON.A2_addsp,DI_ftype_SIDI,2) -// -def int_hexagon_A2_addsp : -Hexagon_di_sidi_Intrinsic<"HEXAGON_A2_addsp">; -// -// BUILTIN_INFO(HEXAGON.A2_subp,DI_ftype_DIDI,2) -// -def int_hexagon_A2_subp : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_subp">; -// -// BUILTIN_INFO(HEXAGON.A2_neg,SI_ftype_SI,1) -// -def int_hexagon_A2_neg : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_neg">; -// -// BUILTIN_INFO(HEXAGON.A2_negsat,SI_ftype_SI,1) -// -def int_hexagon_A2_negsat : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_negsat">; -// -// BUILTIN_INFO(HEXAGON.A2_abs,SI_ftype_SI,1) -// -def int_hexagon_A2_abs : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_abs">; -// -// BUILTIN_INFO(HEXAGON.A2_abssat,SI_ftype_SI,1) -// -def int_hexagon_A2_abssat : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_abssat">; -// -// BUILTIN_INFO(HEXAGON.A2_vconj,DI_ftype_DI,1) -// -def int_hexagon_A2_vconj : -Hexagon_di_di_Intrinsic<"HEXAGON_A2_vconj">; -// -// BUILTIN_INFO(HEXAGON.A2_negp,DI_ftype_DI,1) -// -def int_hexagon_A2_negp : -Hexagon_di_di_Intrinsic<"HEXAGON_A2_negp">; -// -// BUILTIN_INFO(HEXAGON.A2_absp,DI_ftype_DI,1) -// -def int_hexagon_A2_absp : -Hexagon_di_di_Intrinsic<"HEXAGON_A2_absp">; -// -// BUILTIN_INFO(HEXAGON.A2_max,SI_ftype_SISI,2) -// -def int_hexagon_A2_max : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_max">; -// -// BUILTIN_INFO(HEXAGON.A2_maxu,USI_ftype_SISI,2) -// -def int_hexagon_A2_maxu : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_maxu">; -// -// BUILTIN_INFO(HEXAGON.A2_min,SI_ftype_SISI,2) -// -def int_hexagon_A2_min : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_min">; -// -// BUILTIN_INFO(HEXAGON.A2_minu,USI_ftype_SISI,2) -// -def int_hexagon_A2_minu : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_minu">; -// -// BUILTIN_INFO(HEXAGON.A2_maxp,DI_ftype_DIDI,2) -// -def int_hexagon_A2_maxp : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_maxp">; -// -// BUILTIN_INFO(HEXAGON.A2_maxup,UDI_ftype_DIDI,2) -// -def int_hexagon_A2_maxup : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_maxup">; -// -// BUILTIN_INFO(HEXAGON.A2_minp,DI_ftype_DIDI,2) -// -def int_hexagon_A2_minp : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_minp">; -// -// BUILTIN_INFO(HEXAGON.A2_minup,UDI_ftype_DIDI,2) -// -def int_hexagon_A2_minup : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_minup">; -// -// BUILTIN_INFO(HEXAGON.A2_tfr,SI_ftype_SI,1) -// -def int_hexagon_A2_tfr : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_tfr">; -// -// BUILTIN_INFO(HEXAGON.A2_tfrsi,SI_ftype_SI,1) -// -def int_hexagon_A2_tfrsi : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_tfrsi">; -// -// BUILTIN_INFO(HEXAGON.A2_tfrp,DI_ftype_DI,1) -// -def int_hexagon_A2_tfrp : -Hexagon_di_di_Intrinsic<"HEXAGON_A2_tfrp">; -// -// BUILTIN_INFO(HEXAGON.A2_tfrpi,DI_ftype_SI,1) -// -def int_hexagon_A2_tfrpi : -Hexagon_di_si_Intrinsic<"HEXAGON_A2_tfrpi">; -// -// BUILTIN_INFO(HEXAGON.A2_zxtb,SI_ftype_SI,1) -// -def int_hexagon_A2_zxtb : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_zxtb">; -// -// BUILTIN_INFO(HEXAGON.A2_sxtb,SI_ftype_SI,1) -// -def int_hexagon_A2_sxtb : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_sxtb">; -// -// BUILTIN_INFO(HEXAGON.A2_zxth,SI_ftype_SI,1) -// -def int_hexagon_A2_zxth : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_zxth">; -// -// BUILTIN_INFO(HEXAGON.A2_sxth,SI_ftype_SI,1) -// -def int_hexagon_A2_sxth : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_sxth">; -// -// BUILTIN_INFO(HEXAGON.A2_combinew,DI_ftype_SISI,2) -// -def int_hexagon_A2_combinew : -Hexagon_di_sisi_Intrinsic<"HEXAGON_A2_combinew">; -// -// BUILTIN_INFO(HEXAGON.A4_combineri,DI_ftype_SISI,2) -// +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_M4_cmpyi_whc">; + +def int_hexagon_M2_mpy_sat_rnd_lh_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_lh_s1">; + +def int_hexagon_M2_mpy_sat_rnd_lh_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_lh_s0">; + +def int_hexagon_S2_tableidxb_goodsyntax : +Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxb_goodsyntax">; + +def int_hexagon_S2_shuffoh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_shuffoh">; + +def int_hexagon_F2_sfmax : +Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sfmax">; + +def int_hexagon_A2_vabswsat : +Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_vabswsat">; + +def int_hexagon_S2_asr_i_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r">; + +def int_hexagon_S2_asr_i_p : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p">; + def int_hexagon_A4_combineri : -Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_combineri">; -// -// BUILTIN_INFO(HEXAGON.A4_combineir,DI_ftype_SISI,2) -// -def int_hexagon_A4_combineir : -Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_combineir">; -// -// BUILTIN_INFO(HEXAGON.A2_combineii,DI_ftype_SISI,2) -// -def int_hexagon_A2_combineii : -Hexagon_di_sisi_Intrinsic<"HEXAGON_A2_combineii">; -// -// BUILTIN_INFO(HEXAGON.A2_combine_hh,SI_ftype_SISI,2) -// -def int_hexagon_A2_combine_hh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_hh">; -// -// BUILTIN_INFO(HEXAGON.A2_combine_hl,SI_ftype_SISI,2) -// -def int_hexagon_A2_combine_hl : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_hl">; -// -// BUILTIN_INFO(HEXAGON.A2_combine_lh,SI_ftype_SISI,2) -// -def int_hexagon_A2_combine_lh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_lh">; -// -// BUILTIN_INFO(HEXAGON.A2_combine_ll,SI_ftype_SISI,2) -// -def int_hexagon_A2_combine_ll : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_combine_ll">; -// -// BUILTIN_INFO(HEXAGON.A2_tfril,SI_ftype_SISI,2) -// -def int_hexagon_A2_tfril : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_tfril">; -// -// BUILTIN_INFO(HEXAGON.A2_tfrih,SI_ftype_SISI,2) -// -def int_hexagon_A2_tfrih : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_tfrih">; -// -// BUILTIN_INFO(HEXAGON.A2_and,SI_ftype_SISI,2) -// -def int_hexagon_A2_and : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_and">; -// -// BUILTIN_INFO(HEXAGON.A2_or,SI_ftype_SISI,2) -// -def int_hexagon_A2_or : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_or">; -// -// BUILTIN_INFO(HEXAGON.A2_xor,SI_ftype_SISI,2) -// -def int_hexagon_A2_xor : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_xor">; -// -// BUILTIN_INFO(HEXAGON.A2_not,SI_ftype_SI,1) -// -def int_hexagon_A2_not : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_not">; -// -// BUILTIN_INFO(HEXAGON.M2_xor_xacc,SI_ftype_SISISI,3) -// -def int_hexagon_M2_xor_xacc : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M2_xor_xacc">; -// -// BUILTIN_INFO(HEXAGON.M4_xor_xacc,DI_ftype_DIDIDI,3) -// -def int_hexagon_M4_xor_xacc : -Hexagon_di_dididi_Intrinsic<"HEXAGON_M4_xor_xacc">; -// -// BUILTIN_INFO(HEXAGON.A4_andn,SI_ftype_SISI,2) -// -def int_hexagon_A4_andn : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_andn">; -// -// BUILTIN_INFO(HEXAGON.A4_orn,SI_ftype_SISI,2) -// -def int_hexagon_A4_orn : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_orn">; -// -// BUILTIN_INFO(HEXAGON.A4_andnp,DI_ftype_DIDI,2) -// -def int_hexagon_A4_andnp : -Hexagon_di_didi_Intrinsic<"HEXAGON_A4_andnp">; -// -// BUILTIN_INFO(HEXAGON.A4_ornp,DI_ftype_DIDI,2) -// -def int_hexagon_A4_ornp : -Hexagon_di_didi_Intrinsic<"HEXAGON_A4_ornp">; -// -// BUILTIN_INFO(HEXAGON.S4_addaddi,SI_ftype_SISISI,3) -// -def int_hexagon_S4_addaddi : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_addaddi">; -// -// BUILTIN_INFO(HEXAGON.S4_subaddi,SI_ftype_SISISI,3) -// -def int_hexagon_S4_subaddi : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_subaddi">; -// -// BUILTIN_INFO(HEXAGON.M4_and_and,SI_ftype_SISISI,3) -// -def int_hexagon_M4_and_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_and">; -// -// BUILTIN_INFO(HEXAGON.M4_and_andn,SI_ftype_SISISI,3) -// -def int_hexagon_M4_and_andn : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_andn">; -// -// BUILTIN_INFO(HEXAGON.M4_and_or,SI_ftype_SISISI,3) -// -def int_hexagon_M4_and_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_or">; -// -// BUILTIN_INFO(HEXAGON.M4_and_xor,SI_ftype_SISISI,3) -// -def int_hexagon_M4_and_xor : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_and_xor">; -// -// BUILTIN_INFO(HEXAGON.M4_or_and,SI_ftype_SISISI,3) -// -def int_hexagon_M4_or_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_and">; -// -// BUILTIN_INFO(HEXAGON.M4_or_andn,SI_ftype_SISISI,3) -// -def int_hexagon_M4_or_andn : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_andn">; -// -// BUILTIN_INFO(HEXAGON.M4_or_or,SI_ftype_SISISI,3) -// -def int_hexagon_M4_or_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_or">; -// -// BUILTIN_INFO(HEXAGON.M4_or_xor,SI_ftype_SISISI,3) -// -def int_hexagon_M4_or_xor : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_or_xor">; -// -// BUILTIN_INFO(HEXAGON.S4_or_andix,SI_ftype_SISISI,3) -// -def int_hexagon_S4_or_andix : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_or_andix">; -// -// BUILTIN_INFO(HEXAGON.S4_or_andi,SI_ftype_SISISI,3) -// -def int_hexagon_S4_or_andi : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_or_andi">; -// -// BUILTIN_INFO(HEXAGON.S4_or_ori,SI_ftype_SISISI,3) -// -def int_hexagon_S4_or_ori : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_or_ori">; -// -// BUILTIN_INFO(HEXAGON.M4_xor_and,SI_ftype_SISISI,3) -// -def int_hexagon_M4_xor_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_xor_and">; -// -// BUILTIN_INFO(HEXAGON.M4_xor_or,SI_ftype_SISISI,3) -// -def int_hexagon_M4_xor_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_xor_or">; -// -// BUILTIN_INFO(HEXAGON.M4_xor_andn,SI_ftype_SISISI,3) -// -def int_hexagon_M4_xor_andn : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_M4_xor_andn">; -// -// BUILTIN_INFO(HEXAGON.A2_subri,SI_ftype_SISI,2) -// -def int_hexagon_A2_subri : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_subri">; -// -// BUILTIN_INFO(HEXAGON.A2_andir,SI_ftype_SISI,2) -// -def int_hexagon_A2_andir : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_andir">; -// -// BUILTIN_INFO(HEXAGON.A2_orir,SI_ftype_SISI,2) -// -def int_hexagon_A2_orir : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_orir">; -// -// BUILTIN_INFO(HEXAGON.A2_andp,DI_ftype_DIDI,2) -// -def int_hexagon_A2_andp : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_andp">; -// -// BUILTIN_INFO(HEXAGON.A2_orp,DI_ftype_DIDI,2) -// -def int_hexagon_A2_orp : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_orp">; -// -// BUILTIN_INFO(HEXAGON.A2_xorp,DI_ftype_DIDI,2) -// -def int_hexagon_A2_xorp : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_xorp">; -// -// BUILTIN_INFO(HEXAGON.A2_notp,DI_ftype_DI,1) -// +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_combineri">; + +def int_hexagon_M2_mpy_nac_sat_hl_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hl_s1">; + +def int_hexagon_M4_vpmpyh_acc : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M4_vpmpyh_acc">; + +def int_hexagon_M2_vcmpy_s0_sat_i : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vcmpy_s0_sat_i">; + def int_hexagon_A2_notp : -Hexagon_di_di_Intrinsic<"HEXAGON_A2_notp">; -// -// BUILTIN_INFO(HEXAGON.A2_sxtw,DI_ftype_SI,1) -// -def int_hexagon_A2_sxtw : -Hexagon_di_si_Intrinsic<"HEXAGON_A2_sxtw">; -// -// BUILTIN_INFO(HEXAGON.A2_sat,SI_ftype_DI,1) -// -def int_hexagon_A2_sat : -Hexagon_si_di_Intrinsic<"HEXAGON_A2_sat">; -// -// BUILTIN_INFO(HEXAGON.A2_roundsat,SI_ftype_DI,1) -// -def int_hexagon_A2_roundsat : -Hexagon_si_di_Intrinsic<"HEXAGON_A2_roundsat">; -// -// BUILTIN_INFO(HEXAGON.A2_sath,SI_ftype_SI,1) -// -def int_hexagon_A2_sath : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_sath">; -// -// BUILTIN_INFO(HEXAGON.A2_satuh,SI_ftype_SI,1) -// -def int_hexagon_A2_satuh : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_satuh">; -// -// BUILTIN_INFO(HEXAGON.A2_satub,SI_ftype_SI,1) -// -def int_hexagon_A2_satub : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_satub">; -// -// BUILTIN_INFO(HEXAGON.A2_satb,SI_ftype_SI,1) -// -def int_hexagon_A2_satb : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_satb">; -// -// BUILTIN_INFO(HEXAGON.A2_vaddub,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vaddub : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddub">; -// -// BUILTIN_INFO(HEXAGON.A2_vaddb_map,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vaddb_map : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddb_map">; -// -// BUILTIN_INFO(HEXAGON.A2_vaddubs,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vaddubs : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddubs">; -// -// BUILTIN_INFO(HEXAGON.A2_vaddh,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vaddh : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddh">; -// -// BUILTIN_INFO(HEXAGON.A2_vaddhs,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vaddhs : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddhs">; -// -// BUILTIN_INFO(HEXAGON.A2_vadduhs,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vadduhs : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vadduhs">; -// -// BUILTIN_INFO(HEXAGON.A5_vaddhubs,SI_ftype_DIDI,2) -// -def int_hexagon_A5_vaddhubs : -Hexagon_si_didi_Intrinsic<"HEXAGON_A5_vaddhubs">; -// -// BUILTIN_INFO(HEXAGON.A2_vaddw,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vaddw : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddw">; -// -// BUILTIN_INFO(HEXAGON.A2_vaddws,DI_ftype_DIDI,2) -// +Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_notp">; + +def int_hexagon_M2_mpy_hl_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_hl_s1">; + +def int_hexagon_M2_mpy_hl_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_hl_s0">; + +def int_hexagon_C4_or_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_or_and">; + +def int_hexagon_M2_vmac2s_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_vmac2s_s0">; + +def int_hexagon_M2_vmac2s_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_vmac2s_s1">; + +def int_hexagon_S2_brevp : +Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_brevp">; + +def int_hexagon_M4_pmpyw_acc : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M4_pmpyw_acc">; + +def int_hexagon_S2_cl1 : +Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_cl1">; + +def int_hexagon_C4_cmplte : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmplte">; + +def int_hexagon_M2_mmpyul_s0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyul_s0">; + def int_hexagon_A2_vaddws : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vaddws">; -// -// BUILTIN_INFO(HEXAGON.S4_vxaddsubw,DI_ftype_DIDI,2) -// -def int_hexagon_S4_vxaddsubw : -Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxaddsubw">; -// -// BUILTIN_INFO(HEXAGON.S4_vxsubaddw,DI_ftype_DIDI,2) -// +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddws">; + +def int_hexagon_A2_maxup : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_maxup">; + +def int_hexagon_A4_vcmphgti : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmphgti">; + +def int_hexagon_S2_interleave : +Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_interleave">; + +def int_hexagon_M2_vrcmpyi_s0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vrcmpyi_s0">; + +def int_hexagon_A2_abssat : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_abssat">; + +def int_hexagon_A2_vcmpwgtu : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpwgtu">; + +def int_hexagon_C2_cmpgtu : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgtu">; + +def int_hexagon_C2_cmpgtp : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_C2_cmpgtp">; + +def int_hexagon_A4_cmphgtui : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgtui">; + +def int_hexagon_C2_cmpgti : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgti">; + +def int_hexagon_M2_mpyi : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyi">; + +def int_hexagon_F2_conv_df2uw_chop : +Hexagon_i32_double_Intrinsic<"HEXAGON_F2_conv_df2uw_chop">; + +def int_hexagon_A4_cmpheq : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpheq">; + +def int_hexagon_M2_mpy_lh_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_lh_s1">; + +def int_hexagon_M2_mpy_lh_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_lh_s0">; + +def int_hexagon_S2_lsr_i_r_xacc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_xacc">; + +def int_hexagon_S2_vrcnegh : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_vrcnegh">; + +def int_hexagon_S2_extractup : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S2_extractup">; + +def int_hexagon_S2_asr_i_p_rnd_goodsyntax : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_rnd_goodsyntax">; + +def int_hexagon_S4_ntstbit_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_ntstbit_r">; + +def int_hexagon_F2_conv_w2sf : +Hexagon_float_i32_Intrinsic<"HEXAGON_F2_conv_w2sf">; + +def int_hexagon_C2_not : +Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_not">; + +def int_hexagon_C2_tfrpr : +Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_tfrpr">; + +def int_hexagon_M2_mpy_ll_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_ll_s1">; + +def int_hexagon_M2_mpy_ll_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_ll_s0">; + +def int_hexagon_A4_cmpbgt : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgt">; + +def int_hexagon_S2_asr_r_r_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_r_r_and">; + +def int_hexagon_A4_rcmpneqi : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpneqi">; + +def int_hexagon_S2_asl_i_r_nac : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_nac">; + +def int_hexagon_M2_subacc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_subacc">; + +def int_hexagon_A2_orp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_orp">; + +def int_hexagon_M2_mpyu_up : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_up">; + +def int_hexagon_M2_mpy_acc_sat_lh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_lh_s1">; + +def int_hexagon_S2_asr_i_vh : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_vh">; + +def int_hexagon_S2_asr_i_vw : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_vw">; + +def int_hexagon_A4_cmpbgtu : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgtu">; + +def int_hexagon_A4_vcmpbeq_any : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A4_vcmpbeq_any">; + +def int_hexagon_A4_cmpbgti : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgti">; + +def int_hexagon_M2_mpyd_lh_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_lh_s1">; + +def int_hexagon_S2_asl_r_p_nac : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_r_p_nac">; + +def int_hexagon_S2_lsr_i_r_nac : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_nac">; + +def int_hexagon_A2_addsp : +Hexagon_i64_i32i64_Intrinsic<"HEXAGON_A2_addsp">; + def int_hexagon_S4_vxsubaddw : -Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxsubaddw">; -// -// BUILTIN_INFO(HEXAGON.S4_vxaddsubh,DI_ftype_DIDI,2) -// -def int_hexagon_S4_vxaddsubh : -Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxaddsubh">; -// -// BUILTIN_INFO(HEXAGON.S4_vxsubaddh,DI_ftype_DIDI,2) -// +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxsubaddw">; + +def int_hexagon_A4_vcmpheqi : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpheqi">; + def int_hexagon_S4_vxsubaddh : -Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxsubaddh">; -// -// BUILTIN_INFO(HEXAGON.S4_vxaddsubhr,DI_ftype_DIDI,2) -// -def int_hexagon_S4_vxaddsubhr : -Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxaddsubhr">; -// -// BUILTIN_INFO(HEXAGON.S4_vxsubaddhr,DI_ftype_DIDI,2) -// -def int_hexagon_S4_vxsubaddhr : -Hexagon_di_didi_Intrinsic<"HEXAGON_S4_vxsubaddhr">; -// -// BUILTIN_INFO(HEXAGON.A2_svavgh,SI_ftype_SISI,2) -// -def int_hexagon_A2_svavgh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svavgh">; -// -// BUILTIN_INFO(HEXAGON.A2_svavghs,SI_ftype_SISI,2) -// -def int_hexagon_A2_svavghs : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svavghs">; -// -// BUILTIN_INFO(HEXAGON.A2_svnavgh,SI_ftype_SISI,2) -// -def int_hexagon_A2_svnavgh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svnavgh">; -// -// BUILTIN_INFO(HEXAGON.A2_svaddh,SI_ftype_SISI,2) -// -def int_hexagon_A2_svaddh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svaddh">; -// -// BUILTIN_INFO(HEXAGON.A2_svaddhs,SI_ftype_SISI,2) -// -def int_hexagon_A2_svaddhs : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svaddhs">; -// -// BUILTIN_INFO(HEXAGON.A2_svadduhs,SI_ftype_SISI,2) -// -def int_hexagon_A2_svadduhs : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svadduhs">; -// -// BUILTIN_INFO(HEXAGON.A2_svsubh,SI_ftype_SISI,2) -// -def int_hexagon_A2_svsubh : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svsubh">; -// -// BUILTIN_INFO(HEXAGON.A2_svsubhs,SI_ftype_SISI,2) -// -def int_hexagon_A2_svsubhs : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svsubhs">; -// -// BUILTIN_INFO(HEXAGON.A2_svsubuhs,SI_ftype_SISI,2) -// -def int_hexagon_A2_svsubuhs : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A2_svsubuhs">; -// -// BUILTIN_INFO(HEXAGON.A2_vraddub,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vraddub : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vraddub">; -// -// BUILTIN_INFO(HEXAGON.A2_vraddub_acc,DI_ftype_DIDIDI,3) -// -def int_hexagon_A2_vraddub_acc : -Hexagon_di_dididi_Intrinsic<"HEXAGON_A2_vraddub_acc">; -// -// BUILTIN_INFO(HEXAGON.M2_vraddh,SI_ftype_DIDI,2) -// -def int_hexagon_M2_vraddh : -Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vraddh">; -// -// BUILTIN_INFO(HEXAGON.M2_vradduh,SI_ftype_DIDI,2) -// -def int_hexagon_M2_vradduh : -Hexagon_si_didi_Intrinsic<"HEXAGON_M2_vradduh">; -// -// BUILTIN_INFO(HEXAGON.A2_vsubub,DI_ftype_DIDI,2) -// +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxsubaddh">; + +def int_hexagon_M4_pmpyw : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M4_pmpyw">; + +def int_hexagon_S2_vsathb : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vsathb">; + +def int_hexagon_S2_asr_r_p_and : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_r_p_and">; + +def int_hexagon_M2_mpyu_acc_lh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_lh_s1">; + +def int_hexagon_M2_mpyu_acc_lh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_lh_s0">; + +def int_hexagon_S2_lsl_r_p_acc : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p_acc">; + +def int_hexagon_A2_pxorf : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_A2_pxorf">; + +def int_hexagon_C2_cmpgei : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgei">; + def int_hexagon_A2_vsubub : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubub">; -// -// BUILTIN_INFO(HEXAGON.A2_vsubb_map,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vsubb_map : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubb_map">; -// -// BUILTIN_INFO(HEXAGON.A2_vsububs,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vsububs : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsububs">; -// -// BUILTIN_INFO(HEXAGON.A2_vsubh,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vsubh : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubh">; -// -// BUILTIN_INFO(HEXAGON.A2_vsubhs,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vsubhs : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubhs">; -// -// BUILTIN_INFO(HEXAGON.A2_vsubuhs,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vsubuhs : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubuhs">; -// -// BUILTIN_INFO(HEXAGON.A2_vsubw,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vsubw : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubw">; -// -// BUILTIN_INFO(HEXAGON.A2_vsubws,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vsubws : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vsubws">; -// -// BUILTIN_INFO(HEXAGON.A2_vabsh,DI_ftype_DI,1) -// -def int_hexagon_A2_vabsh : -Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabsh">; -// -// BUILTIN_INFO(HEXAGON.A2_vabshsat,DI_ftype_DI,1) -// -def int_hexagon_A2_vabshsat : -Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabshsat">; -// -// BUILTIN_INFO(HEXAGON.A2_vabsw,DI_ftype_DI,1) -// -def int_hexagon_A2_vabsw : -Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabsw">; -// -// BUILTIN_INFO(HEXAGON.A2_vabswsat,DI_ftype_DI,1) -// -def int_hexagon_A2_vabswsat : -Hexagon_di_di_Intrinsic<"HEXAGON_A2_vabswsat">; -// -// BUILTIN_INFO(HEXAGON.M2_vabsdiffw,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vabsdiffw : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vabsdiffw">; -// -// BUILTIN_INFO(HEXAGON.M2_vabsdiffh,DI_ftype_DIDI,2) -// -def int_hexagon_M2_vabsdiffh : -Hexagon_di_didi_Intrinsic<"HEXAGON_M2_vabsdiffh">; -// -// BUILTIN_INFO(HEXAGON.A2_vrsadub,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vrsadub : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vrsadub">; -// -// BUILTIN_INFO(HEXAGON.A2_vrsadub_acc,DI_ftype_DIDIDI,3) -// -def int_hexagon_A2_vrsadub_acc : -Hexagon_di_dididi_Intrinsic<"HEXAGON_A2_vrsadub_acc">; -// -// BUILTIN_INFO(HEXAGON.A2_vavgub,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vavgub : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgub">; -// -// BUILTIN_INFO(HEXAGON.A2_vavguh,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vavguh : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguh">; -// -// BUILTIN_INFO(HEXAGON.A2_vavgh,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vavgh : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgh">; -// -// BUILTIN_INFO(HEXAGON.A2_vnavgh,DI_ftype_DIDI,2) -// +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubub">; + +def int_hexagon_S2_asl_i_p : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_p">; + +def int_hexagon_S2_asl_i_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_i_r">; + +def int_hexagon_A4_vrminuw : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrminuw">; + +def int_hexagon_F2_sffma : +Hexagon_float_floatfloatfloat_Intrinsic<"HEXAGON_F2_sffma">; + +def int_hexagon_A2_absp : +Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_absp">; + +def int_hexagon_C2_all8 : +Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_all8">; + +def int_hexagon_A4_vrminuh : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrminuh">; + +def int_hexagon_F2_sffma_lib : +Hexagon_float_floatfloatfloat_Intrinsic<"HEXAGON_F2_sffma_lib">; + +def int_hexagon_M4_vrmpyoh_s0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M4_vrmpyoh_s0">; + +def int_hexagon_M4_vrmpyoh_s1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M4_vrmpyoh_s1">; + +def int_hexagon_C2_bitsset : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_bitsset">; + +def int_hexagon_M2_mpysip : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpysip">; + +def int_hexagon_M2_mpysin : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpysin">; + +def int_hexagon_A4_boundscheck : +Hexagon_i32_i32i64_Intrinsic<"HEXAGON_A4_boundscheck">; + +def int_hexagon_M5_vrmpybuu : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M5_vrmpybuu">; + +def int_hexagon_C4_fastcorner9 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_fastcorner9">; + +def int_hexagon_M2_vrcmpys_s1rp : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_M2_vrcmpys_s1rp">; + +def int_hexagon_A2_neg : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_neg">; + +def int_hexagon_A2_subsat : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subsat">; + +def int_hexagon_S2_asl_r_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_r_r">; + +def int_hexagon_S2_asl_r_p : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_r_p">; + def int_hexagon_A2_vnavgh : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgh">; -// -// BUILTIN_INFO(HEXAGON.A2_vavgw,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vavgw : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgw">; -// -// BUILTIN_INFO(HEXAGON.A2_vnavgw,DI_ftype_DIDI,2) -// +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavgh">; + +def int_hexagon_M2_mpy_nac_sat_hl_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hl_s0">; + +def int_hexagon_F2_conv_ud2df : +Hexagon_double_i64_Intrinsic<"HEXAGON_F2_conv_ud2df">; + def int_hexagon_A2_vnavgw : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgw">; -// -// BUILTIN_INFO(HEXAGON.A2_vavgwr,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vavgwr : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgwr">; -// -// BUILTIN_INFO(HEXAGON.A2_vnavgwr,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vnavgwr : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgwr">; -// -// BUILTIN_INFO(HEXAGON.A2_vavgwcr,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vavgwcr : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgwcr">; -// -// BUILTIN_INFO(HEXAGON.A2_vnavgwcr,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vnavgwcr : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavgwcr">; -// -// BUILTIN_INFO(HEXAGON.A2_vavghcr,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vavghcr : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavghcr">; -// -// BUILTIN_INFO(HEXAGON.A2_vnavghcr,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vnavghcr : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavghcr">; -// -// BUILTIN_INFO(HEXAGON.A2_vavguw,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vavguw : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguw">; -// -// BUILTIN_INFO(HEXAGON.A2_vavguwr,DI_ftype_DIDI,2) -// +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavgw">; + +def int_hexagon_S2_asl_i_r_acc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_acc">; + +def int_hexagon_S4_subi_lsr_ri : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subi_lsr_ri">; + +def int_hexagon_S2_vzxthw : +Hexagon_i64_i32_Intrinsic<"HEXAGON_S2_vzxthw">; + +def int_hexagon_F2_sfadd : +Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sfadd">; + +def int_hexagon_A2_sub : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_sub">; + +def int_hexagon_M2_vmac2su_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_vmac2su_s0">; + +def int_hexagon_M2_vmac2su_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_vmac2su_s1">; + +def int_hexagon_M2_dpmpyss_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_dpmpyss_s0">; + +def int_hexagon_S2_insert : +Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_insert">; + +def int_hexagon_S2_packhl : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_S2_packhl">; + +def int_hexagon_A4_vcmpwgti : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpwgti">; + def int_hexagon_A2_vavguwr : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguwr">; -// -// BUILTIN_INFO(HEXAGON.A2_vavgubr,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vavgubr : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavgubr">; -// -// BUILTIN_INFO(HEXAGON.A2_vavguhr,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vavguhr : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavguhr">; -// -// BUILTIN_INFO(HEXAGON.A2_vavghr,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vavghr : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vavghr">; -// -// BUILTIN_INFO(HEXAGON.A2_vnavghr,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vnavghr : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vnavghr">; -// -// BUILTIN_INFO(HEXAGON.A4_round_ri,SI_ftype_SISI,2) -// -def int_hexagon_A4_round_ri : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_ri">; -// -// BUILTIN_INFO(HEXAGON.A4_round_rr,SI_ftype_SISI,2) -// -def int_hexagon_A4_round_rr : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_rr">; -// -// BUILTIN_INFO(HEXAGON.A4_round_ri_sat,SI_ftype_SISI,2) -// -def int_hexagon_A4_round_ri_sat : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_ri_sat">; -// -// BUILTIN_INFO(HEXAGON.A4_round_rr_sat,SI_ftype_SISI,2) -// -def int_hexagon_A4_round_rr_sat : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_round_rr_sat">; -// -// BUILTIN_INFO(HEXAGON.A4_cround_ri,SI_ftype_SISI,2) -// -def int_hexagon_A4_cround_ri : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cround_ri">; -// -// BUILTIN_INFO(HEXAGON.A4_cround_rr,SI_ftype_SISI,2) -// -def int_hexagon_A4_cround_rr : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_cround_rr">; -// -// BUILTIN_INFO(HEXAGON.A4_vrminh,DI_ftype_DIDISI,3) -// -def int_hexagon_A4_vrminh : -Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminh">; -// -// BUILTIN_INFO(HEXAGON.A4_vrmaxh,DI_ftype_DIDISI,3) -// +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavguwr">; + +def int_hexagon_S2_asl_r_r_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_r_r_and">; + +def int_hexagon_A2_svsubhs : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svsubhs">; + +def int_hexagon_A2_addh_l16_hl : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_l16_hl">; + +def int_hexagon_M4_and_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_and_and">; + +def int_hexagon_F2_conv_d2df : +Hexagon_double_i64_Intrinsic<"HEXAGON_F2_conv_d2df">; + +def int_hexagon_C2_cmpgtui : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgtui">; + +def int_hexagon_A2_vconj : +Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_vconj">; + +def int_hexagon_S2_lsr_r_vw : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_r_vw">; + +def int_hexagon_S2_lsr_r_vh : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_r_vh">; + +def int_hexagon_A2_subh_l16_hl : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_l16_hl">; + +def int_hexagon_S4_vxsubaddhr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxsubaddhr">; + +def int_hexagon_S2_clbp : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_clbp">; + +def int_hexagon_S2_deinterleave : +Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_deinterleave">; + +def int_hexagon_C2_any8 : +Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_any8">; + +def int_hexagon_S2_togglebit_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_togglebit_r">; + +def int_hexagon_S2_togglebit_i : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_togglebit_i">; + +def int_hexagon_F2_conv_uw2sf : +Hexagon_float_i32_Intrinsic<"HEXAGON_F2_conv_uw2sf">; + +def int_hexagon_S2_vsathb_nopack : +Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_vsathb_nopack">; + +def int_hexagon_M2_cmacs_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmacs_s0">; + +def int_hexagon_M2_cmacs_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmacs_s1">; + +def int_hexagon_M2_mpy_sat_hh_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_hh_s0">; + +def int_hexagon_M2_mpy_sat_hh_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_hh_s1">; + +def int_hexagon_M2_mmacuhs_s1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacuhs_s1">; + +def int_hexagon_M2_mmacuhs_s0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacuhs_s0">; + +def int_hexagon_S2_clrbit_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_clrbit_r">; + +def int_hexagon_C4_or_andn : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_or_andn">; + +def int_hexagon_S2_asl_r_r_nac : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_r_r_nac">; + +def int_hexagon_S2_asl_i_p_acc : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_acc">; + +def int_hexagon_A4_vcmpwgtui : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpwgtui">; + +def int_hexagon_M4_vrmpyoh_acc_s0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M4_vrmpyoh_acc_s0">; + +def int_hexagon_M4_vrmpyoh_acc_s1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M4_vrmpyoh_acc_s1">; + def int_hexagon_A4_vrmaxh : -Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxh">; -// -// BUILTIN_INFO(HEXAGON.A4_vrminuh,DI_ftype_DIDISI,3) -// -def int_hexagon_A4_vrminuh : -Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminuh">; -// -// BUILTIN_INFO(HEXAGON.A4_vrmaxuh,DI_ftype_DIDISI,3) -// -def int_hexagon_A4_vrmaxuh : -Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxuh">; -// -// BUILTIN_INFO(HEXAGON.A4_vrminw,DI_ftype_DIDISI,3) -// -def int_hexagon_A4_vrminw : -Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminw">; -// -// BUILTIN_INFO(HEXAGON.A4_vrmaxw,DI_ftype_DIDISI,3) -// +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrmaxh">; + +def int_hexagon_A2_vcmpbeq : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpbeq">; + +def int_hexagon_A2_vcmphgt : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmphgt">; + +def int_hexagon_A2_vnavgwcr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavgwcr">; + +def int_hexagon_M2_vrcmacr_s0c : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vrcmacr_s0c">; + +def int_hexagon_A2_vavgwcr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgwcr">; + +def int_hexagon_S2_asl_i_p_xacc : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_xacc">; + def int_hexagon_A4_vrmaxw : -Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxw">; -// -// BUILTIN_INFO(HEXAGON.A4_vrminuw,DI_ftype_DIDISI,3) -// -def int_hexagon_A4_vrminuw : -Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrminuw">; -// -// BUILTIN_INFO(HEXAGON.A4_vrmaxuw,DI_ftype_DIDISI,3) -// -def int_hexagon_A4_vrmaxuw : -Hexagon_di_didisi_Intrinsic<"HEXAGON_A4_vrmaxuw">; -// -// BUILTIN_INFO(HEXAGON.A2_vminb,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vminb : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminb">; -// -// BUILTIN_INFO(HEXAGON.A2_vmaxb,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vmaxb : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxb">; -// -// BUILTIN_INFO(HEXAGON.A2_vminub,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vminub : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminub">; -// -// BUILTIN_INFO(HEXAGON.A2_vmaxub,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vmaxub : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxub">; -// -// BUILTIN_INFO(HEXAGON.A2_vminh,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vminh : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminh">; -// -// BUILTIN_INFO(HEXAGON.A2_vmaxh,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vmaxh : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxh">; -// -// BUILTIN_INFO(HEXAGON.A2_vminuh,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vminuh : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminuh">; -// -// BUILTIN_INFO(HEXAGON.A2_vmaxuh,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vmaxuh : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxuh">; -// -// BUILTIN_INFO(HEXAGON.A2_vminw,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vminw : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminw">; -// -// BUILTIN_INFO(HEXAGON.A2_vmaxw,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vmaxw : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxw">; -// -// BUILTIN_INFO(HEXAGON.A2_vminuw,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vminuw : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vminuw">; -// -// BUILTIN_INFO(HEXAGON.A2_vmaxuw,DI_ftype_DIDI,2) -// -def int_hexagon_A2_vmaxuw : -Hexagon_di_didi_Intrinsic<"HEXAGON_A2_vmaxuw">; -// -// BUILTIN_INFO(HEXAGON.A4_modwrapu,SI_ftype_SISI,2) -// -def int_hexagon_A4_modwrapu : -Hexagon_si_sisi_Intrinsic<"HEXAGON_A4_modwrapu">; -// -// BUILTIN_INFO(HEXAGON.F2_sfadd,SF_ftype_SFSF,2) -// -def int_hexagon_F2_sfadd : -Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfadd">; -// -// BUILTIN_INFO(HEXAGON.F2_sfsub,SF_ftype_SFSF,2) -// -def int_hexagon_F2_sfsub : -Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfsub">; -// -// BUILTIN_INFO(HEXAGON.F2_sfmpy,SF_ftype_SFSF,2) -// -def int_hexagon_F2_sfmpy : -Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfmpy">; -// -// BUILTIN_INFO(HEXAGON.F2_sffma,SF_ftype_SFSFSF,3) -// -def int_hexagon_F2_sffma : -Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffma">; -// -// BUILTIN_INFO(HEXAGON.F2_sffma_sc,SF_ftype_SFSFSFQI,4) -// -def int_hexagon_F2_sffma_sc : -Hexagon_sf_sfsfsfqi_Intrinsic<"HEXAGON_F2_sffma_sc">; -// -// BUILTIN_INFO(HEXAGON.F2_sffms,SF_ftype_SFSFSF,3) -// -def int_hexagon_F2_sffms : -Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffms">; -// -// BUILTIN_INFO(HEXAGON.F2_sffma_lib,SF_ftype_SFSFSF,3) -// -def int_hexagon_F2_sffma_lib : -Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffma_lib">; -// -// BUILTIN_INFO(HEXAGON.F2_sffms_lib,SF_ftype_SFSFSF,3) -// -def int_hexagon_F2_sffms_lib : -Hexagon_sf_sfsfsf_Intrinsic<"HEXAGON_F2_sffms_lib">; -// -// BUILTIN_INFO(HEXAGON.F2_sfcmpeq,QI_ftype_SFSF,2) -// -def int_hexagon_F2_sfcmpeq : -Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpeq">; -// -// BUILTIN_INFO(HEXAGON.F2_sfcmpgt,QI_ftype_SFSF,2) -// -def int_hexagon_F2_sfcmpgt : -Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpgt">; -// -// BUILTIN_INFO(HEXAGON.F2_sfcmpge,QI_ftype_SFSF,2) -// -def int_hexagon_F2_sfcmpge : -Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpge">; -// -// BUILTIN_INFO(HEXAGON.F2_sfcmpuo,QI_ftype_SFSF,2) -// -def int_hexagon_F2_sfcmpuo : -Hexagon_si_sfsf_Intrinsic<"HEXAGON_F2_sfcmpuo">; -// -// BUILTIN_INFO(HEXAGON.F2_sfmax,SF_ftype_SFSF,2) -// -def int_hexagon_F2_sfmax : -Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfmax">; -// -// BUILTIN_INFO(HEXAGON.F2_sfmin,SF_ftype_SFSF,2) -// -def int_hexagon_F2_sfmin : -Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sfmin">; -// -// BUILTIN_INFO(HEXAGON.F2_sfclass,QI_ftype_SFSI,2) -// -def int_hexagon_F2_sfclass : -Hexagon_si_sfsi_Intrinsic<"HEXAGON_F2_sfclass">; -// -// BUILTIN_INFO(HEXAGON.F2_sfimm_p,SF_ftype_SI,1) -// -def int_hexagon_F2_sfimm_p : -Hexagon_sf_si_Intrinsic<"HEXAGON_F2_sfimm_p">; -// -// BUILTIN_INFO(HEXAGON.F2_sfimm_n,SF_ftype_SI,1) -// -def int_hexagon_F2_sfimm_n : -Hexagon_sf_si_Intrinsic<"HEXAGON_F2_sfimm_n">; -// -// BUILTIN_INFO(HEXAGON.F2_sffixupn,SF_ftype_SFSF,2) -// +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrmaxw">; + +def int_hexagon_A2_vnavghr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavghr">; + +def int_hexagon_M4_cmpyi_wh : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_M4_cmpyi_wh">; + +def int_hexagon_A2_tfrsi : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_tfrsi">; + +def int_hexagon_S2_asr_i_r_acc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_acc">; + +def int_hexagon_A2_svnavgh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svnavgh">; + +def int_hexagon_S2_lsr_i_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r">; + +def int_hexagon_M2_vmac2 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_vmac2">; + +def int_hexagon_A4_vcmphgtui : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmphgtui">; + +def int_hexagon_A2_svavgh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svavgh">; + +def int_hexagon_M4_vrmpyeh_acc_s0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M4_vrmpyeh_acc_s0">; + +def int_hexagon_M4_vrmpyeh_acc_s1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M4_vrmpyeh_acc_s1">; + +def int_hexagon_S2_lsr_i_p : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p">; + +def int_hexagon_A2_combine_hl : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_combine_hl">; + +def int_hexagon_M2_mpy_up : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_up">; + +def int_hexagon_A2_combine_hh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_combine_hh">; + +def int_hexagon_A2_negsat : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_negsat">; + +def int_hexagon_M2_mpyd_hl_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_hl_s0">; + +def int_hexagon_M2_mpyd_hl_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_hl_s1">; + +def int_hexagon_A4_bitsplit : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_bitsplit">; + +def int_hexagon_A2_vabshsat : +Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_vabshsat">; + +def int_hexagon_M2_mpyui : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyui">; + +def int_hexagon_A2_addh_l16_sat_ll : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_l16_sat_ll">; + +def int_hexagon_S2_lsl_r_r_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsl_r_r_and">; + +def int_hexagon_M2_mmpyul_rs0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyul_rs0">; + +def int_hexagon_S2_asr_i_r_rnd_goodsyntax : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_rnd_goodsyntax">; + +def int_hexagon_S2_lsr_r_p_nac : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p_nac">; + +def int_hexagon_C2_cmplt : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmplt">; + +def int_hexagon_M2_cmacr_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmacr_s0">; + +def int_hexagon_M4_or_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_or_and">; + +def int_hexagon_M4_mpyrr_addi : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyrr_addi">; + +def int_hexagon_S4_or_andi : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_andi">; + +def int_hexagon_M2_mpy_sat_hl_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_hl_s0">; + +def int_hexagon_M2_mpy_sat_hl_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_hl_s1">; + +def int_hexagon_M4_mpyrr_addr : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyrr_addr">; + +def int_hexagon_M2_mmachs_rs0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmachs_rs0">; + +def int_hexagon_M2_mmachs_rs1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmachs_rs1">; + +def int_hexagon_M2_vrcmpyr_s0c : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vrcmpyr_s0c">; + +def int_hexagon_M2_mpy_acc_sat_hl_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hl_s0">; + +def int_hexagon_M2_mpyd_acc_ll_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_ll_s1">; + def int_hexagon_F2_sffixupn : -Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sffixupn">; -// -// BUILTIN_INFO(HEXAGON.F2_sffixupd,SF_ftype_SFSF,2) -// -def int_hexagon_F2_sffixupd : -Hexagon_sf_sfsf_Intrinsic<"HEXAGON_F2_sffixupd">; -// -// BUILTIN_INFO(HEXAGON.F2_sffixupr,SF_ftype_SF,1) -// -def int_hexagon_F2_sffixupr : -Hexagon_sf_sf_Intrinsic<"HEXAGON_F2_sffixupr">; -// -// BUILTIN_INFO(HEXAGON.F2_dfcmpeq,QI_ftype_DFDF,2) -// -def int_hexagon_F2_dfcmpeq : -Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpeq">; -// -// BUILTIN_INFO(HEXAGON.F2_dfcmpgt,QI_ftype_DFDF,2) -// -def int_hexagon_F2_dfcmpgt : -Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpgt">; -// -// BUILTIN_INFO(HEXAGON.F2_dfcmpge,QI_ftype_DFDF,2) -// -def int_hexagon_F2_dfcmpge : -Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpge">; -// -// BUILTIN_INFO(HEXAGON.F2_dfcmpuo,QI_ftype_DFDF,2) -// -def int_hexagon_F2_dfcmpuo : -Hexagon_si_dfdf_Intrinsic<"HEXAGON_F2_dfcmpuo">; -// -// BUILTIN_INFO(HEXAGON.F2_dfclass,QI_ftype_DFSI,2) -// -def int_hexagon_F2_dfclass : -Hexagon_si_dfsi_Intrinsic<"HEXAGON_F2_dfclass">; -// -// BUILTIN_INFO(HEXAGON.F2_dfimm_p,DF_ftype_SI,1) -// -def int_hexagon_F2_dfimm_p : -Hexagon_df_si_Intrinsic<"HEXAGON_F2_dfimm_p">; -// -// BUILTIN_INFO(HEXAGON.F2_dfimm_n,DF_ftype_SI,1) -// -def int_hexagon_F2_dfimm_n : -Hexagon_df_si_Intrinsic<"HEXAGON_F2_dfimm_n">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_sf2df,DF_ftype_SF,1) -// -def int_hexagon_F2_conv_sf2df : -Hexagon_df_sf_Intrinsic<"HEXAGON_F2_conv_sf2df">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_df2sf,SF_ftype_DF,1) -// -def int_hexagon_F2_conv_df2sf : -Hexagon_sf_df_Intrinsic<"HEXAGON_F2_conv_df2sf">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_uw2sf,SF_ftype_SI,1) -// -def int_hexagon_F2_conv_uw2sf : -Hexagon_sf_si_Intrinsic<"HEXAGON_F2_conv_uw2sf">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_uw2df,DF_ftype_SI,1) -// -def int_hexagon_F2_conv_uw2df : -Hexagon_df_si_Intrinsic<"HEXAGON_F2_conv_uw2df">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_w2sf,SF_ftype_SI,1) -// -def int_hexagon_F2_conv_w2sf : -Hexagon_sf_si_Intrinsic<"HEXAGON_F2_conv_w2sf">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_w2df,DF_ftype_SI,1) -// -def int_hexagon_F2_conv_w2df : -Hexagon_df_si_Intrinsic<"HEXAGON_F2_conv_w2df">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_ud2sf,SF_ftype_DI,1) -// -def int_hexagon_F2_conv_ud2sf : -Hexagon_sf_di_Intrinsic<"HEXAGON_F2_conv_ud2sf">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_ud2df,DF_ftype_DI,1) -// -def int_hexagon_F2_conv_ud2df : -Hexagon_df_di_Intrinsic<"HEXAGON_F2_conv_ud2df">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_d2sf,SF_ftype_DI,1) -// -def int_hexagon_F2_conv_d2sf : -Hexagon_sf_di_Intrinsic<"HEXAGON_F2_conv_d2sf">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_d2df,DF_ftype_DI,1) -// -def int_hexagon_F2_conv_d2df : -Hexagon_df_di_Intrinsic<"HEXAGON_F2_conv_d2df">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_sf2uw,SI_ftype_SF,1) -// -def int_hexagon_F2_conv_sf2uw : -Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2uw">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_sf2w,SI_ftype_SF,1) -// -def int_hexagon_F2_conv_sf2w : -Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2w">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_sf2ud,DI_ftype_SF,1) -// -def int_hexagon_F2_conv_sf2ud : -Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2ud">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_sf2d,DI_ftype_SF,1) -// -def int_hexagon_F2_conv_sf2d : -Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2d">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_df2uw,SI_ftype_DF,1) -// -def int_hexagon_F2_conv_df2uw : -Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2uw">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_df2w,SI_ftype_DF,1) -// -def int_hexagon_F2_conv_df2w : -Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2w">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_df2ud,DI_ftype_DF,1) -// -def int_hexagon_F2_conv_df2ud : -Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2ud">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_df2d,DI_ftype_DF,1) -// -def int_hexagon_F2_conv_df2d : -Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2d">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_sf2uw_chop,SI_ftype_SF,1) -// -def int_hexagon_F2_conv_sf2uw_chop : -Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2uw_chop">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_sf2w_chop,SI_ftype_SF,1) -// -def int_hexagon_F2_conv_sf2w_chop : -Hexagon_si_sf_Intrinsic<"HEXAGON_F2_conv_sf2w_chop">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_sf2ud_chop,DI_ftype_SF,1) -// -def int_hexagon_F2_conv_sf2ud_chop : -Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2ud_chop">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_sf2d_chop,DI_ftype_SF,1) -// -def int_hexagon_F2_conv_sf2d_chop : -Hexagon_di_sf_Intrinsic<"HEXAGON_F2_conv_sf2d_chop">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_df2uw_chop,SI_ftype_DF,1) -// -def int_hexagon_F2_conv_df2uw_chop : -Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2uw_chop">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_df2w_chop,SI_ftype_DF,1) -// -def int_hexagon_F2_conv_df2w_chop : -Hexagon_si_df_Intrinsic<"HEXAGON_F2_conv_df2w_chop">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_df2ud_chop,DI_ftype_DF,1) -// -def int_hexagon_F2_conv_df2ud_chop : -Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2ud_chop">; -// -// BUILTIN_INFO(HEXAGON.F2_conv_df2d_chop,DI_ftype_DF,1) -// -def int_hexagon_F2_conv_df2d_chop : -Hexagon_di_df_Intrinsic<"HEXAGON_F2_conv_df2d_chop">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_r,SI_ftype_SISI,2) -// -def int_hexagon_S2_asr_r_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_r_r">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_r,SI_ftype_SISI,2) -// -def int_hexagon_S2_asl_r_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_r_r">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_r,SI_ftype_SISI,2) -// -def int_hexagon_S2_lsr_r_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_lsr_r_r">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_r,SI_ftype_SISI,2) -// -def int_hexagon_S2_lsl_r_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_lsl_r_r">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_p,DI_ftype_DISI,2) -// -def int_hexagon_S2_asr_r_p : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_r_p">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_p,DI_ftype_DISI,2) -// -def int_hexagon_S2_asl_r_p : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_r_p">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_p,DI_ftype_DISI,2) -// -def int_hexagon_S2_lsr_r_p : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_r_p">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_p,DI_ftype_DISI,2) -// -def int_hexagon_S2_lsl_r_p : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsl_r_p">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_r_acc,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asr_r_r_acc : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_r_acc,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asl_r_r_acc : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_acc,SI_ftype_SISISI,3) -// -def int_hexagon_S2_lsr_r_r_acc : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_acc,SI_ftype_SISISI,3) -// +Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sffixupn">; + +def int_hexagon_M2_mpyd_acc_lh_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_lh_s0">; + +def int_hexagon_M2_mpyd_acc_lh_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_lh_s1">; + +def int_hexagon_M2_mpy_rnd_hh_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_hh_s0">; + +def int_hexagon_M2_mpy_rnd_hh_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_hh_s1">; + +def int_hexagon_A2_vadduhs : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vadduhs">; + +def int_hexagon_A2_vsubuhs : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubuhs">; + +def int_hexagon_A2_subh_h16_hl : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_hl">; + +def int_hexagon_A2_subh_h16_hh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_hh">; + +def int_hexagon_A2_xorp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_xorp">; + +def int_hexagon_A4_tfrpcp : +Hexagon_i64_i64_Intrinsic<"HEXAGON_A4_tfrpcp">; + +def int_hexagon_A2_addh_h16_lh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_lh">; + +def int_hexagon_A2_addh_h16_sat_hl : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_sat_hl">; + +def int_hexagon_A2_addh_h16_ll : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_ll">; + +def int_hexagon_A2_addh_h16_sat_hh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_sat_hh">; + +def int_hexagon_A2_zxtb : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_zxtb">; + +def int_hexagon_A2_zxth : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_zxth">; + +def int_hexagon_A2_vnavgwr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavgwr">; + +def int_hexagon_M4_or_xor : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_or_xor">; + +def int_hexagon_M2_mpyud_acc_hh_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_hh_s0">; + +def int_hexagon_M2_mpyud_acc_hh_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_hh_s1">; + +def int_hexagon_M5_vmacbsu : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M5_vmacbsu">; + +def int_hexagon_M2_dpmpyuu_acc_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_dpmpyuu_acc_s0">; + +def int_hexagon_M2_mpy_rnd_hl_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_hl_s0">; + +def int_hexagon_M2_mpy_rnd_hl_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_hl_s1">; + +def int_hexagon_F2_sffms_lib : +Hexagon_float_floatfloatfloat_Intrinsic<"HEXAGON_F2_sffms_lib">; + +def int_hexagon_C4_cmpneqi : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmpneqi">; + +def int_hexagon_M4_and_xor : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_and_xor">; + +def int_hexagon_A2_sat : +Hexagon_i32_i64_Intrinsic<"HEXAGON_A2_sat">; + +def int_hexagon_M2_mpyd_nac_lh_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_lh_s1">; + +def int_hexagon_M2_mpyd_nac_lh_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_lh_s0">; + +def int_hexagon_A2_addsat : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addsat">; + +def int_hexagon_A2_svavghs : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svavghs">; + +def int_hexagon_A2_vrsadub_acc : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_A2_vrsadub_acc">; + +def int_hexagon_C2_bitsclri : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_bitsclri">; + +def int_hexagon_A2_subh_h16_sat_hh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_sat_hh">; + +def int_hexagon_A2_subh_h16_sat_hl : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_sat_hl">; + +def int_hexagon_M2_mmaculs_rs0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmaculs_rs0">; + +def int_hexagon_M2_mmaculs_rs1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmaculs_rs1">; + +def int_hexagon_M2_vradduh : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_M2_vradduh">; + +def int_hexagon_A4_addp_c : +Hexagon_i64i32_i64i64i32_Intrinsic<"HEXAGON_A4_addp_c">; + +def int_hexagon_C2_xor : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_xor">; + def int_hexagon_S2_lsl_r_r_acc : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_p_acc,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asr_r_p_acc : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_p_acc,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asl_r_p_acc : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_acc,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsr_r_p_acc : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_acc,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsl_r_p_acc : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_r_nac,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asr_r_r_nac : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_r_nac,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asl_r_r_nac : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_nac,SI_ftype_SISISI,3) -// +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsl_r_r_acc">; + +def int_hexagon_M2_mmpyh_rs1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyh_rs1">; + +def int_hexagon_M2_mmpyh_rs0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyh_rs0">; + +def int_hexagon_F2_conv_df2ud_chop : +Hexagon_i64_double_Intrinsic<"HEXAGON_F2_conv_df2ud_chop">; + +def int_hexagon_C4_or_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_or_or">; + +def int_hexagon_S4_vxaddsubhr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxaddsubhr">; + +def int_hexagon_S2_vsathub : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vsathub">; + +def int_hexagon_F2_conv_df2sf : +Hexagon_float_double_Intrinsic<"HEXAGON_F2_conv_df2sf">; + +def int_hexagon_M2_hmmpyh_rs1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_hmmpyh_rs1">; + +def int_hexagon_M2_hmmpyh_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_hmmpyh_s1">; + +def int_hexagon_A2_vavgwr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgwr">; + +def int_hexagon_S2_tableidxh_goodsyntax : +Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxh_goodsyntax">; + +def int_hexagon_A2_sxth : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_sxth">; + +def int_hexagon_A2_sxtb : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_sxtb">; + +def int_hexagon_C4_or_orn : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_or_orn">; + +def int_hexagon_M2_vrcmaci_s0c : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vrcmaci_s0c">; + +def int_hexagon_A2_sxtw : +Hexagon_i64_i32_Intrinsic<"HEXAGON_A2_sxtw">; + +def int_hexagon_M2_vabsdiffh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vabsdiffh">; + +def int_hexagon_M2_mpy_acc_lh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_lh_s1">; + +def int_hexagon_M2_mpy_acc_lh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_lh_s0">; + +def int_hexagon_M2_hmmpyl_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_hmmpyl_s1">; + +def int_hexagon_S2_cl1p : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_cl1p">; + +def int_hexagon_M2_vabsdiffw : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vabsdiffw">; + +def int_hexagon_A4_andnp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A4_andnp">; + +def int_hexagon_C2_vmux : +Hexagon_i64_i32i64i64_Intrinsic<"HEXAGON_C2_vmux">; + +def int_hexagon_S2_parityp : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_S2_parityp">; + +def int_hexagon_S2_lsr_i_p_and : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_and">; + +def int_hexagon_S2_asr_i_r_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_or">; + +def int_hexagon_M2_mpyu_nac_ll_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_ll_s0">; + +def int_hexagon_M2_mpyu_nac_ll_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_ll_s1">; + +def int_hexagon_F2_sfcmpeq : +Hexagon_i32_floatfloat_Intrinsic<"HEXAGON_F2_sfcmpeq">; + +def int_hexagon_A2_vaddb_map : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddb_map">; + def int_hexagon_S2_lsr_r_r_nac : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_nac,SI_ftype_SISISI,3) -// -def int_hexagon_S2_lsl_r_r_nac : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_p_nac,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asr_r_p_nac : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_p_nac,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asl_r_p_nac : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_nac,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsr_r_p_nac : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_nac,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsl_r_p_nac : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_r_and,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asr_r_r_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_and">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_r_and,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asl_r_r_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_and">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_and,SI_ftype_SISISI,3) -// -def int_hexagon_S2_lsr_r_r_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_and">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_and,SI_ftype_SISISI,3) -// -def int_hexagon_S2_lsl_r_r_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_and">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_r_or,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asr_r_r_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_r_r_or">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_r_or,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asl_r_r_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_r_r_or">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_r_or,SI_ftype_SISISI,3) -// -def int_hexagon_S2_lsr_r_r_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_r_r_or">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_r_or,SI_ftype_SISISI,3) -// -def int_hexagon_S2_lsl_r_r_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsl_r_r_or">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_p_and,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asr_r_p_and : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_and">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_p_and,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asl_r_p_and : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_and">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_and,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsr_r_p_and : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_and">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_and,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsl_r_p_and : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_and">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_p_or,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asr_r_p_or : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_or">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_p_or,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asl_r_p_or : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_or">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_or,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsr_r_p_or : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_or">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_or,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsl_r_p_or : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_or">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_p_xor,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asr_r_p_xor : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_r_p_xor">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_p_xor,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asl_r_p_xor : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_r_p_xor">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_p_xor,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsr_r_p_xor : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_r_p_xor">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_p_xor,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsl_r_p_xor : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsl_r_p_xor">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_r_sat,SI_ftype_SISI,2) -// -def int_hexagon_S2_asr_r_r_sat : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_r_r_sat">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_r_sat,SI_ftype_SISI,2) -// -def int_hexagon_S2_asl_r_r_sat : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_r_r_sat">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_r,SI_ftype_SISI,2) -// -def int_hexagon_S2_asr_i_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_i_r">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_r,SI_ftype_SISI,2) -// -def int_hexagon_S2_lsr_i_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_lsr_i_r">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_r,SI_ftype_SISI,2) -// -def int_hexagon_S2_asl_i_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_i_r">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_p,DI_ftype_DISI,2) -// -def int_hexagon_S2_asr_i_p : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_p">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_p,DI_ftype_DISI,2) -// -def int_hexagon_S2_lsr_i_p : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_i_p">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_p,DI_ftype_DISI,2) -// -def int_hexagon_S2_asl_i_p : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_i_p">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_r_acc,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asr_i_r_acc : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_acc,SI_ftype_SISISI,3) -// -def int_hexagon_S2_lsr_i_r_acc : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_r_acc,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asl_i_r_acc : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_p_acc,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asr_i_p_acc : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_acc,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsr_i_p_acc : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_p_acc,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asl_i_p_acc : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_acc">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_r_nac,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asr_i_r_nac : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_nac,SI_ftype_SISISI,3) -// -def int_hexagon_S2_lsr_i_r_nac : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_r_nac,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asl_i_r_nac : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_p_nac,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asr_i_p_nac : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_nac,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsr_i_p_nac : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_p_nac,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asl_i_p_nac : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_nac">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_xacc,SI_ftype_SISISI,3) -// -def int_hexagon_S2_lsr_i_r_xacc : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_xacc">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_r_xacc,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asl_i_r_xacc : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_xacc">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_xacc,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsr_i_p_xacc : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_xacc">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_p_xacc,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asl_i_p_xacc : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_xacc">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_r_and,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asr_i_r_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_and">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_and,SI_ftype_SISISI,3) -// -def int_hexagon_S2_lsr_i_r_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_and">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_r_and,SI_ftype_SISISI,3) -// +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_r_r_nac">; + +def int_hexagon_A2_vcmpheq : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpheq">; + +def int_hexagon_S2_clbnorm : +Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_clbnorm">; + +def int_hexagon_M2_cnacsc_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cnacsc_s1">; + +def int_hexagon_M2_cnacsc_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cnacsc_s0">; + +def int_hexagon_S4_subaddi : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subaddi">; + +def int_hexagon_M2_mpyud_nac_hl_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_hl_s1">; + +def int_hexagon_M2_mpyud_nac_hl_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_hl_s0">; + +def int_hexagon_S5_vasrhrnd_goodsyntax : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S5_vasrhrnd_goodsyntax">; + +def int_hexagon_S2_tstbit_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_tstbit_r">; + +def int_hexagon_S4_vrcrotate : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S4_vrcrotate">; + +def int_hexagon_M2_mmachs_s1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmachs_s1">; + +def int_hexagon_M2_mmachs_s0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmachs_s0">; + +def int_hexagon_S2_tstbit_i : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_tstbit_i">; + +def int_hexagon_M2_mpy_up_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_up_s1">; + +def int_hexagon_S2_extractu_rp : +Hexagon_i32_i32i64_Intrinsic<"HEXAGON_S2_extractu_rp">; + +def int_hexagon_M2_mmpyuh_rs0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyuh_rs0">; + +def int_hexagon_S2_lsr_i_vw : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_vw">; + +def int_hexagon_M2_mpy_rnd_ll_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_ll_s0">; + +def int_hexagon_M2_mpy_rnd_ll_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_ll_s1">; + +def int_hexagon_M4_or_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_or_or">; + +def int_hexagon_M2_mpyu_hh_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_hh_s1">; + +def int_hexagon_M2_mpyu_hh_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_hh_s0">; + +def int_hexagon_S2_asl_r_p_acc : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_r_p_acc">; + +def int_hexagon_M2_mpyu_nac_lh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_lh_s0">; + +def int_hexagon_M2_mpyu_nac_lh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_lh_s1">; + +def int_hexagon_M2_mpy_sat_ll_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_ll_s0">; + +def int_hexagon_M2_mpy_sat_ll_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_ll_s1">; + +def int_hexagon_F2_conv_w2df : +Hexagon_double_i32_Intrinsic<"HEXAGON_F2_conv_w2df">; + +def int_hexagon_A2_subh_l16_sat_hl : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_l16_sat_hl">; + +def int_hexagon_C2_cmpeqi : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpeqi">; + def int_hexagon_S2_asl_i_r_and : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_and">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_r_or,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asr_i_r_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asr_i_r_or">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_r_or,SI_ftype_SISISI,3) -// -def int_hexagon_S2_lsr_i_r_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_lsr_i_r_or">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_r_or,SI_ftype_SISISI,3) -// -def int_hexagon_S2_asl_i_r_or : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_asl_i_r_or">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_p_and,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asr_i_p_and : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_and">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_and,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsr_i_p_and : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_and">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_p_and,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asl_i_p_and : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_and">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_p_or,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asr_i_p_or : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asr_i_p_or">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_p_or,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_lsr_i_p_or : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_lsr_i_p_or">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_p_or,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_asl_i_p_or : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_asl_i_p_or">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_r_sat,SI_ftype_SISI,2) -// -def int_hexagon_S2_asl_i_r_sat : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asl_i_r_sat">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_r_rnd,SI_ftype_SISI,2) -// -def int_hexagon_S2_asr_i_r_rnd : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_i_r_rnd">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_r_rnd_goodsyntax,SI_ftype_SISI,2) -// -def int_hexagon_S2_asr_i_r_rnd_goodsyntax : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_asr_i_r_rnd_goodsyntax">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_p_rnd,DI_ftype_DISI,2) -// -def int_hexagon_S2_asr_i_p_rnd : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_p_rnd">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_p_rnd_goodsyntax,DI_ftype_DISI,2) -// -def int_hexagon_S2_asr_i_p_rnd_goodsyntax : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_p_rnd_goodsyntax">; -// -// BUILTIN_INFO(HEXAGON.S4_lsli,SI_ftype_SISI,2) -// -def int_hexagon_S4_lsli : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_lsli">; -// -// BUILTIN_INFO(HEXAGON.S2_addasl_rrri,SI_ftype_SISISI,3) -// -def int_hexagon_S2_addasl_rrri : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_addasl_rrri">; -// -// BUILTIN_INFO(HEXAGON.S4_andi_asl_ri,SI_ftype_SISISI,3) -// -def int_hexagon_S4_andi_asl_ri : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_andi_asl_ri">; -// -// BUILTIN_INFO(HEXAGON.S4_ori_asl_ri,SI_ftype_SISISI,3) -// -def int_hexagon_S4_ori_asl_ri : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_ori_asl_ri">; -// -// BUILTIN_INFO(HEXAGON.S4_addi_asl_ri,SI_ftype_SISISI,3) -// -def int_hexagon_S4_addi_asl_ri : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_addi_asl_ri">; -// -// BUILTIN_INFO(HEXAGON.S4_subi_asl_ri,SI_ftype_SISISI,3) -// -def int_hexagon_S4_subi_asl_ri : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_subi_asl_ri">; -// -// BUILTIN_INFO(HEXAGON.S4_andi_lsr_ri,SI_ftype_SISISI,3) -// -def int_hexagon_S4_andi_lsr_ri : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_andi_lsr_ri">; -// -// BUILTIN_INFO(HEXAGON.S4_ori_lsr_ri,SI_ftype_SISISI,3) -// -def int_hexagon_S4_ori_lsr_ri : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_ori_lsr_ri">; -// -// BUILTIN_INFO(HEXAGON.S4_addi_lsr_ri,SI_ftype_SISISI,3) -// -def int_hexagon_S4_addi_lsr_ri : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_addi_lsr_ri">; -// -// BUILTIN_INFO(HEXAGON.S4_subi_lsr_ri,SI_ftype_SISISI,3) -// -def int_hexagon_S4_subi_lsr_ri : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_subi_lsr_ri">; -// -// BUILTIN_INFO(HEXAGON.S2_valignib,DI_ftype_DIDISI,3) -// +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_and">; + +def int_hexagon_S2_vcnegh : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_vcnegh">; + +def int_hexagon_A4_vcmpweqi : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpweqi">; + +def int_hexagon_M2_vdmpyrs_s0 : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_M2_vdmpyrs_s0">; + +def int_hexagon_M2_vdmpyrs_s1 : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_M2_vdmpyrs_s1">; + +def int_hexagon_M4_xor_xacc : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M4_xor_xacc">; + +def int_hexagon_M2_vdmpys_s1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vdmpys_s1">; + +def int_hexagon_M2_vdmpys_s0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vdmpys_s0">; + +def int_hexagon_A2_vavgubr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgubr">; + +def int_hexagon_M2_mpyu_hl_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_hl_s1">; + +def int_hexagon_M2_mpyu_hl_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_hl_s0">; + +def int_hexagon_S2_asl_r_r_acc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_r_r_acc">; + +def int_hexagon_S2_cl0p : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_cl0p">; + def int_hexagon_S2_valignib : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_valignib">; -// -// BUILTIN_INFO(HEXAGON.S2_valignrb,DI_ftype_DIDIQI,3) -// -def int_hexagon_S2_valignrb : -Hexagon_di_didiqi_Intrinsic<"HEXAGON_S2_valignrb">; -// -// BUILTIN_INFO(HEXAGON.S2_vspliceib,DI_ftype_DIDISI,3) -// -def int_hexagon_S2_vspliceib : -Hexagon_di_didisi_Intrinsic<"HEXAGON_S2_vspliceib">; -// -// BUILTIN_INFO(HEXAGON.S2_vsplicerb,DI_ftype_DIDIQI,3) -// -def int_hexagon_S2_vsplicerb : -Hexagon_di_didiqi_Intrinsic<"HEXAGON_S2_vsplicerb">; -// -// BUILTIN_INFO(HEXAGON.S2_vsplatrh,DI_ftype_SI,1) -// -def int_hexagon_S2_vsplatrh : -Hexagon_di_si_Intrinsic<"HEXAGON_S2_vsplatrh">; -// -// BUILTIN_INFO(HEXAGON.S2_vsplatrb,SI_ftype_SI,1) -// -def int_hexagon_S2_vsplatrb : -Hexagon_si_si_Intrinsic<"HEXAGON_S2_vsplatrb">; -// -// BUILTIN_INFO(HEXAGON.S2_insert,SI_ftype_SISISISI,4) -// -def int_hexagon_S2_insert : -Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_insert">; -// -// BUILTIN_INFO(HEXAGON.S2_tableidxb_goodsyntax,SI_ftype_SISISISI,4) -// -def int_hexagon_S2_tableidxb_goodsyntax : -Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxb_goodsyntax">; -// -// BUILTIN_INFO(HEXAGON.S2_tableidxh_goodsyntax,SI_ftype_SISISISI,4) -// -def int_hexagon_S2_tableidxh_goodsyntax : -Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxh_goodsyntax">; -// -// BUILTIN_INFO(HEXAGON.S2_tableidxw_goodsyntax,SI_ftype_SISISISI,4) -// -def int_hexagon_S2_tableidxw_goodsyntax : -Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxw_goodsyntax">; -// -// BUILTIN_INFO(HEXAGON.S2_tableidxd_goodsyntax,SI_ftype_SISISISI,4) -// -def int_hexagon_S2_tableidxd_goodsyntax : -Hexagon_si_sisisisi_Intrinsic<"HEXAGON_S2_tableidxd_goodsyntax">; -// -// BUILTIN_INFO(HEXAGON.A4_bitspliti,DI_ftype_SISI,2) -// -def int_hexagon_A4_bitspliti : -Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_bitspliti">; -// -// BUILTIN_INFO(HEXAGON.A4_bitsplit,DI_ftype_SISI,2) -// -def int_hexagon_A4_bitsplit : -Hexagon_di_sisi_Intrinsic<"HEXAGON_A4_bitsplit">; -// -// BUILTIN_INFO(HEXAGON.S4_extract,SI_ftype_SISISI,3) -// -def int_hexagon_S4_extract : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S4_extract">; -// -// BUILTIN_INFO(HEXAGON.S2_extractu,SI_ftype_SISISI,3) -// -def int_hexagon_S2_extractu : -Hexagon_si_sisisi_Intrinsic<"HEXAGON_S2_extractu">; -// -// BUILTIN_INFO(HEXAGON.S2_insertp,DI_ftype_DIDISISI,4) -// -def int_hexagon_S2_insertp : -Hexagon_di_didisisi_Intrinsic<"HEXAGON_S2_insertp">; -// -// BUILTIN_INFO(HEXAGON.S4_extractp,DI_ftype_DISISI,3) -// -def int_hexagon_S4_extractp : -Hexagon_di_disisi_Intrinsic<"HEXAGON_S4_extractp">; -// -// BUILTIN_INFO(HEXAGON.S2_extractup,DI_ftype_DISISI,3) -// -def int_hexagon_S2_extractup : -Hexagon_di_disisi_Intrinsic<"HEXAGON_S2_extractup">; -// -// BUILTIN_INFO(HEXAGON.S2_insert_rp,SI_ftype_SISIDI,3) -// -def int_hexagon_S2_insert_rp : -Hexagon_si_sisidi_Intrinsic<"HEXAGON_S2_insert_rp">; -// -// BUILTIN_INFO(HEXAGON.S4_extract_rp,SI_ftype_SIDI,2) -// +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_valignib">; + +def int_hexagon_F2_sffixupd : +Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sffixupd">; + +def int_hexagon_M2_mpy_sat_rnd_hl_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hl_s1">; + +def int_hexagon_M2_mpy_sat_rnd_hl_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hl_s0">; + +def int_hexagon_M2_cmacsc_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmacsc_s0">; + +def int_hexagon_M2_cmacsc_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmacsc_s1">; + +def int_hexagon_S2_ct1 : +Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_ct1">; + +def int_hexagon_S2_ct0 : +Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_ct0">; + +def int_hexagon_M2_dpmpyuu_nac_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_dpmpyuu_nac_s0">; + +def int_hexagon_M2_mmpyul_rs1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyul_rs1">; + +def int_hexagon_S4_ntstbit_i : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_ntstbit_i">; + +def int_hexagon_F2_sffixupr : +Hexagon_float_float_Intrinsic<"HEXAGON_F2_sffixupr">; + +def int_hexagon_S2_asr_r_p_xor : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_r_p_xor">; + +def int_hexagon_M2_mpyud_acc_hl_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_hl_s0">; + +def int_hexagon_M2_mpyud_acc_hl_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_hl_s1">; + +def int_hexagon_A2_vcmphgtu : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmphgtu">; + +def int_hexagon_C2_andn : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_andn">; + +def int_hexagon_M2_vmpy2s_s0pack : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_vmpy2s_s0pack">; + +def int_hexagon_S4_addaddi : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addaddi">; + +def int_hexagon_M2_mpyd_acc_ll_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_ll_s0">; + +def int_hexagon_M2_mpy_acc_sat_hl_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hl_s1">; + +def int_hexagon_A4_rcmpeqi : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpeqi">; + +def int_hexagon_M4_xor_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_xor_and">; + +def int_hexagon_S2_asl_i_p_and : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_and">; + +def int_hexagon_M2_mmpyuh_rs1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyuh_rs1">; + +def int_hexagon_S2_asr_r_r_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_r_r_or">; + +def int_hexagon_A4_round_ri : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_ri">; + +def int_hexagon_A2_max : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_max">; + +def int_hexagon_A4_round_rr : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_rr">; + +def int_hexagon_A4_combineii : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_combineii">; + +def int_hexagon_A4_combineir : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_combineir">; + +def int_hexagon_C4_and_orn : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_and_orn">; + +def int_hexagon_M5_vmacbuu : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M5_vmacbuu">; + +def int_hexagon_A4_rcmpeq : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpeq">; + +def int_hexagon_M4_cmpyr_whc : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_M4_cmpyr_whc">; + +def int_hexagon_S2_lsr_i_r_acc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_acc">; + +def int_hexagon_S2_vzxtbh : +Hexagon_i64_i32_Intrinsic<"HEXAGON_S2_vzxtbh">; + +def int_hexagon_M2_mmacuhs_rs1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacuhs_rs1">; + +def int_hexagon_S2_asr_r_r_sat : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_r_r_sat">; + +def int_hexagon_A2_combinew : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A2_combinew">; + +def int_hexagon_M2_mpy_acc_ll_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_ll_s1">; + +def int_hexagon_M2_mpy_acc_ll_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_ll_s0">; + +def int_hexagon_M2_cmpyi_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpyi_s0">; + +def int_hexagon_S2_asl_r_p_or : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_r_p_or">; + +def int_hexagon_S4_ori_asl_ri : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_ori_asl_ri">; + +def int_hexagon_C4_nbitsset : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_nbitsset">; + +def int_hexagon_M2_mpyu_acc_hh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_hh_s1">; + +def int_hexagon_M2_mpyu_acc_hh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_hh_s0">; + +def int_hexagon_M2_mpyu_ll_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_ll_s1">; + +def int_hexagon_M2_mpyu_ll_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_ll_s0">; + +def int_hexagon_A2_addh_l16_ll : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_l16_ll">; + +def int_hexagon_S2_lsr_r_r_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_r_r_and">; + +def int_hexagon_A4_modwrapu : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_modwrapu">; + +def int_hexagon_A4_rcmpneq : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpneq">; + +def int_hexagon_M2_mpyd_acc_hh_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_hh_s0">; + +def int_hexagon_M2_mpyd_acc_hh_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_hh_s1">; + +def int_hexagon_F2_sfimm_p : +Hexagon_float_i32_Intrinsic<"HEXAGON_F2_sfimm_p">; + +def int_hexagon_F2_sfimm_n : +Hexagon_float_i32_Intrinsic<"HEXAGON_F2_sfimm_n">; + +def int_hexagon_M4_cmpyr_wh : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_M4_cmpyr_wh">; + +def int_hexagon_S2_lsl_r_p_and : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p_and">; + +def int_hexagon_A2_vavgub : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgub">; + +def int_hexagon_F2_conv_d2sf : +Hexagon_float_i64_Intrinsic<"HEXAGON_F2_conv_d2sf">; + +def int_hexagon_A2_vavguh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavguh">; + +def int_hexagon_A4_cmpbeqi : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbeqi">; + +def int_hexagon_F2_sfcmpuo : +Hexagon_i32_floatfloat_Intrinsic<"HEXAGON_F2_sfcmpuo">; + +def int_hexagon_A2_vavguw : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavguw">; + +def int_hexagon_S2_asr_i_p_nac : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_nac">; + +def int_hexagon_S2_vsatwh_nopack : +Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_vsatwh_nopack">; + +def int_hexagon_M2_mpyd_hh_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_hh_s0">; + +def int_hexagon_M2_mpyd_hh_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_hh_s1">; + +def int_hexagon_S2_lsl_r_p_or : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p_or">; + +def int_hexagon_A2_minu : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_minu">; + +def int_hexagon_M2_mpy_sat_lh_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_lh_s1">; + +def int_hexagon_M4_or_andn : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_or_andn">; + +def int_hexagon_A2_minp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_minp">; + +def int_hexagon_S4_or_andix : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_andix">; + +def int_hexagon_M2_mpy_rnd_lh_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_lh_s0">; + +def int_hexagon_M2_mpy_rnd_lh_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_rnd_lh_s1">; + +def int_hexagon_M2_mmpyuh_s0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyuh_s0">; + +def int_hexagon_M2_mmpyuh_s1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyuh_s1">; + +def int_hexagon_M2_mpy_acc_sat_lh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_lh_s0">; + +def int_hexagon_F2_sfcmpge : +Hexagon_i32_floatfloat_Intrinsic<"HEXAGON_F2_sfcmpge">; + +def int_hexagon_F2_sfmin : +Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sfmin">; + +def int_hexagon_F2_sfcmpgt : +Hexagon_i32_floatfloat_Intrinsic<"HEXAGON_F2_sfcmpgt">; + +def int_hexagon_M4_vpmpyh : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M4_vpmpyh">; + +def int_hexagon_M2_mmacuhs_rs0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacuhs_rs0">; + +def int_hexagon_M2_mpyd_rnd_lh_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_lh_s1">; + +def int_hexagon_M2_mpyd_rnd_lh_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_lh_s0">; + +def int_hexagon_A2_roundsat : +Hexagon_i32_i64_Intrinsic<"HEXAGON_A2_roundsat">; + +def int_hexagon_S2_ct1p : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_ct1p">; + def int_hexagon_S4_extract_rp : -Hexagon_si_sidi_Intrinsic<"HEXAGON_S4_extract_rp">; -// -// BUILTIN_INFO(HEXAGON.S2_extractu_rp,SI_ftype_SIDI,2) -// -def int_hexagon_S2_extractu_rp : -Hexagon_si_sidi_Intrinsic<"HEXAGON_S2_extractu_rp">; -// -// BUILTIN_INFO(HEXAGON.S2_insertp_rp,DI_ftype_DIDIDI,3) -// +Hexagon_i32_i32i64_Intrinsic<"HEXAGON_S4_extract_rp">; + +def int_hexagon_S2_lsl_r_r_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsl_r_r_or">; + +def int_hexagon_C4_cmplteui : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmplteui">; + +def int_hexagon_S4_addi_lsr_ri : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addi_lsr_ri">; + +def int_hexagon_A4_tfrcpp : +Hexagon_i64_i64_Intrinsic<"HEXAGON_A4_tfrcpp">; + +def int_hexagon_S2_asr_i_svw_trun : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S2_asr_i_svw_trun">; + +def int_hexagon_A4_cmphgti : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgti">; + +def int_hexagon_A4_vrminh : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrminh">; + +def int_hexagon_A4_vrminw : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrminw">; + +def int_hexagon_A4_cmphgtu : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgtu">; + def int_hexagon_S2_insertp_rp : -Hexagon_di_dididi_Intrinsic<"HEXAGON_S2_insertp_rp">; -// -// BUILTIN_INFO(HEXAGON.S4_extractp_rp,DI_ftype_DIDI,2) -// -def int_hexagon_S4_extractp_rp : -Hexagon_di_didi_Intrinsic<"HEXAGON_S4_extractp_rp">; -// -// BUILTIN_INFO(HEXAGON.S2_extractup_rp,DI_ftype_DIDI,2) -// -def int_hexagon_S2_extractup_rp : -Hexagon_di_didi_Intrinsic<"HEXAGON_S2_extractup_rp">; -// -// BUILTIN_INFO(HEXAGON.S2_tstbit_i,QI_ftype_SISI,2) -// -def int_hexagon_S2_tstbit_i : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_tstbit_i">; -// -// BUILTIN_INFO(HEXAGON.S4_ntstbit_i,QI_ftype_SISI,2) -// -def int_hexagon_S4_ntstbit_i : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_ntstbit_i">; -// -// BUILTIN_INFO(HEXAGON.S2_setbit_i,SI_ftype_SISI,2) -// -def int_hexagon_S2_setbit_i : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_setbit_i">; -// -// BUILTIN_INFO(HEXAGON.S2_togglebit_i,SI_ftype_SISI,2) -// -def int_hexagon_S2_togglebit_i : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_togglebit_i">; -// -// BUILTIN_INFO(HEXAGON.S2_clrbit_i,SI_ftype_SISI,2) -// -def int_hexagon_S2_clrbit_i : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_clrbit_i">; -// -// BUILTIN_INFO(HEXAGON.S2_tstbit_r,QI_ftype_SISI,2) -// -def int_hexagon_S2_tstbit_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_tstbit_r">; -// -// BUILTIN_INFO(HEXAGON.S4_ntstbit_r,QI_ftype_SISI,2) -// -def int_hexagon_S4_ntstbit_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_ntstbit_r">; -// -// BUILTIN_INFO(HEXAGON.S2_setbit_r,SI_ftype_SISI,2) -// -def int_hexagon_S2_setbit_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_setbit_r">; -// -// BUILTIN_INFO(HEXAGON.S2_togglebit_r,SI_ftype_SISI,2) -// -def int_hexagon_S2_togglebit_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_togglebit_r">; -// -// BUILTIN_INFO(HEXAGON.S2_clrbit_r,SI_ftype_SISI,2) -// -def int_hexagon_S2_clrbit_r : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S2_clrbit_r">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_vh,DI_ftype_DISI,2) -// -def int_hexagon_S2_asr_i_vh : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_vh">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_vh,DI_ftype_DISI,2) -// -def int_hexagon_S2_lsr_i_vh : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_i_vh">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_vh,DI_ftype_DISI,2) -// -def int_hexagon_S2_asl_i_vh : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_i_vh">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_vh,DI_ftype_DISI,2) -// -def int_hexagon_S2_asr_r_vh : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_r_vh">; -// -// BUILTIN_INFO(HEXAGON.S5_asrhub_rnd_sat_goodsyntax,SI_ftype_DISI,2) -// -def int_hexagon_S5_asrhub_rnd_sat_goodsyntax : -Hexagon_si_disi_Intrinsic<"HEXAGON_S5_asrhub_rnd_sat_goodsyntax">; -// -// BUILTIN_INFO(HEXAGON.S5_asrhub_sat,SI_ftype_DISI,2) -// -def int_hexagon_S5_asrhub_sat : -Hexagon_si_disi_Intrinsic<"HEXAGON_S5_asrhub_sat">; -// -// BUILTIN_INFO(HEXAGON.S5_vasrhrnd_goodsyntax,DI_ftype_DISI,2) -// -def int_hexagon_S5_vasrhrnd_goodsyntax : -Hexagon_di_disi_Intrinsic<"HEXAGON_S5_vasrhrnd_goodsyntax">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_vh,DI_ftype_DISI,2) -// -def int_hexagon_S2_asl_r_vh : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_r_vh">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_vh,DI_ftype_DISI,2) -// -def int_hexagon_S2_lsr_r_vh : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_r_vh">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_vh,DI_ftype_DISI,2) -// +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_S2_insertp_rp">; + +def int_hexagon_A2_vnavghcr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavghcr">; + +def int_hexagon_S4_subi_asl_ri : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subi_asl_ri">; + def int_hexagon_S2_lsl_r_vh : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsl_r_vh">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_vw,DI_ftype_DISI,2) -// -def int_hexagon_S2_asr_i_vw : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_i_vw">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_i_svw_trun,SI_ftype_DISI,2) -// -def int_hexagon_S2_asr_i_svw_trun : -Hexagon_si_disi_Intrinsic<"HEXAGON_S2_asr_i_svw_trun">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_svw_trun,SI_ftype_DISI,2) -// -def int_hexagon_S2_asr_r_svw_trun : -Hexagon_si_disi_Intrinsic<"HEXAGON_S2_asr_r_svw_trun">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_i_vw,DI_ftype_DISI,2) -// -def int_hexagon_S2_lsr_i_vw : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_i_vw">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_i_vw,DI_ftype_DISI,2) -// -def int_hexagon_S2_asl_i_vw : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_i_vw">; -// -// BUILTIN_INFO(HEXAGON.S2_asr_r_vw,DI_ftype_DISI,2) -// +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsl_r_vh">; + +def int_hexagon_M2_mpy_hh_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_hh_s0">; + +def int_hexagon_A2_vsubws : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubws">; + +def int_hexagon_A2_sath : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_sath">; + +def int_hexagon_S2_asl_r_p_xor : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_r_p_xor">; + +def int_hexagon_A2_satb : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_satb">; + +def int_hexagon_C2_cmpltu : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpltu">; + +def int_hexagon_S2_insertp : +Hexagon_i64_i64i64i32i32_Intrinsic<"HEXAGON_S2_insertp">; + +def int_hexagon_M2_mpyd_rnd_ll_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_ll_s1">; + +def int_hexagon_M2_mpyd_rnd_ll_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_ll_s0">; + +def int_hexagon_S2_lsr_i_p_nac : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_nac">; + +def int_hexagon_S2_extractup_rp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_extractup_rp">; + +def int_hexagon_S4_vxaddsubw : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxaddsubw">; + +def int_hexagon_S4_vxaddsubh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_vxaddsubh">; + +def int_hexagon_A2_asrh : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_asrh">; + +def int_hexagon_S4_extractp_rp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S4_extractp_rp">; + +def int_hexagon_S2_lsr_r_r_acc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_r_r_acc">; + +def int_hexagon_M2_mpyd_nac_ll_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_ll_s1">; + +def int_hexagon_M2_mpyd_nac_ll_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_ll_s0">; + +def int_hexagon_C2_or : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_or">; + +def int_hexagon_M2_mmpyul_s1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyul_s1">; + +def int_hexagon_M2_vrcmacr_s0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vrcmacr_s0">; + +def int_hexagon_A2_xor : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_xor">; + +def int_hexagon_A2_add : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_add">; + +def int_hexagon_A2_vsububs : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsububs">; + +def int_hexagon_M2_vmpy2s_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_vmpy2s_s1">; + +def int_hexagon_M2_vmpy2s_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_vmpy2s_s0">; + +def int_hexagon_A2_vraddub_acc : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_A2_vraddub_acc">; + +def int_hexagon_F2_sfinvsqrta : +Hexagon_floati32_float_Intrinsic<"HEXAGON_F2_sfinvsqrta">; + +def int_hexagon_S2_ct0p : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_ct0p">; + +def int_hexagon_A2_svaddh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svaddh">; + +def int_hexagon_S2_vcrotate : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_vcrotate">; + +def int_hexagon_A2_aslh : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_aslh">; + +def int_hexagon_A2_subh_h16_lh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_lh">; + +def int_hexagon_A2_subh_h16_ll : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_ll">; + +def int_hexagon_M2_hmmpyl_rs1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_hmmpyl_rs1">; + +def int_hexagon_S2_asr_r_p : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_r_p">; + +def int_hexagon_S2_vsplatrh : +Hexagon_i64_i32_Intrinsic<"HEXAGON_S2_vsplatrh">; + +def int_hexagon_S2_asr_r_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_r_r">; + +def int_hexagon_A2_addh_h16_hl : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_hl">; + +def int_hexagon_S2_vsplatrb : +Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_vsplatrb">; + +def int_hexagon_A2_addh_h16_hh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_hh">; + +def int_hexagon_M2_cmpyr_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpyr_s0">; + +def int_hexagon_M2_dpmpyss_rnd_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_dpmpyss_rnd_s0">; + +def int_hexagon_C2_muxri : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxri">; + +def int_hexagon_M2_vmac2es_s0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vmac2es_s0">; + +def int_hexagon_M2_vmac2es_s1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vmac2es_s1">; + +def int_hexagon_C2_pxfer_map : +Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_pxfer_map">; + +def int_hexagon_M2_mpyu_lh_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_lh_s1">; + +def int_hexagon_M2_mpyu_lh_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpyu_lh_s0">; + +def int_hexagon_S2_asl_i_r_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_or">; + +def int_hexagon_M2_mpyd_acc_hl_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_hl_s0">; + +def int_hexagon_M2_mpyd_acc_hl_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_acc_hl_s1">; + +def int_hexagon_S2_asr_r_p_nac : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_r_p_nac">; + +def int_hexagon_A2_vaddw : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddw">; + +def int_hexagon_S2_asr_i_r_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_and">; + +def int_hexagon_A2_vaddh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddh">; + +def int_hexagon_M2_mpy_nac_sat_lh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_lh_s1">; + +def int_hexagon_M2_mpy_nac_sat_lh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_lh_s0">; + +def int_hexagon_C2_cmpeqp : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_C2_cmpeqp">; + +def int_hexagon_M4_mpyri_addi : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addi">; + +def int_hexagon_A2_not : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_not">; + +def int_hexagon_S4_andi_lsr_ri : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_andi_lsr_ri">; + +def int_hexagon_M2_macsip : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_macsip">; + +def int_hexagon_A2_tfrcrr : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_tfrcrr">; + +def int_hexagon_M2_macsin : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_macsin">; + +def int_hexagon_C2_orn : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_orn">; + +def int_hexagon_M4_and_andn : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_and_andn">; + +def int_hexagon_F2_sfmpy : +Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sfmpy">; + +def int_hexagon_M2_mpyud_nac_hh_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_hh_s1">; + +def int_hexagon_M2_mpyud_nac_hh_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_hh_s0">; + +def int_hexagon_S2_lsr_r_p_acc : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p_acc">; + def int_hexagon_S2_asr_r_vw : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asr_r_vw">; -// -// BUILTIN_INFO(HEXAGON.S2_asl_r_vw,DI_ftype_DISI,2) -// -def int_hexagon_S2_asl_r_vw : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_asl_r_vw">; -// -// BUILTIN_INFO(HEXAGON.S2_lsr_r_vw,DI_ftype_DISI,2) -// -def int_hexagon_S2_lsr_r_vw : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsr_r_vw">; -// -// BUILTIN_INFO(HEXAGON.S2_lsl_r_vw,DI_ftype_DISI,2) -// -def int_hexagon_S2_lsl_r_vw : -Hexagon_di_disi_Intrinsic<"HEXAGON_S2_lsl_r_vw">; -// -// BUILTIN_INFO(HEXAGON.S2_vrndpackwh,SI_ftype_DI,1) -// -def int_hexagon_S2_vrndpackwh : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_vrndpackwh">; -// -// BUILTIN_INFO(HEXAGON.S2_vrndpackwhs,SI_ftype_DI,1) -// -def int_hexagon_S2_vrndpackwhs : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_vrndpackwhs">; -// -// BUILTIN_INFO(HEXAGON.S2_vsxtbh,DI_ftype_SI,1) -// -def int_hexagon_S2_vsxtbh : -Hexagon_di_si_Intrinsic<"HEXAGON_S2_vsxtbh">; -// -// BUILTIN_INFO(HEXAGON.S2_vzxtbh,DI_ftype_SI,1) -// -def int_hexagon_S2_vzxtbh : -Hexagon_di_si_Intrinsic<"HEXAGON_S2_vzxtbh">; -// -// BUILTIN_INFO(HEXAGON.S2_vsathub,SI_ftype_DI,1) -// -def int_hexagon_S2_vsathub : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsathub">; -// -// BUILTIN_INFO(HEXAGON.S2_svsathub,SI_ftype_SI,1) -// -def int_hexagon_S2_svsathub : -Hexagon_si_si_Intrinsic<"HEXAGON_S2_svsathub">; -// -// BUILTIN_INFO(HEXAGON.S2_svsathb,SI_ftype_SI,1) -// -def int_hexagon_S2_svsathb : -Hexagon_si_si_Intrinsic<"HEXAGON_S2_svsathb">; -// -// BUILTIN_INFO(HEXAGON.S2_vsathb,SI_ftype_DI,1) -// -def int_hexagon_S2_vsathb : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsathb">; -// -// BUILTIN_INFO(HEXAGON.S2_vtrunohb,SI_ftype_DI,1) -// -def int_hexagon_S2_vtrunohb : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_vtrunohb">; -// -// BUILTIN_INFO(HEXAGON.S2_vtrunewh,DI_ftype_DIDI,2) -// -def int_hexagon_S2_vtrunewh : -Hexagon_di_didi_Intrinsic<"HEXAGON_S2_vtrunewh">; -// -// BUILTIN_INFO(HEXAGON.S2_vtrunowh,DI_ftype_DIDI,2) -// -def int_hexagon_S2_vtrunowh : -Hexagon_di_didi_Intrinsic<"HEXAGON_S2_vtrunowh">; -// -// BUILTIN_INFO(HEXAGON.S2_vtrunehb,SI_ftype_DI,1) -// -def int_hexagon_S2_vtrunehb : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_vtrunehb">; -// -// BUILTIN_INFO(HEXAGON.S2_vsxthw,DI_ftype_SI,1) -// +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_r_vw">; + +def int_hexagon_M4_and_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_and_or">; + +def int_hexagon_S2_asr_r_vh : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_r_vh">; + +def int_hexagon_C2_mask : +Hexagon_i64_i32_Intrinsic<"HEXAGON_C2_mask">; + +def int_hexagon_M2_mpy_nac_hh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_hh_s0">; + +def int_hexagon_M2_mpy_nac_hh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_hh_s1">; + +def int_hexagon_M2_mpy_up_s1_sat : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_up_s1_sat">; + +def int_hexagon_A4_vcmpbgt : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A4_vcmpbgt">; + +def int_hexagon_M5_vrmacbsu : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M5_vrmacbsu">; + +def int_hexagon_S2_tableidxw_goodsyntax : +Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxw_goodsyntax">; + +def int_hexagon_A2_vrsadub : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vrsadub">; + +def int_hexagon_A2_tfrrcr : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_tfrrcr">; + +def int_hexagon_M2_vrcmpys_acc_s1 : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_M2_vrcmpys_acc_s1">; + +def int_hexagon_F2_dfcmpge : +Hexagon_i32_doubledouble_Intrinsic<"HEXAGON_F2_dfcmpge">; + +def int_hexagon_M2_accii : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_accii">; + +def int_hexagon_A5_vaddhubs : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A5_vaddhubs">; + +def int_hexagon_A2_vmaxw : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxw">; + +def int_hexagon_A2_vmaxb : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxb">; + +def int_hexagon_A2_vmaxh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxh">; + def int_hexagon_S2_vsxthw : -Hexagon_di_si_Intrinsic<"HEXAGON_S2_vsxthw">; -// -// BUILTIN_INFO(HEXAGON.S2_vzxthw,DI_ftype_SI,1) -// -def int_hexagon_S2_vzxthw : -Hexagon_di_si_Intrinsic<"HEXAGON_S2_vzxthw">; -// -// BUILTIN_INFO(HEXAGON.S2_vsatwh,SI_ftype_DI,1) -// -def int_hexagon_S2_vsatwh : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsatwh">; -// -// BUILTIN_INFO(HEXAGON.S2_vsatwuh,SI_ftype_DI,1) -// -def int_hexagon_S2_vsatwuh : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_vsatwuh">; -// -// BUILTIN_INFO(HEXAGON.S2_packhl,DI_ftype_SISI,2) -// -def int_hexagon_S2_packhl : -Hexagon_di_sisi_Intrinsic<"HEXAGON_S2_packhl">; -// -// BUILTIN_INFO(HEXAGON.A2_swiz,SI_ftype_SI,1) -// -def int_hexagon_A2_swiz : -Hexagon_si_si_Intrinsic<"HEXAGON_A2_swiz">; -// -// BUILTIN_INFO(HEXAGON.S2_vsathub_nopack,DI_ftype_DI,1) -// -def int_hexagon_S2_vsathub_nopack : -Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsathub_nopack">; -// -// BUILTIN_INFO(HEXAGON.S2_vsathb_nopack,DI_ftype_DI,1) -// -def int_hexagon_S2_vsathb_nopack : -Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsathb_nopack">; -// -// BUILTIN_INFO(HEXAGON.S2_vsatwh_nopack,DI_ftype_DI,1) -// -def int_hexagon_S2_vsatwh_nopack : -Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsatwh_nopack">; -// -// BUILTIN_INFO(HEXAGON.S2_vsatwuh_nopack,DI_ftype_DI,1) -// +Hexagon_i64_i32_Intrinsic<"HEXAGON_S2_vsxthw">; + +def int_hexagon_S4_andi_asl_ri : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_andi_asl_ri">; + +def int_hexagon_S2_asl_i_p_nac : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_nac">; + +def int_hexagon_S2_lsl_r_p_xor : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p_xor">; + +def int_hexagon_C2_cmpgt : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgt">; + +def int_hexagon_F2_conv_df2d_chop : +Hexagon_i64_double_Intrinsic<"HEXAGON_F2_conv_df2d_chop">; + +def int_hexagon_M2_mpyu_nac_hl_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_hl_s0">; + +def int_hexagon_M2_mpyu_nac_hl_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_hl_s1">; + +def int_hexagon_F2_conv_sf2w : +Hexagon_i32_float_Intrinsic<"HEXAGON_F2_conv_sf2w">; + +def int_hexagon_S2_lsr_r_p_or : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p_or">; + +def int_hexagon_F2_sfclass : +Hexagon_i32_floati32_Intrinsic<"HEXAGON_F2_sfclass">; + +def int_hexagon_M2_mpyud_acc_lh_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_lh_s0">; + +def int_hexagon_M4_xor_andn : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_xor_andn">; + +def int_hexagon_S2_addasl_rrri : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_addasl_rrri">; + +def int_hexagon_M5_vdmpybsu : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M5_vdmpybsu">; + +def int_hexagon_M2_mpyu_nac_hh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_hh_s0">; + +def int_hexagon_M2_mpyu_nac_hh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_nac_hh_s1">; + +def int_hexagon_A2_addi : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addi">; + +def int_hexagon_A2_addp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_addp">; + +def int_hexagon_M2_vmpy2s_s1pack : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_vmpy2s_s1pack">; + +def int_hexagon_S4_clbpnorm : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S4_clbpnorm">; + +def int_hexagon_A4_round_rr_sat : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_rr_sat">; + +def int_hexagon_M2_nacci : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_nacci">; + +def int_hexagon_S2_shuffeh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_shuffeh">; + +def int_hexagon_S2_lsr_i_r_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_and">; + +def int_hexagon_M2_mpy_sat_rnd_hh_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hh_s1">; + +def int_hexagon_M2_mpy_sat_rnd_hh_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_hh_s0">; + +def int_hexagon_F2_conv_sf2uw : +Hexagon_i32_float_Intrinsic<"HEXAGON_F2_conv_sf2uw">; + +def int_hexagon_A2_vsubh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubh">; + +def int_hexagon_F2_conv_sf2ud : +Hexagon_i64_float_Intrinsic<"HEXAGON_F2_conv_sf2ud">; + +def int_hexagon_A2_vsubw : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubw">; + +def int_hexagon_A2_vcmpwgt : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpwgt">; + +def int_hexagon_M4_xor_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_xor_or">; + +def int_hexagon_F2_conv_sf2uw_chop : +Hexagon_i32_float_Intrinsic<"HEXAGON_F2_conv_sf2uw_chop">; + +def int_hexagon_S2_asl_r_vw : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_r_vw">; + def int_hexagon_S2_vsatwuh_nopack : -Hexagon_di_di_Intrinsic<"HEXAGON_S2_vsatwuh_nopack">; -// -// BUILTIN_INFO(HEXAGON.S2_shuffob,DI_ftype_DIDI,2) -// +Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_vsatwuh_nopack">; + +def int_hexagon_S2_asl_r_vh : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_r_vh">; + +def int_hexagon_A2_svsubuhs : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svsubuhs">; + +def int_hexagon_M5_vmpybsu : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M5_vmpybsu">; + +def int_hexagon_A2_subh_l16_sat_ll : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_l16_sat_ll">; + +def int_hexagon_C4_and_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_and_and">; + +def int_hexagon_M2_mpyu_acc_hl_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_hl_s1">; + +def int_hexagon_M2_mpyu_acc_hl_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_hl_s0">; + +def int_hexagon_S2_lsr_r_p : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p">; + +def int_hexagon_S2_lsr_r_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_lsr_r_r">; + +def int_hexagon_A4_subp_c : +Hexagon_i64i32_i64i64i32_Intrinsic<"HEXAGON_A4_subp_c">; + +def int_hexagon_A2_vsubhs : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubhs">; + +def int_hexagon_C2_vitpack : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_vitpack">; + +def int_hexagon_A2_vavguhr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavguhr">; + +def int_hexagon_S2_vsplicerb : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_vsplicerb">; + +def int_hexagon_C4_nbitsclr : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_nbitsclr">; + +def int_hexagon_A2_vcmpbgtu : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpbgtu">; + +def int_hexagon_M2_cmpys_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpys_s1">; + +def int_hexagon_M2_cmpys_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_cmpys_s0">; + +def int_hexagon_F2_dfcmpuo : +Hexagon_i32_doubledouble_Intrinsic<"HEXAGON_F2_dfcmpuo">; + def int_hexagon_S2_shuffob : -Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffob">; -// -// BUILTIN_INFO(HEXAGON.S2_shuffeb,DI_ftype_DIDI,2) -// -def int_hexagon_S2_shuffeb : -Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffeb">; -// -// BUILTIN_INFO(HEXAGON.S2_shuffoh,DI_ftype_DIDI,2) -// -def int_hexagon_S2_shuffoh : -Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffoh">; -// -// BUILTIN_INFO(HEXAGON.S2_shuffeh,DI_ftype_DIDI,2) -// -def int_hexagon_S2_shuffeh : -Hexagon_di_didi_Intrinsic<"HEXAGON_S2_shuffeh">; -// -// BUILTIN_INFO(HEXAGON.S5_popcountp,SI_ftype_DI,1) -// +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_shuffob">; + +def int_hexagon_C2_and : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_and">; + def int_hexagon_S5_popcountp : -Hexagon_si_di_Intrinsic<"HEXAGON_S5_popcountp">; -// -// BUILTIN_INFO(HEXAGON.S4_parity,SI_ftype_SISI,2) -// -def int_hexagon_S4_parity : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_parity">; -// -// BUILTIN_INFO(HEXAGON.S2_parityp,SI_ftype_DIDI,2) -// -def int_hexagon_S2_parityp : -Hexagon_si_didi_Intrinsic<"HEXAGON_S2_parityp">; -// -// BUILTIN_INFO(HEXAGON.S2_lfsp,DI_ftype_DIDI,2) -// -def int_hexagon_S2_lfsp : -Hexagon_di_didi_Intrinsic<"HEXAGON_S2_lfsp">; -// -// BUILTIN_INFO(HEXAGON.S2_clbnorm,SI_ftype_SI,1) -// -def int_hexagon_S2_clbnorm : -Hexagon_si_si_Intrinsic<"HEXAGON_S2_clbnorm">; -// -// BUILTIN_INFO(HEXAGON.S4_clbaddi,SI_ftype_SISI,2) -// -def int_hexagon_S4_clbaddi : -Hexagon_si_sisi_Intrinsic<"HEXAGON_S4_clbaddi">; -// -// BUILTIN_INFO(HEXAGON.S4_clbpnorm,SI_ftype_DI,1) -// -def int_hexagon_S4_clbpnorm : -Hexagon_si_di_Intrinsic<"HEXAGON_S4_clbpnorm">; -// -// BUILTIN_INFO(HEXAGON.S4_clbpaddi,SI_ftype_DISI,2) -// -def int_hexagon_S4_clbpaddi : -Hexagon_si_disi_Intrinsic<"HEXAGON_S4_clbpaddi">; -// -// BUILTIN_INFO(HEXAGON.S2_clb,SI_ftype_SI,1) -// -def int_hexagon_S2_clb : -Hexagon_si_si_Intrinsic<"HEXAGON_S2_clb">; -// -// BUILTIN_INFO(HEXAGON.S2_cl0,SI_ftype_SI,1) -// +Hexagon_i32_i64_Intrinsic<"HEXAGON_S5_popcountp">; + +def int_hexagon_S4_extractp : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S4_extractp">; + def int_hexagon_S2_cl0 : -Hexagon_si_si_Intrinsic<"HEXAGON_S2_cl0">; -// -// BUILTIN_INFO(HEXAGON.S2_cl1,SI_ftype_SI,1) -// -def int_hexagon_S2_cl1 : -Hexagon_si_si_Intrinsic<"HEXAGON_S2_cl1">; -// -// BUILTIN_INFO(HEXAGON.S2_clbp,SI_ftype_DI,1) -// -def int_hexagon_S2_clbp : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_clbp">; -// -// BUILTIN_INFO(HEXAGON.S2_cl0p,SI_ftype_DI,1) -// -def int_hexagon_S2_cl0p : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_cl0p">; -// -// BUILTIN_INFO(HEXAGON.S2_cl1p,SI_ftype_DI,1) -// -def int_hexagon_S2_cl1p : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_cl1p">; -// -// BUILTIN_INFO(HEXAGON.S2_brev,SI_ftype_SI,1) -// +Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_cl0">; + +def int_hexagon_A4_vcmpbgti : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbgti">; + +def int_hexagon_M2_mmacls_s1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacls_s1">; + +def int_hexagon_M2_mmacls_s0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacls_s0">; + +def int_hexagon_C4_cmpneq : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmpneq">; + +def int_hexagon_M2_vmac2es : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vmac2es">; + +def int_hexagon_M2_vdmacs_s0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vdmacs_s0">; + +def int_hexagon_M2_vdmacs_s1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vdmacs_s1">; + +def int_hexagon_M2_mpyud_ll_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_ll_s0">; + +def int_hexagon_M2_mpyud_ll_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_ll_s1">; + +def int_hexagon_S2_clb : +Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_clb">; + +def int_hexagon_M2_mpy_nac_ll_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_ll_s0">; + +def int_hexagon_M2_mpy_nac_ll_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_ll_s1">; + +def int_hexagon_M2_mpyd_nac_hl_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_hl_s1">; + +def int_hexagon_M2_mpyd_nac_hl_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_hl_s0">; + +def int_hexagon_M2_maci : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_maci">; + +def int_hexagon_A2_vmaxuh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxuh">; + +def int_hexagon_A4_bitspliti : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_bitspliti">; + +def int_hexagon_A2_vmaxub : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxub">; + +def int_hexagon_M2_mpyud_hh_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_hh_s0">; + +def int_hexagon_M2_mpyud_hh_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_hh_s1">; + +def int_hexagon_M2_vrmac_s0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vrmac_s0">; + +def int_hexagon_M2_mpy_sat_lh_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_lh_s0">; + +def int_hexagon_S2_asl_r_r_sat : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_r_r_sat">; + +def int_hexagon_F2_conv_sf2d : +Hexagon_i64_float_Intrinsic<"HEXAGON_F2_conv_sf2d">; + +def int_hexagon_S2_asr_r_r_nac : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_r_r_nac">; + +def int_hexagon_F2_dfimm_n : +Hexagon_double_i32_Intrinsic<"HEXAGON_F2_dfimm_n">; + +def int_hexagon_A4_cmphgt : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgt">; + +def int_hexagon_F2_dfimm_p : +Hexagon_double_i32_Intrinsic<"HEXAGON_F2_dfimm_p">; + +def int_hexagon_M2_mpyud_acc_lh_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_acc_lh_s1">; + +def int_hexagon_M2_vcmpy_s1_sat_r : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vcmpy_s1_sat_r">; + +def int_hexagon_M4_mpyri_addr_u2 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addr_u2">; + +def int_hexagon_M2_vcmpy_s1_sat_i : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vcmpy_s1_sat_i">; + +def int_hexagon_S2_lsl_r_p_nac : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p_nac">; + +def int_hexagon_M5_vrmacbuu : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M5_vrmacbuu">; + +def int_hexagon_S5_asrhub_rnd_sat_goodsyntax : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S5_asrhub_rnd_sat_goodsyntax">; + +def int_hexagon_S2_vspliceib : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_vspliceib">; + +def int_hexagon_M2_dpmpyss_acc_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_dpmpyss_acc_s0">; + +def int_hexagon_M2_cnacs_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cnacs_s1">; + +def int_hexagon_M2_cnacs_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cnacs_s0">; + +def int_hexagon_A2_maxu : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_maxu">; + +def int_hexagon_A2_maxp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_maxp">; + +def int_hexagon_A2_andir : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_andir">; + +def int_hexagon_F2_sfrecipa : +Hexagon_floati32_floatfloat_Intrinsic<"HEXAGON_F2_sfrecipa">; + +def int_hexagon_A2_combineii : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A2_combineii">; + +def int_hexagon_A4_orn : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_orn">; + +def int_hexagon_A4_cmpbgtui : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgtui">; + +def int_hexagon_S2_lsr_r_r_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_r_r_or">; + +def int_hexagon_A4_vcmpbeqi : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbeqi">; + +def int_hexagon_S2_lsl_r_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_lsl_r_r">; + +def int_hexagon_S2_lsl_r_p : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsl_r_p">; + +def int_hexagon_A2_or : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_or">; + +def int_hexagon_F2_dfcmpeq : +Hexagon_i32_doubledouble_Intrinsic<"HEXAGON_F2_dfcmpeq">; + +def int_hexagon_C2_cmpeq : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpeq">; + +def int_hexagon_A2_tfrp : +Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_tfrp">; + +def int_hexagon_C4_and_andn : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_and_andn">; + +def int_hexagon_S2_vsathub_nopack : +Hexagon_i64_i64_Intrinsic<"HEXAGON_S2_vsathub_nopack">; + +def int_hexagon_A2_satuh : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_satuh">; + +def int_hexagon_A2_satub : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_satub">; + +def int_hexagon_M2_vrcmpys_s1 : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_M2_vrcmpys_s1">; + +def int_hexagon_S4_or_ori : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_ori">; + +def int_hexagon_C4_fastcorner9_not : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_fastcorner9_not">; + +def int_hexagon_A2_tfrih : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_tfrih">; + +def int_hexagon_A2_tfril : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_tfril">; + +def int_hexagon_M4_mpyri_addr : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addr">; + +def int_hexagon_S2_vtrunehb : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vtrunehb">; + +def int_hexagon_A2_vabsw : +Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_vabsw">; + +def int_hexagon_A2_vabsh : +Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_vabsh">; + +def int_hexagon_F2_sfsub : +Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sfsub">; + +def int_hexagon_C2_muxii : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxii">; + +def int_hexagon_C2_muxir : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxir">; + +def int_hexagon_A2_swiz : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_swiz">; + +def int_hexagon_S2_asr_i_p_and : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_and">; + +def int_hexagon_M2_cmpyrsc_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_cmpyrsc_s0">; + +def int_hexagon_M2_cmpyrsc_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_cmpyrsc_s1">; + +def int_hexagon_A2_vraddub : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vraddub">; + +def int_hexagon_A4_tlbmatch : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_tlbmatch">; + +def int_hexagon_F2_conv_df2w_chop : +Hexagon_i32_double_Intrinsic<"HEXAGON_F2_conv_df2w_chop">; + +def int_hexagon_A2_and : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_and">; + +def int_hexagon_S2_lsr_r_p_and : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p_and">; + +def int_hexagon_M2_mpy_nac_sat_ll_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_ll_s1">; + +def int_hexagon_M2_mpy_nac_sat_ll_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_ll_s0">; + +def int_hexagon_S4_extract : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_extract">; + +def int_hexagon_A2_vcmpweq : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpweq">; + +def int_hexagon_M2_acci : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_acci">; + +def int_hexagon_S2_lsr_i_p_acc : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_acc">; + +def int_hexagon_S2_lsr_i_p_or : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_or">; + +def int_hexagon_F2_conv_ud2sf : +Hexagon_float_i64_Intrinsic<"HEXAGON_F2_conv_ud2sf">; + +def int_hexagon_A2_tfr : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_tfr">; + +def int_hexagon_S2_asr_i_p_or : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_or">; + +def int_hexagon_A2_subri : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subri">; + +def int_hexagon_A4_vrmaxuw : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrmaxuw">; + +def int_hexagon_M5_vmpybuu : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M5_vmpybuu">; + +def int_hexagon_A4_vrmaxuh : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_A4_vrmaxuh">; + +def int_hexagon_S2_asl_i_vw : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_vw">; + +def int_hexagon_A2_vavgw : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgw">; + def int_hexagon_S2_brev : -Hexagon_si_si_Intrinsic<"HEXAGON_S2_brev">; -// -// BUILTIN_INFO(HEXAGON.S2_brevp,DI_ftype_DI,1) -// -def int_hexagon_S2_brevp : -Hexagon_di_di_Intrinsic<"HEXAGON_S2_brevp">; -// -// BUILTIN_INFO(HEXAGON.S2_ct0,SI_ftype_SI,1) -// -def int_hexagon_S2_ct0 : -Hexagon_si_si_Intrinsic<"HEXAGON_S2_ct0">; -// -// BUILTIN_INFO(HEXAGON.S2_ct1,SI_ftype_SI,1) -// -def int_hexagon_S2_ct1 : -Hexagon_si_si_Intrinsic<"HEXAGON_S2_ct1">; -// -// BUILTIN_INFO(HEXAGON.S2_ct0p,SI_ftype_DI,1) -// -def int_hexagon_S2_ct0p : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_ct0p">; -// -// BUILTIN_INFO(HEXAGON.S2_ct1p,SI_ftype_DI,1) -// -def int_hexagon_S2_ct1p : -Hexagon_si_di_Intrinsic<"HEXAGON_S2_ct1p">; -// -// BUILTIN_INFO(HEXAGON.S2_interleave,DI_ftype_DI,1) -// -def int_hexagon_S2_interleave : -Hexagon_di_di_Intrinsic<"HEXAGON_S2_interleave">; -// -// BUILTIN_INFO(HEXAGON.S2_deinterleave,DI_ftype_DI,1) -// -def int_hexagon_S2_deinterleave : -Hexagon_di_di_Intrinsic<"HEXAGON_S2_deinterleave">; +Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_brev">; -// -// BUILTIN_INFO(HEXAGON.dcfetch_A,v_ftype_DI*,1) -// -def int_hexagon_prefetch : -Hexagon_Intrinsic<"HEXAGON_prefetch", [], [llvm_ptr_ty], []>; -def int_hexagon_Y2_dccleana : -Hexagon_Intrinsic<"HEXAGON_Y2_dccleana", [], [llvm_ptr_ty], []>; -def int_hexagon_Y2_dccleaninva : -Hexagon_Intrinsic<"HEXAGON_Y2_dccleaninva", [], [llvm_ptr_ty], []>; -def int_hexagon_Y2_dcinva : -Hexagon_Intrinsic<"HEXAGON_Y2_dcinva", [], [llvm_ptr_ty], []>; -def int_hexagon_Y2_dczeroa : -Hexagon_Intrinsic<"HEXAGON_Y2_dczeroa", [], [llvm_ptr_ty], - [IntrWriteMem, IntrArgMemOnly, IntrHasSideEffects]>; -def int_hexagon_Y4_l2fetch : -Hexagon_Intrinsic<"HEXAGON_Y4_l2fetch", [], [llvm_ptr_ty, llvm_i32_ty], []>; -def int_hexagon_Y5_l2fetch : -Hexagon_Intrinsic<"HEXAGON_Y5_l2fetch", [], [llvm_ptr_ty, llvm_i64_ty], []>; +def int_hexagon_A2_vavgh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavgh">; -def llvm_ptr32_ty : LLVMPointerType; -def llvm_ptr64_ty : LLVMPointerType; +def int_hexagon_S2_clrbit_i : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_clrbit_i">; -// Mark locked loads as read/write to prevent any accidental reordering. -def int_hexagon_L2_loadw_locked : -Hexagon_Intrinsic<"HEXAGON_L2_loadw_locked", [llvm_i32_ty], [llvm_ptr32_ty], - [IntrArgMemOnly, NoCapture<0>]>; -def int_hexagon_L4_loadd_locked : -Hexagon_Intrinsic<"HEXAGON_L4_loadd_locked", [llvm_i64_ty], [llvm_ptr64_ty], - [IntrArgMemOnly, NoCapture<0>]>; +def int_hexagon_S2_asl_i_vh : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_vh">; -def int_hexagon_S2_storew_locked : -Hexagon_Intrinsic<"HEXAGON_S2_storew_locked", [llvm_i32_ty], - [llvm_ptr32_ty, llvm_i32_ty], [IntrArgMemOnly, NoCapture<0>]>; -def int_hexagon_S4_stored_locked : -Hexagon_Intrinsic<"HEXAGON_S4_stored_locked", [llvm_i32_ty], - [llvm_ptr64_ty, llvm_i64_ty], [IntrArgMemOnly, NoCapture<0>]>; +def int_hexagon_S2_lsr_i_r_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_or">; -// V60 +def int_hexagon_S2_lsl_r_r_nac : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsl_r_r_nac">; -class Hexagon_v2048v2048_Intrinsic_T - : Hexagon_Intrinsic; +def int_hexagon_M2_mmpyl_rs1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyl_rs1">; -// tag : V6_hi_W -// tag : V6_lo_W -class Hexagon_v512v1024_Intrinsic_T - : Hexagon_Intrinsic; +def int_hexagon_M2_mpyud_hl_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_hl_s1">; -// tag : V6_hi_W_128B -// tag : V6_lo_W_128B -class Hexagon_v1024v2048_Intrinsic_T - : Hexagon_Intrinsic; +def int_hexagon_M2_mmpyl_s0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyl_s0">; -class Hexagon_v1024v1024_Intrinsic_T - : Hexagon_Intrinsic; +def int_hexagon_M2_mmpyl_s1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyl_s1">; -// BUILTIN_INFO(HEXAGON.V6_hi_W,VI_ftype_VI,1) -// tag : V6_hi -def int_hexagon_V6_hi : -Hexagon_v512v1024_Intrinsic_T<"HEXAGON_V6_hi">; +def int_hexagon_M2_naccii : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_naccii">; -// BUILTIN_INFO(HEXAGON.V6_lo_W,VI_ftype_VI,1) -// tag : V6_lo -def int_hexagon_V6_lo : -Hexagon_v512v1024_Intrinsic_T<"HEXAGON_V6_lo">; +def int_hexagon_S2_vrndpackwhs : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vrndpackwhs">; -// BUILTIN_INFO(HEXAGON.V6_hi_W,VI_ftype_VI,1) -// tag : V6_hi_128B -def int_hexagon_V6_hi_128B : -Hexagon_v1024v2048_Intrinsic_T<"HEXAGON_V6_hi_128B">; +def int_hexagon_S2_vtrunewh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_vtrunewh">; -// BUILTIN_INFO(HEXAGON.V6_lo_W,VI_ftype_VI,1) -// tag : V6_lo_128B -def int_hexagon_V6_lo_128B : -Hexagon_v1024v2048_Intrinsic_T<"HEXAGON_V6_lo_128B">; +def int_hexagon_M2_dpmpyss_nac_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_dpmpyss_nac_s0">; -// BUILTIN_INFO(HEXAGON.V6_vassignp,VI_ftype_VI,1) -// tag : V6_vassignp -def int_hexagon_V6_vassignp : -Hexagon_v1024v1024_Intrinsic_T<"HEXAGON_V6_vassignp">; +def int_hexagon_M2_mpyd_ll_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_ll_s0">; -// BUILTIN_INFO(HEXAGON.V6_vassignp,VI_ftype_VI,1) -// tag : V6_vassignp_128B -def int_hexagon_V6_vassignp_128B : -Hexagon_v2048v2048_Intrinsic_T<"HEXAGON_V6_vassignp_128B">; +def int_hexagon_M2_mpyd_ll_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_ll_s1">; +def int_hexagon_M4_mac_up_s1_sat : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mac_up_s1_sat">; -// -// Hexagon_iii_Intrinsic -// tag : S6_rol_i_r -class Hexagon_iii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S4_vrcrotate_acc : +Hexagon_i64_i64i64i32i32_Intrinsic<"HEXAGON_S4_vrcrotate_acc">; -// -// Hexagon_LLiLLii_Intrinsic -// tag : S6_rol_i_p -class Hexagon_LLiLLii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_F2_conv_uw2df : +Hexagon_double_i32_Intrinsic<"HEXAGON_F2_conv_uw2df">; -// -// Hexagon_iiii_Intrinsic -// tag : S6_rol_i_r_acc -class Hexagon_iiii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_vaddubs : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddubs">; -// -// Hexagon_LLiLLiLLii_Intrinsic -// tag : S6_rol_i_p_acc -class Hexagon_LLiLLiLLii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_asr_r_r_acc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_r_r_acc">; -// -// Hexagon_v512v512v512i_Intrinsic -// tag : V6_valignb -class Hexagon_v512v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_orir : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_orir">; -// -// Hexagon_v1024v1024v1024i_Intrinsic -// tag : V6_valignb_128B -class Hexagon_v1024v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_andp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_andp">; -// -// Hexagon_v512v512i_Intrinsic -// tag : V6_vror -class Hexagon_v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_lfsp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_lfsp">; -// -// Hexagon_v1024v1024i_Intrinsic -// tag : V6_vror_128B -class Hexagon_v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_min : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_min">; -// -// Hexagon_v1024v512_Intrinsic -// tag : V6_vunpackub -class Hexagon_v1024v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpysmi : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpysmi">; -// -// Hexagon_v2048v1024_Intrinsic -// tag : V6_vunpackub_128B -class Hexagon_v2048v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_vcmpy_s0_sat_r : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vcmpy_s0_sat_r">; -// -// Hexagon_v1024v1024v512_Intrinsic -// tag : V6_vunpackob -class Hexagon_v1024v1024v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpyu_acc_ll_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_ll_s1">; -// -// Hexagon_v2048v2048v1024_Intrinsic -// tag : V6_vunpackob_128B -class Hexagon_v2048v2048v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpyu_acc_ll_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpyu_acc_ll_s0">; -// -// Hexagon_v512v512v512_Intrinsic -// tag : V6_vpackeb -class Hexagon_v512v512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_asr_r_svw_trun : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S2_asr_r_svw_trun">; -// -// Hexagon_v1024v1024v1024_Intrinsic -// tag : V6_vpackeb_128B -class Hexagon_v1024v1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mmpyh_s0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyh_s0">; -// -// Hexagon_v2048v2048i_Intrinsic -// tag : V6_vdmpybus_dv_128B -class Hexagon_v2048v2048i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mmpyh_s1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyh_s1">; -// -// Hexagon_v2048v2048v2048i_Intrinsic -// tag : V6_vdmpybus_dv_acc_128B -class Hexagon_v2048v2048v2048i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_F2_conv_sf2df : +Hexagon_double_float_Intrinsic<"HEXAGON_F2_conv_sf2df">; -// -// Hexagon_v512v512v512v512_Intrinsic -// tag : V6_vdmpyhvsat_acc -class Hexagon_v512v512v512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_vtrunohb : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vtrunohb">; -// -// Hexagon_v1024v1024v1024v1024_Intrinsic -// tag : V6_vdmpyhvsat_acc_128B -class Hexagon_v1024v1024v1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_F2_conv_sf2d_chop : +Hexagon_i64_float_Intrinsic<"HEXAGON_F2_conv_sf2d_chop">; -// -// Hexagon_v512v1024i_Intrinsic -// tag : V6_vdmpyhisat -class Hexagon_v512v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpyd_lh_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_lh_s0">; -// -// Hexagon_v1024v2048i_Intrinsic -// tag : V6_vdmpyhisat_128B -class Hexagon_v1024v2048i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_F2_conv_df2w : +Hexagon_i32_double_Intrinsic<"HEXAGON_F2_conv_df2w">; -// -// Hexagon_v512v512v1024i_Intrinsic -// tag : V6_vdmpyhisat_acc -class Hexagon_v512v512v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S5_asrhub_sat : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S5_asrhub_sat">; -// -// Hexagon_v1024v1024v2048i_Intrinsic -// tag : V6_vdmpyhisat_acc_128B -class Hexagon_v1024v1024v2048i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_asl_i_r_xacc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_xacc">; -// -// Hexagon_v1024v1024ii_Intrinsic -// tag : V6_vrmpyubi -class Hexagon_v1024v1024ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_F2_conv_df2d : +Hexagon_i64_double_Intrinsic<"HEXAGON_F2_conv_df2d">; -// -// Hexagon_v2048v2048ii_Intrinsic -// tag : V6_vrmpyubi_128B -class Hexagon_v2048v2048ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mmaculs_s1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmaculs_s1">; -// -// Hexagon_v1024v1024v1024ii_Intrinsic -// tag : V6_vrmpyubi_acc -class Hexagon_v1024v1024v1024ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mmaculs_s0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmaculs_s0">; -// -// Hexagon_v2048v2048v2048ii_Intrinsic -// tag : V6_vrmpyubi_acc_128B -class Hexagon_v2048v2048v2048ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_svadduhs : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svadduhs">; -// -// Hexagon_v2048v2048v2048_Intrinsic -// tag : V6_vaddb_dv_128B -class Hexagon_v2048v2048v2048_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_F2_conv_sf2w_chop : +Hexagon_i32_float_Intrinsic<"HEXAGON_F2_conv_sf2w_chop">; -// -// Hexagon_v1024v512v512_Intrinsic -// tag : V6_vaddubh -class Hexagon_v1024v512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_svsathub : +Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_svsathub">; -// -// Hexagon_v2048v1024v1024_Intrinsic -// tag : V6_vaddubh_128B -class Hexagon_v2048v1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpyd_rnd_hl_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_hl_s1">; -// -// Hexagon_v512_Intrinsic -// tag : V6_vd0 -class Hexagon_v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpyd_rnd_hl_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_hl_s0">; -// -// Hexagon_v1024_Intrinsic -// tag : V6_vd0_128B -class Hexagon_v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_setbit_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_setbit_r">; -// -// Hexagon_v512v64iv512v512_Intrinsic -// tag : V6_vaddbq -class Hexagon_v512v64iv512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_vavghr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavghr">; -// -// Hexagon_v1024v128iv1024v1024_Intrinsic -// tag : V6_vaddbq_128B -class Hexagon_v1024v128iv1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_F2_sffma_sc : +Hexagon_float_floatfloatfloati32_Intrinsic<"HEXAGON_F2_sffma_sc">; -// -// Hexagon_v512v512_Intrinsic -// tag : V6_vabsh -class Hexagon_v512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_F2_dfclass : +Hexagon_i32_doublei32_Intrinsic<"HEXAGON_F2_dfclass">; -// -// Hexagon_v1024v1024_Intrinsic -// tag : V6_vabsh_128B -class Hexagon_v1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_F2_conv_df2ud : +Hexagon_i64_double_Intrinsic<"HEXAGON_F2_conv_df2ud">; -// -// Hexagon_v1024v1024v512v512_Intrinsic -// tag : V6_vmpybv_acc -class Hexagon_v1024v1024v512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_F2_conv_df2uw : +Hexagon_i32_double_Intrinsic<"HEXAGON_F2_conv_df2uw">; -// -// Hexagon_v2048v2048v1024v1024_Intrinsic -// tag : V6_vmpybv_acc_128B -class Hexagon_v2048v2048v1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_cmpyrs_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_cmpyrs_s0">; -// -// Hexagon_v1024v512i_Intrinsic -// tag : V6_vmpyub -class Hexagon_v1024v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_cmpyrs_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_cmpyrs_s1">; -// -// Hexagon_v2048v1024i_Intrinsic -// tag : V6_vmpyub_128B -class Hexagon_v2048v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_C4_cmpltei : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmpltei">; -// -// Hexagon_v1024v1024v512i_Intrinsic -// tag : V6_vmpyub_acc -class Hexagon_v1024v1024v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_C4_cmplteu : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmplteu">; -// -// Hexagon_v2048v2048v1024i_Intrinsic -// tag : V6_vmpyub_acc_128B -class Hexagon_v2048v2048v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_vsubb_map : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vsubb_map">; -// -// Hexagon_v512v64ii_Intrinsic -// tag : V6_vandqrt -class Hexagon_v512v64ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_subh_l16_ll : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_l16_ll">; -// -// Hexagon_v1024v128ii_Intrinsic -// tag : V6_vandqrt_128B -class Hexagon_v1024v128ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_asr_i_r_rnd : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_rnd">; -// -// Hexagon_v512v512v64ii_Intrinsic -// tag : V6_vandqrt_acc -class Hexagon_v512v512v64ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_vrmpy_s0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vrmpy_s0">; -// -// Hexagon_v1024v1024v128ii_Intrinsic -// tag : V6_vandqrt_acc_128B -class Hexagon_v1024v1024v128ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpyd_rnd_hh_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_hh_s1">; -// -// Hexagon_v64iv512i_Intrinsic -// tag : V6_vandvrt -class Hexagon_v64iv512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpyd_rnd_hh_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyd_rnd_hh_s0">; -// -// Hexagon_v128iv1024i_Intrinsic -// tag : V6_vandvrt_128B -class Hexagon_v128iv1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_minup : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_minup">; -// -// Hexagon_v64iv64iv512i_Intrinsic -// tag : V6_vandvrt_acc -class Hexagon_v64iv64iv512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_valignrb : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_valignrb">; -// -// Hexagon_v128iv128iv1024i_Intrinsic -// tag : V6_vandvrt_acc_128B -class Hexagon_v128iv128iv1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_asr_r_p_acc : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_r_p_acc">; -// -// Hexagon_v64iv512v512_Intrinsic -// tag : V6_vgtw -class Hexagon_v64iv512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mmpyl_rs0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_mmpyl_rs0">; -// -// Hexagon_v128iv1024v1024_Intrinsic -// tag : V6_vgtw_128B -class Hexagon_v128iv1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_vrcmaci_s0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vrcmaci_s0">; -// -// Hexagon_v64iv64iv512v512_Intrinsic -// tag : V6_vgtw_and -class Hexagon_v64iv64iv512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_vaddub : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddub">; -// -// Hexagon_v128iv128iv1024v1024_Intrinsic -// tag : V6_vgtw_and_128B -class Hexagon_v128iv128iv1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_combine_lh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_combine_lh">; -// -// Hexagon_v64iv64iv64i_Intrinsic -// tag : V6_pred_or -class Hexagon_v64iv64iv64i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M5_vdmacbsu : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M5_vdmacbsu">; -// -// Hexagon_v128iv128iv128i_Intrinsic -// tag : V6_pred_or_128B -class Hexagon_v128iv128iv128i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_combine_ll : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_combine_ll">; -// -// Hexagon_v64iv64i_Intrinsic -// tag : V6_pred_not -class Hexagon_v64iv64i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpyud_hl_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_hl_s0">; -// -// Hexagon_v128iv128i_Intrinsic -// tag : V6_pred_not_128B -class Hexagon_v128iv128i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_vrcmpyi_s0c : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vrcmpyi_s0c">; -// -// Hexagon_v64ii_Intrinsic -// tag : V6_pred_scalar2 -class Hexagon_v64ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_asr_i_p_rnd : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_rnd">; -// -// Hexagon_v128ii_Intrinsic -// tag : V6_pred_scalar2_128B -class Hexagon_v128ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_addpsat : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_addpsat">; -// -// Hexagon_v1024v64iv512v512_Intrinsic -// tag : V6_vswap -class Hexagon_v1024v64iv512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_svaddhs : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svaddhs">; -// -// Hexagon_v2048v128iv1024v1024_Intrinsic -// tag : V6_vswap_128B -class Hexagon_v2048v128iv1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S4_ori_lsr_ri : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_ori_lsr_ri">; -// -// Hexagon_v1024v512v512i_Intrinsic -// tag : V6_vshuffvdd -class Hexagon_v1024v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpy_sat_rnd_ll_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_ll_s1">; -// -// Hexagon_v2048v1024v1024i_Intrinsic -// tag : V6_vshuffvdd_128B -class Hexagon_v2048v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpy_sat_rnd_ll_s0 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_sat_rnd_ll_s0">; +def int_hexagon_A2_vminw : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminw">; -// -// Hexagon_iv512i_Intrinsic -// tag : V6_extractw -class Hexagon_iv512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_vminh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminh">; -// -// Hexagon_iv1024i_Intrinsic -// tag : V6_extractw_128B -class Hexagon_iv1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_vrcmpyr_s0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vrcmpyr_s0">; -// -// Hexagon_v512i_Intrinsic -// tag : V6_lvsplatw -class Hexagon_v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_A2_vminb : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminb">; -// -// Hexagon_v1024i_Intrinsic -// tag : V6_lvsplatw_128B -class Hexagon_v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_vcmac_s0_sat_i : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vcmac_s0_sat_i">; -// -// Hexagon_v512v512v512v512i_Intrinsic -// tag : V6_vlutvvb_oracc -class Hexagon_v512v512v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpyud_lh_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_lh_s0">; -// -// Hexagon_v1024v1024v1024v1024i_Intrinsic -// tag : V6_vlutvvb_oracc_128B -class Hexagon_v1024v1024v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpyud_lh_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_lh_s1">; -// -// Hexagon_v1024v1024v512v512i_Intrinsic -// tag : V6_vlutvwh_oracc -class Hexagon_v1024v1024v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_asl_r_r_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_r_r_or">; -// -// Hexagon_v2048v2048v1024v1024i_Intrinsic -// tag : V6_vlutvwh_oracc_128B -class Hexagon_v2048v2048v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S4_lsli : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_lsli">; -// -// Hexagon_vv64ivmemv512_Intrinsic -// tag: V6_vS32b_qpred_ai -class Hexagon_vv64ivmemv512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_S2_lsl_r_vw : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsl_r_vw">; -// -// Hexagon_vv128ivmemv1024_Intrinsic -// tag: V6_vS32b_qpred_ai_128B -class Hexagon_vv128ivmemv1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_M2_mpy_hh_s1 : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpy_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_r,SI_ftype_SISI,2) -// tag : S6_rol_i_r -def int_hexagon_S6_rol_i_r : -Hexagon_iii_Intrinsic<"HEXAGON_S6_rol_i_r">; +def int_hexagon_M4_vrmpyeh_s0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M4_vrmpyeh_s0">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_p,DI_ftype_DISI,2) -// tag : S6_rol_i_p -def int_hexagon_S6_rol_i_p : -Hexagon_LLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p">; +def int_hexagon_M4_vrmpyeh_s1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M4_vrmpyeh_s1">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_r_acc,SI_ftype_SISISI,3) -// tag : S6_rol_i_r_acc -def int_hexagon_S6_rol_i_r_acc : -Hexagon_iiii_Intrinsic<"HEXAGON_S6_rol_i_r_acc">; +def int_hexagon_M2_mpy_nac_lh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_p_acc,DI_ftype_DIDISI,3) -// tag : S6_rol_i_p_acc -def int_hexagon_S6_rol_i_p_acc : -Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p_acc">; +def int_hexagon_M2_mpy_nac_lh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_r_nac,SI_ftype_SISISI,3) -// tag : S6_rol_i_r_nac -def int_hexagon_S6_rol_i_r_nac : -Hexagon_iiii_Intrinsic<"HEXAGON_S6_rol_i_r_nac">; +def int_hexagon_M2_vraddh : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_M2_vraddh">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_p_nac,DI_ftype_DIDISI,3) -// tag : S6_rol_i_p_nac -def int_hexagon_S6_rol_i_p_nac : -Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p_nac">; +def int_hexagon_C2_tfrrp : +Hexagon_i32_i32_Intrinsic<"HEXAGON_C2_tfrrp">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_r_xacc,SI_ftype_SISISI,3) -// tag : S6_rol_i_r_xacc -def int_hexagon_S6_rol_i_r_xacc : -Hexagon_iiii_Intrinsic<"HEXAGON_S6_rol_i_r_xacc">; +def int_hexagon_M2_mpy_acc_sat_ll_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_p_xacc,DI_ftype_DIDISI,3) -// tag : S6_rol_i_p_xacc -def int_hexagon_S6_rol_i_p_xacc : -Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p_xacc">; +def int_hexagon_M2_mpy_acc_sat_ll_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_r_and,SI_ftype_SISISI,3) -// tag : S6_rol_i_r_and -def int_hexagon_S6_rol_i_r_and : -Hexagon_iiii_Intrinsic<"HEXAGON_S6_rol_i_r_and">; +def int_hexagon_S2_vtrunowh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_vtrunowh">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_r_or,SI_ftype_SISISI,3) -// tag : S6_rol_i_r_or -def int_hexagon_S6_rol_i_r_or : -Hexagon_iiii_Intrinsic<"HEXAGON_S6_rol_i_r_or">; +def int_hexagon_A2_abs : +Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_abs">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_p_and,DI_ftype_DIDISI,3) -// tag : S6_rol_i_p_and -def int_hexagon_S6_rol_i_p_and : -Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p_and">; +def int_hexagon_A4_cmpbeq : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbeq">; -// -// BUILTIN_INFO(HEXAGON.S6_rol_i_p_or,DI_ftype_DIDISI,3) -// tag : S6_rol_i_p_or -def int_hexagon_S6_rol_i_p_or : -Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S6_rol_i_p_or">; +def int_hexagon_A2_negp : +Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_negp">; -// -// BUILTIN_INFO(HEXAGON.S2_cabacencbin,DI_ftype_DIDIQI,3) -// tag : S2_cabacencbin -def int_hexagon_S2_cabacencbin : -Hexagon_LLiLLiLLii_Intrinsic<"HEXAGON_S2_cabacencbin">; +def int_hexagon_S2_asl_i_r_sat : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_sat">; -// -// BUILTIN_INFO(HEXAGON.V6_valignb,VI_ftype_VIVISI,3) -// tag : V6_valignb -def int_hexagon_V6_valignb : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_valignb">; +def int_hexagon_A2_addh_l16_sat_hl : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_l16_sat_hl">; -// -// BUILTIN_INFO(HEXAGON.V6_valignb_128B,VI_ftype_VIVISI,3) -// tag : V6_valignb_128B -def int_hexagon_V6_valignb_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_valignb_128B">; +def int_hexagon_S2_vsatwuh : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vsatwuh">; -// -// BUILTIN_INFO(HEXAGON.V6_vlalignb,VI_ftype_VIVISI,3) -// tag : V6_vlalignb -def int_hexagon_V6_vlalignb : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vlalignb">; +def int_hexagon_F2_dfcmpgt : +Hexagon_i32_doubledouble_Intrinsic<"HEXAGON_F2_dfcmpgt">; -// -// BUILTIN_INFO(HEXAGON.V6_vlalignb_128B,VI_ftype_VIVISI,3) -// tag : V6_vlalignb_128B -def int_hexagon_V6_vlalignb_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlalignb_128B">; +def int_hexagon_S2_svsathb : +Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_svsathb">; -// -// BUILTIN_INFO(HEXAGON.V6_valignbi,VI_ftype_VIVISI,3) -// tag : V6_valignbi -def int_hexagon_V6_valignbi : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_valignbi">; +def int_hexagon_C2_cmpgtup : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_C2_cmpgtup">; -// -// BUILTIN_INFO(HEXAGON.V6_valignbi_128B,VI_ftype_VIVISI,3) -// tag : V6_valignbi_128B -def int_hexagon_V6_valignbi_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_valignbi_128B">; +def int_hexagon_A4_cround_ri : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cround_ri">; -// -// BUILTIN_INFO(HEXAGON.V6_vlalignbi,VI_ftype_VIVISI,3) -// tag : V6_vlalignbi -def int_hexagon_V6_vlalignbi : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vlalignbi">; +def int_hexagon_S4_clbpaddi : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S4_clbpaddi">; -// -// BUILTIN_INFO(HEXAGON.V6_vlalignbi_128B,VI_ftype_VIVISI,3) -// tag : V6_vlalignbi_128B -def int_hexagon_V6_vlalignbi_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlalignbi_128B">; +def int_hexagon_A4_cround_rr : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cround_rr">; -// -// BUILTIN_INFO(HEXAGON.V6_vror,VI_ftype_VISI,2) -// tag : V6_vror -def int_hexagon_V6_vror : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vror">; +def int_hexagon_C2_mux : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_mux">; -// -// BUILTIN_INFO(HEXAGON.V6_vror_128B,VI_ftype_VISI,2) -// tag : V6_vror_128B -def int_hexagon_V6_vror_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vror_128B">; +def int_hexagon_M2_dpmpyuu_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_dpmpyuu_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackub,VD_ftype_VI,1) -// tag : V6_vunpackub -def int_hexagon_V6_vunpackub : -Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vunpackub">; +def int_hexagon_S2_shuffeb : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_shuffeb">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackub_128B,VD_ftype_VI,1) -// tag : V6_vunpackub_128B -def int_hexagon_V6_vunpackub_128B : -Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vunpackub_128B">; +def int_hexagon_A2_vminuw : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminuw">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackb,VD_ftype_VI,1) -// tag : V6_vunpackb -def int_hexagon_V6_vunpackb : -Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vunpackb">; +def int_hexagon_A2_vaddhs : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vaddhs">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackb_128B,VD_ftype_VI,1) -// tag : V6_vunpackb_128B -def int_hexagon_V6_vunpackb_128B : -Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vunpackb_128B">; +def int_hexagon_S2_insert_rp : +Hexagon_i32_i32i32i64_Intrinsic<"HEXAGON_S2_insert_rp">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackuh,VD_ftype_VI,1) -// tag : V6_vunpackuh -def int_hexagon_V6_vunpackuh : -Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vunpackuh">; +def int_hexagon_A2_vminuh : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminuh">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackuh_128B,VD_ftype_VI,1) -// tag : V6_vunpackuh_128B -def int_hexagon_V6_vunpackuh_128B : -Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vunpackuh_128B">; +def int_hexagon_A2_vminub : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vminub">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackh,VD_ftype_VI,1) -// tag : V6_vunpackh -def int_hexagon_V6_vunpackh : -Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vunpackh">; +def int_hexagon_S2_extractu : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_extractu">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackh_128B,VD_ftype_VI,1) -// tag : V6_vunpackh_128B -def int_hexagon_V6_vunpackh_128B : -Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vunpackh_128B">; +def int_hexagon_A2_svsubh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_svsubh">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackob,VD_ftype_VDVI,2) -// tag : V6_vunpackob -def int_hexagon_V6_vunpackob : -Hexagon_v1024v1024v512_Intrinsic<"HEXAGON_V6_vunpackob">; +def int_hexagon_S4_clbaddi : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_clbaddi">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackob_128B,VD_ftype_VDVI,2) -// tag : V6_vunpackob_128B -def int_hexagon_V6_vunpackob_128B : -Hexagon_v2048v2048v1024_Intrinsic<"HEXAGON_V6_vunpackob_128B">; +def int_hexagon_F2_sffms : +Hexagon_float_floatfloatfloat_Intrinsic<"HEXAGON_F2_sffms">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackoh,VD_ftype_VDVI,2) -// tag : V6_vunpackoh -def int_hexagon_V6_vunpackoh : -Hexagon_v1024v1024v512_Intrinsic<"HEXAGON_V6_vunpackoh">; +def int_hexagon_S2_vsxtbh : +Hexagon_i64_i32_Intrinsic<"HEXAGON_S2_vsxtbh">; -// -// BUILTIN_INFO(HEXAGON.V6_vunpackoh_128B,VD_ftype_VDVI,2) -// tag : V6_vunpackoh_128B -def int_hexagon_V6_vunpackoh_128B : -Hexagon_v2048v2048v1024_Intrinsic<"HEXAGON_V6_vunpackoh_128B">; +def int_hexagon_M2_mpyud_nac_ll_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_ll_s1">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackeb,VI_ftype_VIVI,2) -// tag : V6_vpackeb -def int_hexagon_V6_vpackeb : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackeb">; +def int_hexagon_M2_mpyud_nac_ll_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_ll_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackeb_128B,VI_ftype_VIVI,2) -// tag : V6_vpackeb_128B -def int_hexagon_V6_vpackeb_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackeb_128B">; +def int_hexagon_A2_subp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_subp">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackeh,VI_ftype_VIVI,2) -// tag : V6_vpackeh -def int_hexagon_V6_vpackeh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackeh">; +def int_hexagon_M2_vmpy2es_s1 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vmpy2es_s1">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackeh_128B,VI_ftype_VIVI,2) -// tag : V6_vpackeh_128B -def int_hexagon_V6_vpackeh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackeh_128B">; +def int_hexagon_M2_vmpy2es_s0 : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M2_vmpy2es_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackob,VI_ftype_VIVI,2) -// tag : V6_vpackob -def int_hexagon_V6_vpackob : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackob">; +def int_hexagon_S4_parity : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_parity">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackob_128B,VI_ftype_VIVI,2) -// tag : V6_vpackob_128B -def int_hexagon_V6_vpackob_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackob_128B">; +def int_hexagon_M2_mpy_acc_hh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackoh,VI_ftype_VIVI,2) -// tag : V6_vpackoh -def int_hexagon_V6_vpackoh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackoh">; +def int_hexagon_M2_mpy_acc_hh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackoh_128B,VI_ftype_VIVI,2) -// tag : V6_vpackoh_128B -def int_hexagon_V6_vpackoh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackoh_128B">; +def int_hexagon_S4_addi_asl_ri : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addi_asl_ri">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackhub_sat,VI_ftype_VIVI,2) -// tag : V6_vpackhub_sat -def int_hexagon_V6_vpackhub_sat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackhub_sat">; +def int_hexagon_M2_mpyd_nac_hh_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackhub_sat_128B,VI_ftype_VIVI,2) -// tag : V6_vpackhub_sat_128B -def int_hexagon_V6_vpackhub_sat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackhub_sat_128B">; +def int_hexagon_M2_mpyd_nac_hh_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyd_nac_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackhb_sat,VI_ftype_VIVI,2) -// tag : V6_vpackhb_sat -def int_hexagon_V6_vpackhb_sat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackhb_sat">; +def int_hexagon_S2_asr_i_r_nac : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_nac">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackhb_sat_128B,VI_ftype_VIVI,2) -// tag : V6_vpackhb_sat_128B -def int_hexagon_V6_vpackhb_sat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackhb_sat_128B">; +def int_hexagon_A4_cmpheqi : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpheqi">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackwuh_sat,VI_ftype_VIVI,2) -// tag : V6_vpackwuh_sat -def int_hexagon_V6_vpackwuh_sat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackwuh_sat">; +def int_hexagon_S2_lsr_r_p_xor : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_r_p_xor">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackwuh_sat_128B,VI_ftype_VIVI,2) -// tag : V6_vpackwuh_sat_128B -def int_hexagon_V6_vpackwuh_sat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackwuh_sat_128B">; +def int_hexagon_M2_mpy_acc_hl_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackwh_sat,VI_ftype_VIVI,2) -// tag : V6_vpackwh_sat -def int_hexagon_V6_vpackwh_sat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vpackwh_sat">; +def int_hexagon_M2_mpy_acc_hl_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vpackwh_sat_128B,VI_ftype_VIVI,2) -// tag : V6_vpackwh_sat_128B -def int_hexagon_V6_vpackwh_sat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vpackwh_sat_128B">; +def int_hexagon_F2_conv_sf2ud_chop : +Hexagon_i64_float_Intrinsic<"HEXAGON_F2_conv_sf2ud_chop">; -// -// BUILTIN_INFO(HEXAGON.V6_vzb,VD_ftype_VI,1) -// tag : V6_vzb -def int_hexagon_V6_vzb : -Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vzb">; +def int_hexagon_C2_cmpgeui : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgeui">; -// -// BUILTIN_INFO(HEXAGON.V6_vzb_128B,VD_ftype_VI,1) -// tag : V6_vzb_128B -def int_hexagon_V6_vzb_128B : -Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vzb_128B">; +def int_hexagon_M2_mpy_acc_sat_hh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vsb,VD_ftype_VI,1) -// tag : V6_vsb -def int_hexagon_V6_vsb : -Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vsb">; +def int_hexagon_M2_mpy_acc_sat_hh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_acc_sat_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.V6_vsb_128B,VD_ftype_VI,1) -// tag : V6_vsb_128B -def int_hexagon_V6_vsb_128B : -Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vsb_128B">; +def int_hexagon_S2_asl_r_p_and : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_r_p_and">; -// -// BUILTIN_INFO(HEXAGON.V6_vzh,VD_ftype_VI,1) -// tag : V6_vzh -def int_hexagon_V6_vzh : -Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vzh">; +def int_hexagon_A2_addh_h16_sat_lh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_sat_lh">; -// -// BUILTIN_INFO(HEXAGON.V6_vzh_128B,VD_ftype_VI,1) -// tag : V6_vzh_128B -def int_hexagon_V6_vzh_128B : -Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vzh_128B">; +def int_hexagon_A2_addh_h16_sat_ll : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_h16_sat_ll">; -// -// BUILTIN_INFO(HEXAGON.V6_vsh,VD_ftype_VI,1) -// tag : V6_vsh -def int_hexagon_V6_vsh : -Hexagon_v1024v512_Intrinsic<"HEXAGON_V6_vsh">; +def int_hexagon_M4_nac_up_s1_sat : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_nac_up_s1_sat">; -// -// BUILTIN_INFO(HEXAGON.V6_vsh_128B,VD_ftype_VI,1) -// tag : V6_vsh_128B -def int_hexagon_V6_vsh_128B : -Hexagon_v2048v1024_Intrinsic<"HEXAGON_V6_vsh_128B">; +def int_hexagon_M2_mpyud_nac_lh_s1 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_lh_s1">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpybus,VI_ftype_VISI,2) -// tag : V6_vdmpybus -def int_hexagon_V6_vdmpybus : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vdmpybus">; +def int_hexagon_M2_mpyud_nac_lh_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_mpyud_nac_lh_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpybus_128B,VI_ftype_VISI,2) -// tag : V6_vdmpybus_128B -def int_hexagon_V6_vdmpybus_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpybus_128B">; +def int_hexagon_A4_round_ri_sat : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_ri_sat">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpybus_acc,VI_ftype_VIVISI,3) -// tag : V6_vdmpybus_acc -def int_hexagon_V6_vdmpybus_acc : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vdmpybus_acc">; +def int_hexagon_M2_mpy_nac_hl_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_hl_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpybus_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vdmpybus_acc_128B -def int_hexagon_V6_vdmpybus_acc_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpybus_acc_128B">; +def int_hexagon_M2_mpy_nac_hl_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_hl_s1">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpybus_dv,VD_ftype_VDSI,2) -// tag : V6_vdmpybus_dv -def int_hexagon_V6_vdmpybus_dv : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpybus_dv">; +def int_hexagon_A2_vavghcr : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vavghcr">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpybus_dv_128B,VD_ftype_VDSI,2) -// tag : V6_vdmpybus_dv_128B -def int_hexagon_V6_vdmpybus_dv_128B : -Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vdmpybus_dv_128B">; +def int_hexagon_M2_mmacls_rs0 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacls_rs0">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpybus_dv_acc,VD_ftype_VDVDSI,3) -// tag : V6_vdmpybus_dv_acc -def int_hexagon_V6_vdmpybus_dv_acc : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpybus_dv_acc">; +def int_hexagon_M2_mmacls_rs1 : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_mmacls_rs1">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpybus_dv_acc_128B,VD_ftype_VDVDSI,3) -// tag : V6_vdmpybus_dv_acc_128B -def int_hexagon_V6_vdmpybus_dv_acc_128B : -Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vdmpybus_dv_acc_128B">; +def int_hexagon_M2_cmaci_s0 : +Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_M2_cmaci_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhb,VI_ftype_VISI,2) -// tag : V6_vdmpyhb -def int_hexagon_V6_vdmpyhb : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhb">; +def int_hexagon_S2_setbit_i : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_setbit_i">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_128B,VI_ftype_VISI,2) -// tag : V6_vdmpyhb_128B -def int_hexagon_V6_vdmpyhb_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhb_128B">; +def int_hexagon_S2_asl_i_p_or : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_acc,VI_ftype_VIVISI,3) -// tag : V6_vdmpyhb_acc -def int_hexagon_V6_vdmpyhb_acc : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhb_acc">; +def int_hexagon_A4_andn : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_andn">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vdmpyhb_acc_128B -def int_hexagon_V6_vdmpyhb_acc_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhb_acc_128B">; +def int_hexagon_M5_vrmpybsu : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M5_vrmpybsu">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_dv,VD_ftype_VDSI,2) -// tag : V6_vdmpyhb_dv -def int_hexagon_V6_vdmpyhb_dv : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhb_dv">; +def int_hexagon_S2_vrndpackwh : +Hexagon_i32_i64_Intrinsic<"HEXAGON_S2_vrndpackwh">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_dv_128B,VD_ftype_VDSI,2) -// tag : V6_vdmpyhb_dv_128B -def int_hexagon_V6_vdmpyhb_dv_128B : -Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_128B">; +def int_hexagon_M2_vcmac_s0_sat_r : +Hexagon_i64_i64i64i64_Intrinsic<"HEXAGON_M2_vcmac_s0_sat_r">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_dv_acc,VD_ftype_VDVDSI,3) -// tag : V6_vdmpyhb_dv_acc -def int_hexagon_V6_vdmpyhb_dv_acc : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_acc">; +def int_hexagon_A2_vmaxuw : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vmaxuw">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhb_dv_acc_128B,VD_ftype_VDVDSI,3) -// tag : V6_vdmpyhb_dv_acc_128B -def int_hexagon_V6_vdmpyhb_dv_acc_128B : -Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_acc_128B">; +def int_hexagon_C2_bitsclr : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_bitsclr">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhvsat,VI_ftype_VIVI,2) -// tag : V6_vdmpyhvsat -def int_hexagon_V6_vdmpyhvsat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vdmpyhvsat">; +def int_hexagon_M2_xor_xacc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_xor_xacc">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhvsat_128B,VI_ftype_VIVI,2) -// tag : V6_vdmpyhvsat_128B -def int_hexagon_V6_vdmpyhvsat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vdmpyhvsat_128B">; +def int_hexagon_A4_vcmpbgtui : +Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbgtui">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhvsat_acc,VI_ftype_VIVIVI,3) -// tag : V6_vdmpyhvsat_acc -def int_hexagon_V6_vdmpyhvsat_acc : -Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vdmpyhvsat_acc">; +def int_hexagon_A4_ornp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A4_ornp">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhvsat_acc_128B,VI_ftype_VIVIVI,3) -// tag : V6_vdmpyhvsat_acc_128B -def int_hexagon_V6_vdmpyhvsat_acc_128B : -Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vdmpyhvsat_acc_128B">; +def int_hexagon_A2_tfrpi : +Hexagon_i64_i32_Intrinsic<"HEXAGON_A2_tfrpi">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsat,VI_ftype_VISI,2) -// tag : V6_vdmpyhsat -def int_hexagon_V6_vdmpyhsat : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhsat">; +def int_hexagon_C4_and_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C4_and_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsat_128B,VI_ftype_VISI,2) -// tag : V6_vdmpyhsat_128B -def int_hexagon_V6_vdmpyhsat_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsat_128B">; +def int_hexagon_M2_mpy_nac_sat_hh_s1 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hh_s1">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsat_acc,VI_ftype_VIVISI,3) -// tag : V6_vdmpyhsat_acc -def int_hexagon_V6_vdmpyhsat_acc : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhsat_acc">; +def int_hexagon_M2_mpy_nac_sat_hh_s0 : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mpy_nac_sat_hh_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsat_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vdmpyhsat_acc_128B -def int_hexagon_V6_vdmpyhsat_acc_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsat_acc_128B">; +def int_hexagon_A2_subh_h16_sat_ll : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_sat_ll">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhisat,VI_ftype_VDSI,2) -// tag : V6_vdmpyhisat -def int_hexagon_V6_vdmpyhisat : -Hexagon_v512v1024i_Intrinsic<"HEXAGON_V6_vdmpyhisat">; +def int_hexagon_A2_subh_h16_sat_lh : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subh_h16_sat_lh">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhisat_128B,VI_ftype_VDSI,2) -// tag : V6_vdmpyhisat_128B -def int_hexagon_V6_vdmpyhisat_128B : -Hexagon_v1024v2048i_Intrinsic<"HEXAGON_V6_vdmpyhisat_128B">; +def int_hexagon_M2_vmpy2su_s1 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_vmpy2su_s1">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhisat_acc,VI_ftype_VIVDSI,3) -// tag : V6_vdmpyhisat_acc -def int_hexagon_V6_vdmpyhisat_acc : -Hexagon_v512v512v1024i_Intrinsic<"HEXAGON_V6_vdmpyhisat_acc">; +def int_hexagon_M2_vmpy2su_s0 : +Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_vmpy2su_s0">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhisat_acc_128B,VI_ftype_VIVDSI,3) -// tag : V6_vdmpyhisat_acc_128B -def int_hexagon_V6_vdmpyhisat_acc_128B : -Hexagon_v1024v1024v2048i_Intrinsic<"HEXAGON_V6_vdmpyhisat_acc_128B">; +def int_hexagon_S2_asr_i_p_acc : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsusat,VI_ftype_VISI,2) -// tag : V6_vdmpyhsusat -def int_hexagon_V6_vdmpyhsusat : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhsusat">; +def int_hexagon_C4_nbitsclri : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_nbitsclri">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsusat_128B,VI_ftype_VISI,2) -// tag : V6_vdmpyhsusat_128B -def int_hexagon_V6_vdmpyhsusat_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsusat_128B">; +def int_hexagon_S2_lsr_i_vh : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_vh">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsusat_acc,VI_ftype_VIVISI,3) -// tag : V6_vdmpyhsusat_acc -def int_hexagon_V6_vdmpyhsusat_acc : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vdmpyhsusat_acc">; +def int_hexagon_S2_lsr_i_p_xacc : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_xacc">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsusat_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vdmpyhsusat_acc_128B -def int_hexagon_V6_vdmpyhsusat_acc_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsusat_acc_128B">; +// V55 Scalar Instructions. -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsuisat,VI_ftype_VDSI,2) -// tag : V6_vdmpyhsuisat -def int_hexagon_V6_vdmpyhsuisat : -Hexagon_v512v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsuisat">; +def int_hexagon_A5_ACS : +Hexagon_i64i32_i64i64i64_Intrinsic<"HEXAGON_A5_ACS">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsuisat_128B,VI_ftype_VDSI,2) -// tag : V6_vdmpyhsuisat_128B -def int_hexagon_V6_vdmpyhsuisat_128B : -Hexagon_v1024v2048i_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_128B">; +// V60 Scalar Instructions. -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsuisat_acc,VI_ftype_VIVDSI,3) -// tag : V6_vdmpyhsuisat_acc -def int_hexagon_V6_vdmpyhsuisat_acc : -Hexagon_v512v512v1024i_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_acc">; +def int_hexagon_S6_rol_i_p_and : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_and">; -// -// BUILTIN_INFO(HEXAGON.V6_vdmpyhsuisat_acc_128B,VI_ftype_VIVDSI,3) -// tag : V6_vdmpyhsuisat_acc_128B -def int_hexagon_V6_vdmpyhsuisat_acc_128B : -Hexagon_v1024v1024v2048i_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_acc_128B">; +def int_hexagon_S6_rol_i_r_xacc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_xacc">; -// -// BUILTIN_INFO(HEXAGON.V6_vtmpyb,VD_ftype_VDSI,2) -// tag : V6_vtmpyb -def int_hexagon_V6_vtmpyb : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpyb">; +def int_hexagon_S6_rol_i_r_and : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_and">; -// -// BUILTIN_INFO(HEXAGON.V6_vtmpyb_128B,VD_ftype_VDSI,2) -// tag : V6_vtmpyb_128B -def int_hexagon_V6_vtmpyb_128B : -Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpyb_128B">; +def int_hexagon_S6_rol_i_r_acc : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vtmpyb_acc,VD_ftype_VDVDSI,3) -// tag : V6_vtmpyb_acc -def int_hexagon_V6_vtmpyb_acc : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpyb_acc">; +def int_hexagon_S6_rol_i_p_xacc : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_xacc">; -// -// BUILTIN_INFO(HEXAGON.V6_vtmpyb_acc_128B,VD_ftype_VDVDSI,3) -// tag : V6_vtmpyb_acc_128B -def int_hexagon_V6_vtmpyb_acc_128B : -Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpyb_acc_128B">; +def int_hexagon_S6_rol_i_p : +Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S6_rol_i_p">; -// -// BUILTIN_INFO(HEXAGON.V6_vtmpybus,VD_ftype_VDSI,2) -// tag : V6_vtmpybus -def int_hexagon_V6_vtmpybus : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpybus">; +def int_hexagon_S6_rol_i_p_nac : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_nac">; -// -// BUILTIN_INFO(HEXAGON.V6_vtmpybus_128B,VD_ftype_VDSI,2) -// tag : V6_vtmpybus_128B -def int_hexagon_V6_vtmpybus_128B : -Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpybus_128B">; +def int_hexagon_S6_rol_i_p_acc : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vtmpybus_acc,VD_ftype_VDVDSI,3) -// tag : V6_vtmpybus_acc -def int_hexagon_V6_vtmpybus_acc : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpybus_acc">; +def int_hexagon_S6_rol_i_r_or : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vtmpybus_acc_128B,VD_ftype_VDVDSI,3) -// tag : V6_vtmpybus_acc_128B -def int_hexagon_V6_vtmpybus_acc_128B : -Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpybus_acc_128B">; +def int_hexagon_S6_rol_i_r : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S6_rol_i_r">; -// -// BUILTIN_INFO(HEXAGON.V6_vtmpyhb,VD_ftype_VDSI,2) -// tag : V6_vtmpyhb -def int_hexagon_V6_vtmpyhb : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpyhb">; +def int_hexagon_S6_rol_i_r_nac : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_nac">; -// -// BUILTIN_INFO(HEXAGON.V6_vtmpyhb_128B,VD_ftype_VDSI,2) -// tag : V6_vtmpyhb_128B -def int_hexagon_V6_vtmpyhb_128B : -Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpyhb_128B">; +def int_hexagon_S6_rol_i_p_or : +Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vtmpyhb_acc,VD_ftype_VDVDSI,3) -// tag : V6_vtmpyhb_acc -def int_hexagon_V6_vtmpyhb_acc : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vtmpyhb_acc">; +// V62 Scalar Instructions. -// -// BUILTIN_INFO(HEXAGON.V6_vtmpyhb_acc_128B,VD_ftype_VDVDSI,3) -// tag : V6_vtmpyhb_acc_128B -def int_hexagon_V6_vtmpyhb_acc_128B : -Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vtmpyhb_acc_128B">; +def int_hexagon_S6_vtrunehb_ppp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S6_vtrunehb_ppp">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyub,VI_ftype_VISI,2) -// tag : V6_vrmpyub -def int_hexagon_V6_vrmpyub : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vrmpyub">; +def int_hexagon_V6_ldntnt0 : +Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_ldntnt0">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyub_128B,VI_ftype_VISI,2) -// tag : V6_vrmpyub_128B -def int_hexagon_V6_vrmpyub_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vrmpyub_128B">; +def int_hexagon_M6_vabsdiffub : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M6_vabsdiffub">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyub_acc,VI_ftype_VIVISI,3) -// tag : V6_vrmpyub_acc -def int_hexagon_V6_vrmpyub_acc : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vrmpyub_acc">; +def int_hexagon_S6_vtrunohb_ppp : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S6_vtrunohb_ppp">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyub_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vrmpyub_acc_128B -def int_hexagon_V6_vrmpyub_acc_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vrmpyub_acc_128B">; +def int_hexagon_M6_vabsdiffb : +Hexagon_i64_i64i64_Intrinsic<"HEXAGON_M6_vabsdiffb">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyubv,VI_ftype_VIVI,2) -// tag : V6_vrmpyubv -def int_hexagon_V6_vrmpyubv : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vrmpyubv">; +def int_hexagon_A6_vminub_RdP : +Hexagon_i64i32_i64i64_Intrinsic<"HEXAGON_A6_vminub_RdP">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyubv_128B,VI_ftype_VIVI,2) -// tag : V6_vrmpyubv_128B -def int_hexagon_V6_vrmpyubv_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpyubv_128B">; +def int_hexagon_S6_vsplatrbp : +Hexagon_i64_i32_Intrinsic<"HEXAGON_S6_vsplatrbp">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyubv_acc,VI_ftype_VIVIVI,3) -// tag : V6_vrmpyubv_acc -def int_hexagon_V6_vrmpyubv_acc : -Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vrmpyubv_acc">; +// V65 Scalar Instructions. -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyubv_acc_128B,VI_ftype_VIVIVI,3) -// tag : V6_vrmpyubv_acc_128B -def int_hexagon_V6_vrmpyubv_acc_128B : -Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpyubv_acc_128B">; +def int_hexagon_A6_vcmpbeq_notany : +Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A6_vcmpbeq_notany">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybv,VI_ftype_VIVI,2) -// tag : V6_vrmpybv -def int_hexagon_V6_vrmpybv : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vrmpybv">; +// V66 Scalar Instructions. -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybv_128B,VI_ftype_VIVI,2) -// tag : V6_vrmpybv_128B -def int_hexagon_V6_vrmpybv_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpybv_128B">; +def int_hexagon_F2_dfsub : +Hexagon_double_doubledouble_Intrinsic<"HEXAGON_F2_dfsub">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybv_acc,VI_ftype_VIVIVI,3) -// tag : V6_vrmpybv_acc -def int_hexagon_V6_vrmpybv_acc : -Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vrmpybv_acc">; +def int_hexagon_F2_dfadd : +Hexagon_double_doubledouble_Intrinsic<"HEXAGON_F2_dfadd">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybv_acc_128B,VI_ftype_VIVIVI,3) -// tag : V6_vrmpybv_acc_128B -def int_hexagon_V6_vrmpybv_acc_128B : -Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpybv_acc_128B">; +def int_hexagon_M2_mnaci : +Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_mnaci">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyubi,VD_ftype_VDSISI,3) -// tag : V6_vrmpyubi -def int_hexagon_V6_vrmpyubi : -Hexagon_v1024v1024ii_Intrinsic<"HEXAGON_V6_vrmpyubi">; +def int_hexagon_S2_mask : +Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_mask">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyubi_128B,VD_ftype_VDSISI,3) -// tag : V6_vrmpyubi_128B -def int_hexagon_V6_vrmpyubi_128B : -Hexagon_v2048v2048ii_Intrinsic<"HEXAGON_V6_vrmpyubi_128B">; +// V60 HVX Instructions. -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyubi_acc,VD_ftype_VDVDSISI,4) -// tag : V6_vrmpyubi_acc -def int_hexagon_V6_vrmpyubi_acc : -Hexagon_v1024v1024v1024ii_Intrinsic<"HEXAGON_V6_vrmpyubi_acc">; +def int_hexagon_V6_veqb_or : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqb_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyubi_acc_128B,VD_ftype_VDVDSISI,4) -// tag : V6_vrmpyubi_acc_128B -def int_hexagon_V6_vrmpyubi_acc_128B : -Hexagon_v2048v2048v2048ii_Intrinsic<"HEXAGON_V6_vrmpyubi_acc_128B">; +def int_hexagon_V6_veqb_or_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqb_or_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybus,VI_ftype_VISI,2) -// tag : V6_vrmpybus -def int_hexagon_V6_vrmpybus : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vrmpybus">; +def int_hexagon_V6_vminub : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vminub">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybus_128B,VI_ftype_VISI,2) -// tag : V6_vrmpybus_128B -def int_hexagon_V6_vrmpybus_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vrmpybus_128B">; +def int_hexagon_V6_vminub_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vminub_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybus_acc,VI_ftype_VIVISI,3) -// tag : V6_vrmpybus_acc -def int_hexagon_V6_vrmpybus_acc : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vrmpybus_acc">; +def int_hexagon_V6_vaslw_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vaslw_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybus_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vrmpybus_acc_128B -def int_hexagon_V6_vrmpybus_acc_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vrmpybus_acc_128B">; +def int_hexagon_V6_vaslw_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vaslw_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybusi,VD_ftype_VDSISI,3) -// tag : V6_vrmpybusi -def int_hexagon_V6_vrmpybusi : -Hexagon_v1024v1024ii_Intrinsic<"HEXAGON_V6_vrmpybusi">; +def int_hexagon_V6_vmpyhvsrs : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyhvsrs">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybusi_128B,VD_ftype_VDSISI,3) -// tag : V6_vrmpybusi_128B -def int_hexagon_V6_vrmpybusi_128B : -Hexagon_v2048v2048ii_Intrinsic<"HEXAGON_V6_vrmpybusi_128B">; +def int_hexagon_V6_vmpyhvsrs_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyhvsrs_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybusi_acc,VD_ftype_VDVDSISI,4) -// tag : V6_vrmpybusi_acc -def int_hexagon_V6_vrmpybusi_acc : -Hexagon_v1024v1024v1024ii_Intrinsic<"HEXAGON_V6_vrmpybusi_acc">; +def int_hexagon_V6_vsathub : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsathub">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybusi_acc_128B,VD_ftype_VDVDSISI,4) -// tag : V6_vrmpybusi_acc_128B -def int_hexagon_V6_vrmpybusi_acc_128B : -Hexagon_v2048v2048v2048ii_Intrinsic<"HEXAGON_V6_vrmpybusi_acc_128B">; +def int_hexagon_V6_vsathub_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsathub_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybusv,VI_ftype_VIVI,2) -// tag : V6_vrmpybusv -def int_hexagon_V6_vrmpybusv : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vrmpybusv">; +def int_hexagon_V6_vaddh_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddh_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybusv_128B,VI_ftype_VIVI,2) -// tag : V6_vrmpybusv_128B -def int_hexagon_V6_vrmpybusv_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpybusv_128B">; +def int_hexagon_V6_vaddh_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddh_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybusv_acc,VI_ftype_VIVIVI,3) -// tag : V6_vrmpybusv_acc -def int_hexagon_V6_vrmpybusv_acc : -Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vrmpybusv_acc">; +def int_hexagon_V6_vrmpybusi : +Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybusv_acc_128B,VI_ftype_VIVIVI,3) -// tag : V6_vrmpybusv_acc_128B -def int_hexagon_V6_vrmpybusv_acc_128B : -Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrmpybusv_acc_128B">; +def int_hexagon_V6_vrmpybusi_128B : +Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vdsaduh,VD_ftype_VDSI,2) -// tag : V6_vdsaduh -def int_hexagon_V6_vdsaduh : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vdsaduh">; +def int_hexagon_V6_vshufoh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshufoh">; -// -// BUILTIN_INFO(HEXAGON.V6_vdsaduh_128B,VD_ftype_VDSI,2) -// tag : V6_vdsaduh_128B -def int_hexagon_V6_vdsaduh_128B : -Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vdsaduh_128B">; +def int_hexagon_V6_vshufoh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshufoh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vdsaduh_acc,VD_ftype_VDVDSI,3) -// tag : V6_vdsaduh_acc -def int_hexagon_V6_vdsaduh_acc : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vdsaduh_acc">; +def int_hexagon_V6_vasrwv : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vasrwv">; -// -// BUILTIN_INFO(HEXAGON.V6_vdsaduh_acc_128B,VD_ftype_VDVDSI,3) -// tag : V6_vdsaduh_acc_128B -def int_hexagon_V6_vdsaduh_acc_128B : -Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vdsaduh_acc_128B">; +def int_hexagon_V6_vasrwv_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vasrwv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrsadubi,VD_ftype_VDSISI,3) -// tag : V6_vrsadubi -def int_hexagon_V6_vrsadubi : -Hexagon_v1024v1024ii_Intrinsic<"HEXAGON_V6_vrsadubi">; +def int_hexagon_V6_vdmpyhsuisat : +Hexagon_v16i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsuisat">; -// -// BUILTIN_INFO(HEXAGON.V6_vrsadubi_128B,VD_ftype_VDSISI,3) -// tag : V6_vrsadubi_128B -def int_hexagon_V6_vrsadubi_128B : -Hexagon_v2048v2048ii_Intrinsic<"HEXAGON_V6_vrsadubi_128B">; +def int_hexagon_V6_vdmpyhsuisat_128B : +Hexagon_v32i32_v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrsadubi_acc,VD_ftype_VDVDSISI,4) -// tag : V6_vrsadubi_acc def int_hexagon_V6_vrsadubi_acc : -Hexagon_v1024v1024v1024ii_Intrinsic<"HEXAGON_V6_vrsadubi_acc">; +Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vrsadubi_acc_128B,VD_ftype_VDVDSISI,4) -// tag : V6_vrsadubi_acc_128B def int_hexagon_V6_vrsadubi_acc_128B : -Hexagon_v2048v2048v2048ii_Intrinsic<"HEXAGON_V6_vrsadubi_acc_128B">; +Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrw,VI_ftype_VISI,2) -// tag : V6_vasrw -def int_hexagon_V6_vasrw : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vasrw">; +def int_hexagon_V6_vnavgw : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vnavgw">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrw_128B,VI_ftype_VISI,2) -// tag : V6_vasrw_128B -def int_hexagon_V6_vasrw_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vasrw_128B">; +def int_hexagon_V6_vnavgw_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vnavgw_128B">; +def int_hexagon_V6_vnavgh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vnavgh">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslw,VI_ftype_VISI,2) -// tag : V6_vaslw -def int_hexagon_V6_vaslw : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vaslw">; +def int_hexagon_V6_vnavgh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vnavgh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslw_128B,VI_ftype_VISI,2) -// tag : V6_vaslw_128B -def int_hexagon_V6_vaslw_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vaslw_128B">; +def int_hexagon_V6_vavgub : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgub">; -// -// BUILTIN_INFO(HEXAGON.V6_vlsrw,VI_ftype_VISI,2) -// tag : V6_vlsrw -def int_hexagon_V6_vlsrw : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vlsrw">; +def int_hexagon_V6_vavgub_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgub_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlsrw_128B,VI_ftype_VISI,2) -// tag : V6_vlsrw_128B -def int_hexagon_V6_vlsrw_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vlsrw_128B">; +def int_hexagon_V6_vsubb : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubb">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwv,VI_ftype_VIVI,2) -// tag : V6_vasrwv -def int_hexagon_V6_vasrwv : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vasrwv">; +def int_hexagon_V6_vsubb_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwv_128B,VI_ftype_VIVI,2) -// tag : V6_vasrwv_128B -def int_hexagon_V6_vasrwv_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vasrwv_128B">; +def int_hexagon_V6_vgtw_and : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtw_and">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslwv,VI_ftype_VIVI,2) -// tag : V6_vaslwv -def int_hexagon_V6_vaslwv : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaslwv">; +def int_hexagon_V6_vgtw_and_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtw_and_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslwv_128B,VI_ftype_VIVI,2) -// tag : V6_vaslwv_128B -def int_hexagon_V6_vaslwv_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaslwv_128B">; +def int_hexagon_V6_vavgubrnd : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgubrnd">; -// -// BUILTIN_INFO(HEXAGON.V6_vlsrwv,VI_ftype_VIVI,2) -// tag : V6_vlsrwv -def int_hexagon_V6_vlsrwv : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vlsrwv">; +def int_hexagon_V6_vavgubrnd_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgubrnd_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlsrwv_128B,VI_ftype_VIVI,2) -// tag : V6_vlsrwv_128B -def int_hexagon_V6_vlsrwv_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vlsrwv_128B">; +def int_hexagon_V6_vrmpybusv : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpybusv">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrh,VI_ftype_VISI,2) -// tag : V6_vasrh -def int_hexagon_V6_vasrh : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vasrh">; +def int_hexagon_V6_vrmpybusv_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpybusv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrh_128B,VI_ftype_VISI,2) -// tag : V6_vasrh_128B -def int_hexagon_V6_vasrh_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vasrh_128B">; +def int_hexagon_V6_vsubbnq : +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubbnq">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslh,VI_ftype_VISI,2) -// tag : V6_vaslh -def int_hexagon_V6_vaslh : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vaslh">; +def int_hexagon_V6_vsubbnq_128B : +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubbnq_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslh_128B,VI_ftype_VISI,2) -// tag : V6_vaslh_128B -def int_hexagon_V6_vaslh_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vaslh_128B">; +def int_hexagon_V6_vroundhb : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vroundhb">; -// -// BUILTIN_INFO(HEXAGON.V6_vlsrh,VI_ftype_VISI,2) -// tag : V6_vlsrh -def int_hexagon_V6_vlsrh : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vlsrh">; +def int_hexagon_V6_vroundhb_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vroundhb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlsrh_128B,VI_ftype_VISI,2) -// tag : V6_vlsrh_128B -def int_hexagon_V6_vlsrh_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vlsrh_128B">; +def int_hexagon_V6_vadduhsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduhsat_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrhv,VI_ftype_VIVI,2) -// tag : V6_vasrhv -def int_hexagon_V6_vasrhv : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vasrhv">; +def int_hexagon_V6_vadduhsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vadduhsat_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrhv_128B,VI_ftype_VIVI,2) -// tag : V6_vasrhv_128B -def int_hexagon_V6_vasrhv_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vasrhv_128B">; +def int_hexagon_V6_vsububsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsububsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslhv,VI_ftype_VIVI,2) -// tag : V6_vaslhv -def int_hexagon_V6_vaslhv : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaslhv">; +def int_hexagon_V6_vsububsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsububsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslhv_128B,VI_ftype_VIVI,2) -// tag : V6_vaslhv_128B -def int_hexagon_V6_vaslhv_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaslhv_128B">; +def int_hexagon_V6_vmpabus_acc : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpabus_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vlsrhv,VI_ftype_VIVI,2) -// tag : V6_vlsrhv -def int_hexagon_V6_vlsrhv : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vlsrhv">; +def int_hexagon_V6_vmpabus_acc_128B : +Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vmpabus_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlsrhv_128B,VI_ftype_VIVI,2) -// tag : V6_vlsrhv_128B -def int_hexagon_V6_vlsrhv_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vlsrhv_128B">; +def int_hexagon_V6_vmux : +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vmux">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwh,VI_ftype_VIVISI,3) -// tag : V6_vasrwh -def int_hexagon_V6_vasrwh : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrwh">; +def int_hexagon_V6_vmux_128B : +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vmux_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwh_128B,VI_ftype_VIVISI,3) -// tag : V6_vasrwh_128B -def int_hexagon_V6_vasrwh_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrwh_128B">; +def int_hexagon_V6_vmpyhus : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyhus">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwhsat,VI_ftype_VIVISI,3) -// tag : V6_vasrwhsat -def int_hexagon_V6_vasrwhsat : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrwhsat">; +def int_hexagon_V6_vmpyhus_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyhus_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwhsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasrwhsat_128B -def int_hexagon_V6_vasrwhsat_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrwhsat_128B">; +def int_hexagon_V6_vpackeb : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackeb">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwhrndsat,VI_ftype_VIVISI,3) -// tag : V6_vasrwhrndsat -def int_hexagon_V6_vasrwhrndsat : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrwhrndsat">; +def int_hexagon_V6_vpackeb_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackeb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwhrndsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasrwhrndsat_128B -def int_hexagon_V6_vasrwhrndsat_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrwhrndsat_128B">; +def int_hexagon_V6_vsubhnq : +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubhnq">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwuhsat,VI_ftype_VIVISI,3) -// tag : V6_vasrwuhsat -def int_hexagon_V6_vasrwuhsat : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrwuhsat">; +def int_hexagon_V6_vsubhnq_128B : +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubhnq_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwuhsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasrwuhsat_128B -def int_hexagon_V6_vasrwuhsat_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrwuhsat_128B">; +def int_hexagon_V6_vavghrnd : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavghrnd">; -// -// BUILTIN_INFO(HEXAGON.V6_vroundwh,VI_ftype_VIVI,2) -// tag : V6_vroundwh -def int_hexagon_V6_vroundwh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vroundwh">; +def int_hexagon_V6_vavghrnd_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavghrnd_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vroundwh_128B,VI_ftype_VIVI,2) -// tag : V6_vroundwh_128B -def int_hexagon_V6_vroundwh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vroundwh_128B">; +def int_hexagon_V6_vtran2x2_map : +Hexagon_v16i32v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vtran2x2_map">; -// -// BUILTIN_INFO(HEXAGON.V6_vroundwuh,VI_ftype_VIVI,2) -// tag : V6_vroundwuh -def int_hexagon_V6_vroundwuh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vroundwuh">; +def int_hexagon_V6_vtran2x2_map_128B : +Hexagon_v32i32v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vtran2x2_map_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vroundwuh_128B,VI_ftype_VIVI,2) -// tag : V6_vroundwuh_128B -def int_hexagon_V6_vroundwuh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vroundwuh_128B">; +def int_hexagon_V6_vdelta : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vdelta">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrhubsat,VI_ftype_VIVISI,3) -// tag : V6_vasrhubsat -def int_hexagon_V6_vasrhubsat : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrhubsat">; +def int_hexagon_V6_vdelta_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vdelta_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrhubsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasrhubsat_128B -def int_hexagon_V6_vasrhubsat_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrhubsat_128B">; +def int_hexagon_V6_vgtuh_and : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuh_and">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrhubrndsat,VI_ftype_VIVISI,3) -// tag : V6_vasrhubrndsat -def int_hexagon_V6_vasrhubrndsat : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrhubrndsat">; +def int_hexagon_V6_vgtuh_and_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuh_and_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrhubrndsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasrhubrndsat_128B -def int_hexagon_V6_vasrhubrndsat_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrhubrndsat_128B">; +def int_hexagon_V6_vtmpyhb : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vtmpyhb">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrhbrndsat,VI_ftype_VIVISI,3) -// tag : V6_vasrhbrndsat -def int_hexagon_V6_vasrhbrndsat : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrhbrndsat">; +def int_hexagon_V6_vtmpyhb_128B : +Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vtmpyhb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrhbrndsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasrhbrndsat_128B -def int_hexagon_V6_vasrhbrndsat_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrhbrndsat_128B">; +def int_hexagon_V6_vpackob : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackob">; + +def int_hexagon_V6_vpackob_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackob_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vroundhb,VI_ftype_VIVI,2) -// tag : V6_vroundhb -def int_hexagon_V6_vroundhb : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vroundhb">; +def int_hexagon_V6_vmaxh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmaxh">; -// -// BUILTIN_INFO(HEXAGON.V6_vroundhb_128B,VI_ftype_VIVI,2) -// tag : V6_vroundhb_128B -def int_hexagon_V6_vroundhb_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vroundhb_128B">; +def int_hexagon_V6_vmaxh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmaxh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vroundhub,VI_ftype_VIVI,2) -// tag : V6_vroundhub -def int_hexagon_V6_vroundhub : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vroundhub">; +def int_hexagon_V6_vtmpybus_acc : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vtmpybus_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vroundhub_128B,VI_ftype_VIVI,2) -// tag : V6_vroundhub_128B -def int_hexagon_V6_vroundhub_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vroundhub_128B">; +def int_hexagon_V6_vtmpybus_acc_128B : +Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vtmpybus_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslw_acc,VI_ftype_VIVISI,3) -// tag : V6_vaslw_acc -def int_hexagon_V6_vaslw_acc : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vaslw_acc">; +def int_hexagon_V6_vsubuhsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubuhsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslw_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vaslw_acc_128B -def int_hexagon_V6_vaslw_acc_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vaslw_acc_128B">; +def int_hexagon_V6_vsubuhsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubuhsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrw_acc,VI_ftype_VIVISI,3) -// tag : V6_vasrw_acc def int_hexagon_V6_vasrw_acc : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrw_acc">; +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrw_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrw_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vasrw_acc_128B def int_hexagon_V6_vasrw_acc_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrw_acc_128B">; +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrw_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddb,VI_ftype_VIVI,2) -// tag : V6_vaddb -def int_hexagon_V6_vaddb : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddb">; +def int_hexagon_V6_pred_or : +Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_pred_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddb_128B,VI_ftype_VIVI,2) -// tag : V6_vaddb_128B -def int_hexagon_V6_vaddb_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddb_128B">; +def int_hexagon_V6_pred_or_128B : +Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_pred_or_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubb,VI_ftype_VIVI,2) -// tag : V6_vsubb -def int_hexagon_V6_vsubb : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubb">; +def int_hexagon_V6_vrmpyub_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vrmpyub_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubb_128B,VI_ftype_VIVI,2) -// tag : V6_vsubb_128B -def int_hexagon_V6_vsubb_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubb_128B">; +def int_hexagon_V6_vrmpyub_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vrmpyub_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddb_dv,VD_ftype_VDVD,2) -// tag : V6_vaddb_dv -def int_hexagon_V6_vaddb_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddb_dv">; +def int_hexagon_V6_lo : +Hexagon_v16i32_v32i32_Intrinsic<"HEXAGON_V6_lo">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddb_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vaddb_dv_128B -def int_hexagon_V6_vaddb_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddb_dv_128B">; +def int_hexagon_V6_lo_128B : +Hexagon_v32i32_v64i32_Intrinsic<"HEXAGON_V6_lo_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubb_dv,VD_ftype_VDVD,2) -// tag : V6_vsubb_dv def int_hexagon_V6_vsubb_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubb_dv">; +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubb_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubb_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vsubb_dv_128B def int_hexagon_V6_vsubb_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubb_dv_128B">; - -// -// BUILTIN_INFO(HEXAGON.V6_vaddh,VI_ftype_VIVI,2) -// tag : V6_vaddh -def int_hexagon_V6_vaddh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddh">; - -// -// BUILTIN_INFO(HEXAGON.V6_vaddh_128B,VI_ftype_VIVI,2) -// tag : V6_vaddh_128B -def int_hexagon_V6_vaddh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddh_128B">; +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubb_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubh,VI_ftype_VIVI,2) -// tag : V6_vsubh -def int_hexagon_V6_vsubh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubh">; +def int_hexagon_V6_vsubhsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubhsat_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubh_128B,VI_ftype_VIVI,2) -// tag : V6_vsubh_128B -def int_hexagon_V6_vsubh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubh_128B">; +def int_hexagon_V6_vsubhsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubhsat_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddh_dv,VD_ftype_VDVD,2) -// tag : V6_vaddh_dv -def int_hexagon_V6_vaddh_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddh_dv">; +def int_hexagon_V6_vmpyiwh : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwh">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddh_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vaddh_dv_128B -def int_hexagon_V6_vaddh_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddh_dv_128B">; +def int_hexagon_V6_vmpyiwh_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubh_dv,VD_ftype_VDVD,2) -// tag : V6_vsubh_dv -def int_hexagon_V6_vsubh_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubh_dv">; +def int_hexagon_V6_vmpyiwb : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwb">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubh_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vsubh_dv_128B -def int_hexagon_V6_vsubh_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubh_dv_128B">; +def int_hexagon_V6_vmpyiwb_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddw,VI_ftype_VIVI,2) -// tag : V6_vaddw -def int_hexagon_V6_vaddw : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddw">; +def int_hexagon_V6_ldu0 : +Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_ldu0">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddw_128B,VI_ftype_VIVI,2) -// tag : V6_vaddw_128B -def int_hexagon_V6_vaddw_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddw_128B">; +def int_hexagon_V6_ldu0_128B : +Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_ldu0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubw,VI_ftype_VIVI,2) -// tag : V6_vsubw -def int_hexagon_V6_vsubw : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubw">; +def int_hexagon_V6_vgtuh_xor : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuh_xor">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubw_128B,VI_ftype_VIVI,2) -// tag : V6_vsubw_128B -def int_hexagon_V6_vsubw_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubw_128B">; +def int_hexagon_V6_vgtuh_xor_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuh_xor_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddw_dv,VD_ftype_VDVD,2) -// tag : V6_vaddw_dv -def int_hexagon_V6_vaddw_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddw_dv">; +def int_hexagon_V6_vgth_or : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgth_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddw_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vaddw_dv_128B -def int_hexagon_V6_vaddw_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddw_dv_128B">; +def int_hexagon_V6_vgth_or_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgth_or_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubw_dv,VD_ftype_VDVD,2) -// tag : V6_vsubw_dv -def int_hexagon_V6_vsubw_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubw_dv">; +def int_hexagon_V6_vavgh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgh">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubw_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vsubw_dv_128B -def int_hexagon_V6_vsubw_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubw_dv_128B">; +def int_hexagon_V6_vavgh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddubsat,VI_ftype_VIVI,2) -// tag : V6_vaddubsat -def int_hexagon_V6_vaddubsat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddubsat">; +def int_hexagon_V6_vlalignb : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlalignb">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddubsat_128B,VI_ftype_VIVI,2) -// tag : V6_vaddubsat_128B -def int_hexagon_V6_vaddubsat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddubsat_128B">; +def int_hexagon_V6_vlalignb_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlalignb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddubsat_dv,VD_ftype_VDVD,2) -// tag : V6_vaddubsat_dv -def int_hexagon_V6_vaddubsat_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddubsat_dv">; +def int_hexagon_V6_vsh : +Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vsh">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddubsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vaddubsat_dv_128B -def int_hexagon_V6_vaddubsat_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddubsat_dv_128B">; +def int_hexagon_V6_vsh_128B : +Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vsh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsububsat,VI_ftype_VIVI,2) -// tag : V6_vsububsat -def int_hexagon_V6_vsububsat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsububsat">; +def int_hexagon_V6_pred_and_n : +Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_pred_and_n">; -// -// BUILTIN_INFO(HEXAGON.V6_vsububsat_128B,VI_ftype_VIVI,2) -// tag : V6_vsububsat_128B -def int_hexagon_V6_vsububsat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsububsat_128B">; +def int_hexagon_V6_pred_and_n_128B : +Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_pred_and_n_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsububsat_dv,VD_ftype_VDVD,2) -// tag : V6_vsububsat_dv -def int_hexagon_V6_vsububsat_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsububsat_dv">; +def int_hexagon_V6_vsb : +Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vsb">; -// -// BUILTIN_INFO(HEXAGON.V6_vsububsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vsububsat_dv_128B -def int_hexagon_V6_vsububsat_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsububsat_dv_128B">; +def int_hexagon_V6_vsb_128B : +Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vsb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduhsat,VI_ftype_VIVI,2) -// tag : V6_vadduhsat -def int_hexagon_V6_vadduhsat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vadduhsat">; +def int_hexagon_V6_vroundwuh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vroundwuh">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduhsat_128B,VI_ftype_VIVI,2) -// tag : V6_vadduhsat_128B -def int_hexagon_V6_vadduhsat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vadduhsat_128B">; +def int_hexagon_V6_vroundwuh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vroundwuh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduhsat_dv,VD_ftype_VDVD,2) -// tag : V6_vadduhsat_dv -def int_hexagon_V6_vadduhsat_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vadduhsat_dv">; +def int_hexagon_V6_vasrhv : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vasrhv">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduhsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vadduhsat_dv_128B -def int_hexagon_V6_vadduhsat_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vadduhsat_dv_128B">; +def int_hexagon_V6_vasrhv_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vasrhv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubuhsat,VI_ftype_VIVI,2) -// tag : V6_vsubuhsat -def int_hexagon_V6_vsubuhsat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubuhsat">; +def int_hexagon_V6_vshuffh : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vshuffh">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubuhsat_128B,VI_ftype_VIVI,2) -// tag : V6_vsubuhsat_128B -def int_hexagon_V6_vsubuhsat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubuhsat_128B">; +def int_hexagon_V6_vshuffh_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vshuffh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubuhsat_dv,VD_ftype_VDVD,2) -// tag : V6_vsubuhsat_dv -def int_hexagon_V6_vsubuhsat_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubuhsat_dv">; +def int_hexagon_V6_vaddhsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhsat_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubuhsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vsubuhsat_dv_128B -def int_hexagon_V6_vsubuhsat_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubuhsat_dv_128B">; +def int_hexagon_V6_vaddhsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddhsat_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhsat,VI_ftype_VIVI,2) -// tag : V6_vaddhsat -def int_hexagon_V6_vaddhsat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddhsat">; +def int_hexagon_V6_vnavgub : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vnavgub">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhsat_128B,VI_ftype_VIVI,2) -// tag : V6_vaddhsat_128B -def int_hexagon_V6_vaddhsat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddhsat_128B">; +def int_hexagon_V6_vnavgub_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vnavgub_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhsat_dv,VD_ftype_VDVD,2) -// tag : V6_vaddhsat_dv -def int_hexagon_V6_vaddhsat_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddhsat_dv">; +def int_hexagon_V6_vrmpybv : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpybv">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vaddhsat_dv_128B -def int_hexagon_V6_vaddhsat_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddhsat_dv_128B">; +def int_hexagon_V6_vrmpybv_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpybv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubhsat,VI_ftype_VIVI,2) -// tag : V6_vsubhsat -def int_hexagon_V6_vsubhsat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubhsat">; +def int_hexagon_V6_vnormamth : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vnormamth">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubhsat_128B,VI_ftype_VIVI,2) -// tag : V6_vsubhsat_128B -def int_hexagon_V6_vsubhsat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubhsat_128B">; +def int_hexagon_V6_vnormamth_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vnormamth_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubhsat_dv,VD_ftype_VDVD,2) -// tag : V6_vsubhsat_dv -def int_hexagon_V6_vsubhsat_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubhsat_dv">; +def int_hexagon_V6_vdmpyhb : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubhsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vsubhsat_dv_128B -def int_hexagon_V6_vsubhsat_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubhsat_dv_128B">; +def int_hexagon_V6_vdmpyhb_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddwsat,VI_ftype_VIVI,2) -// tag : V6_vaddwsat -def int_hexagon_V6_vaddwsat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vaddwsat">; +def int_hexagon_V6_vavguh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavguh">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddwsat_128B,VI_ftype_VIVI,2) -// tag : V6_vaddwsat_128B -def int_hexagon_V6_vaddwsat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddwsat_128B">; +def int_hexagon_V6_vavguh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavguh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddwsat_dv,VD_ftype_VDVD,2) -// tag : V6_vaddwsat_dv -def int_hexagon_V6_vaddwsat_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddwsat_dv">; +def int_hexagon_V6_vlsrwv : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vlsrwv">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddwsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vaddwsat_dv_128B -def int_hexagon_V6_vaddwsat_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddwsat_dv_128B">; +def int_hexagon_V6_vlsrwv_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vlsrwv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubwsat,VI_ftype_VIVI,2) -// tag : V6_vsubwsat -def int_hexagon_V6_vsubwsat : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsubwsat">; +def int_hexagon_V6_vlsrhv : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vlsrhv">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubwsat_128B,VI_ftype_VIVI,2) -// tag : V6_vsubwsat_128B -def int_hexagon_V6_vsubwsat_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubwsat_128B">; +def int_hexagon_V6_vlsrhv_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vlsrhv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubwsat_dv,VD_ftype_VDVD,2) -// tag : V6_vsubwsat_dv -def int_hexagon_V6_vsubwsat_dv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubwsat_dv">; +def int_hexagon_V6_vdmpyhisat : +Hexagon_v16i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhisat">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubwsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vsubwsat_dv_128B -def int_hexagon_V6_vsubwsat_dv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubwsat_dv_128B">; +def int_hexagon_V6_vdmpyhisat_128B : +Hexagon_v32i32_v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhisat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgub,VI_ftype_VIVI,2) -// tag : V6_vavgub -def int_hexagon_V6_vavgub : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavgub">; +def int_hexagon_V6_vdmpyhvsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vdmpyhvsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgub_128B,VI_ftype_VIVI,2) -// tag : V6_vavgub_128B -def int_hexagon_V6_vavgub_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgub_128B">; +def int_hexagon_V6_vdmpyhvsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vdmpyhvsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgubrnd,VI_ftype_VIVI,2) -// tag : V6_vavgubrnd -def int_hexagon_V6_vavgubrnd : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavgubrnd">; +def int_hexagon_V6_vaddw : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddw">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgubrnd_128B,VI_ftype_VIVI,2) -// tag : V6_vavgubrnd_128B -def int_hexagon_V6_vavgubrnd_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgubrnd_128B">; +def int_hexagon_V6_vaddw_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavguh,VI_ftype_VIVI,2) -// tag : V6_vavguh -def int_hexagon_V6_vavguh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavguh">; +def int_hexagon_V6_vzh : +Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vzh">; -// -// BUILTIN_INFO(HEXAGON.V6_vavguh_128B,VI_ftype_VIVI,2) -// tag : V6_vavguh_128B -def int_hexagon_V6_vavguh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavguh_128B">; +def int_hexagon_V6_vzh_128B : +Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vzh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavguhrnd,VI_ftype_VIVI,2) -// tag : V6_vavguhrnd -def int_hexagon_V6_vavguhrnd : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavguhrnd">; +def int_hexagon_V6_vaddh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddh">; -// -// BUILTIN_INFO(HEXAGON.V6_vavguhrnd_128B,VI_ftype_VIVI,2) -// tag : V6_vavguhrnd_128B -def int_hexagon_V6_vavguhrnd_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavguhrnd_128B">; +def int_hexagon_V6_vaddh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgh,VI_ftype_VIVI,2) -// tag : V6_vavgh -def int_hexagon_V6_vavgh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavgh">; +def int_hexagon_V6_vmaxub : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmaxub">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgh_128B,VI_ftype_VIVI,2) -// tag : V6_vavgh_128B -def int_hexagon_V6_vavgh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgh_128B">; +def int_hexagon_V6_vmaxub_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmaxub_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavghrnd,VI_ftype_VIVI,2) -// tag : V6_vavghrnd -def int_hexagon_V6_vavghrnd : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavghrnd">; +def int_hexagon_V6_vmpyhv_acc : +Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyhv_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vavghrnd_128B,VI_ftype_VIVI,2) -// tag : V6_vavghrnd_128B -def int_hexagon_V6_vavghrnd_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavghrnd_128B">; +def int_hexagon_V6_vmpyhv_acc_128B : +Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyhv_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vnavgh,VI_ftype_VIVI,2) -// tag : V6_vnavgh -def int_hexagon_V6_vnavgh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vnavgh">; +def int_hexagon_V6_vadduhsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vadduhsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vnavgh_128B,VI_ftype_VIVI,2) -// tag : V6_vnavgh_128B -def int_hexagon_V6_vnavgh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vnavgh_128B">; +def int_hexagon_V6_vadduhsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduhsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgw,VI_ftype_VIVI,2) -// tag : V6_vavgw -def int_hexagon_V6_vavgw : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavgw">; +def int_hexagon_V6_vshufoeh : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshufoeh">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgw_128B,VI_ftype_VIVI,2) -// tag : V6_vavgw_128B -def int_hexagon_V6_vavgw_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgw_128B">; +def int_hexagon_V6_vshufoeh_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshufoeh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgwrnd,VI_ftype_VIVI,2) -// tag : V6_vavgwrnd -def int_hexagon_V6_vavgwrnd : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vavgwrnd">; +def int_hexagon_V6_vmpyuhv_acc : +Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyuhv_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgwrnd_128B,VI_ftype_VIVI,2) -// tag : V6_vavgwrnd_128B -def int_hexagon_V6_vavgwrnd_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgwrnd_128B">; +def int_hexagon_V6_vmpyuhv_acc_128B : +Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyuhv_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vnavgw,VI_ftype_VIVI,2) -// tag : V6_vnavgw -def int_hexagon_V6_vnavgw : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vnavgw">; +def int_hexagon_V6_veqh : +Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqh">; -// -// BUILTIN_INFO(HEXAGON.V6_vnavgw_128B,VI_ftype_VIVI,2) -// tag : V6_vnavgw_128B -def int_hexagon_V6_vnavgw_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vnavgw_128B">; +def int_hexagon_V6_veqh_128B : +Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsdiffub,VI_ftype_VIVI,2) -// tag : V6_vabsdiffub -def int_hexagon_V6_vabsdiffub : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vabsdiffub">; +def int_hexagon_V6_vmpabuuv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpabuuv">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsdiffub_128B,VI_ftype_VIVI,2) -// tag : V6_vabsdiffub_128B -def int_hexagon_V6_vabsdiffub_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vabsdiffub_128B">; +def int_hexagon_V6_vmpabuuv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vmpabuuv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsdiffuh,VI_ftype_VIVI,2) -// tag : V6_vabsdiffuh -def int_hexagon_V6_vabsdiffuh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vabsdiffuh">; +def int_hexagon_V6_vasrwhsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrwhsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsdiffuh_128B,VI_ftype_VIVI,2) -// tag : V6_vabsdiffuh_128B -def int_hexagon_V6_vabsdiffuh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vabsdiffuh_128B">; +def int_hexagon_V6_vasrwhsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrwhsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsdiffh,VI_ftype_VIVI,2) -// tag : V6_vabsdiffh -def int_hexagon_V6_vabsdiffh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vabsdiffh">; +def int_hexagon_V6_vminuh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vminuh">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsdiffh_128B,VI_ftype_VIVI,2) -// tag : V6_vabsdiffh_128B -def int_hexagon_V6_vabsdiffh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vabsdiffh_128B">; +def int_hexagon_V6_vminuh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vminuh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsdiffw,VI_ftype_VIVI,2) -// tag : V6_vabsdiffw -def int_hexagon_V6_vabsdiffw : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vabsdiffw">; +def int_hexagon_V6_vror : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vror">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsdiffw_128B,VI_ftype_VIVI,2) -// tag : V6_vabsdiffw_128B -def int_hexagon_V6_vabsdiffw_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vabsdiffw_128B">; +def int_hexagon_V6_vror_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vror_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vnavgub,VI_ftype_VIVI,2) -// tag : V6_vnavgub -def int_hexagon_V6_vnavgub : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vnavgub">; +def int_hexagon_V6_vmpyowh_rnd_sacc : +Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_sacc">; -// -// BUILTIN_INFO(HEXAGON.V6_vnavgub_128B,VI_ftype_VIVI,2) -// tag : V6_vnavgub_128B -def int_hexagon_V6_vnavgub_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vnavgub_128B">; +def int_hexagon_V6_vmpyowh_rnd_sacc_128B : +Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_sacc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddubh,VD_ftype_VIVI,2) -// tag : V6_vaddubh -def int_hexagon_V6_vaddubh : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vaddubh">; +def int_hexagon_V6_vmaxuh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmaxuh">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddubh_128B,VD_ftype_VIVI,2) -// tag : V6_vaddubh_128B -def int_hexagon_V6_vaddubh_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vaddubh_128B">; +def int_hexagon_V6_vmaxuh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmaxuh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsububh,VD_ftype_VIVI,2) -// tag : V6_vsububh -def int_hexagon_V6_vsububh : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vsububh">; +def int_hexagon_V6_vabsh_sat : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsh_sat">; -// -// BUILTIN_INFO(HEXAGON.V6_vsububh_128B,VD_ftype_VIVI,2) -// tag : V6_vsububh_128B -def int_hexagon_V6_vsububh_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vsububh_128B">; +def int_hexagon_V6_vabsh_sat_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsh_sat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhw,VD_ftype_VIVI,2) -// tag : V6_vaddhw -def int_hexagon_V6_vaddhw : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vaddhw">; +def int_hexagon_V6_pred_or_n : +Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_pred_or_n">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhw_128B,VD_ftype_VIVI,2) -// tag : V6_vaddhw_128B -def int_hexagon_V6_vaddhw_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vaddhw_128B">; +def int_hexagon_V6_pred_or_n_128B : +Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_pred_or_n_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubhw,VD_ftype_VIVI,2) -// tag : V6_vsubhw -def int_hexagon_V6_vsubhw : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vsubhw">; +def int_hexagon_V6_vdealb : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vdealb">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubhw_128B,VD_ftype_VIVI,2) -// tag : V6_vsubhw_128B -def int_hexagon_V6_vsubhw_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vsubhw_128B">; +def int_hexagon_V6_vdealb_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vdealb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduhw,VD_ftype_VIVI,2) -// tag : V6_vadduhw -def int_hexagon_V6_vadduhw : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vadduhw">; +def int_hexagon_V6_vmpybusv : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpybusv">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduhw_128B,VD_ftype_VIVI,2) -// tag : V6_vadduhw_128B -def int_hexagon_V6_vadduhw_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vadduhw_128B">; +def int_hexagon_V6_vmpybusv_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpybusv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubuhw,VD_ftype_VIVI,2) -// tag : V6_vsubuhw -def int_hexagon_V6_vsubuhw : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vsubuhw">; +def int_hexagon_V6_vzb : +Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vzb">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubuhw_128B,VD_ftype_VIVI,2) -// tag : V6_vsubuhw_128B -def int_hexagon_V6_vsubuhw_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vsubuhw_128B">; +def int_hexagon_V6_vzb_128B : +Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vzb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vd0,VI_ftype_,0) -// tag : V6_vd0 -def int_hexagon_V6_vd0 : -Hexagon_v512_Intrinsic<"HEXAGON_V6_vd0">; +def int_hexagon_V6_vdmpybus_dv : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vd0_128B,VI_ftype_,0) -// tag : V6_vd0_128B -def int_hexagon_V6_vd0_128B : -Hexagon_v1024_Intrinsic<"HEXAGON_V6_vd0_128B">; +def int_hexagon_V6_vdmpybus_dv_128B : +Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddbq,VI_ftype_QVVIVI,3) -// tag : V6_vaddbq def int_hexagon_V6_vaddbq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddbq">; +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddbq">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddbq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vaddbq_128B def int_hexagon_V6_vaddbq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddbq_128B">; - +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddbq_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubbq,VI_ftype_QVVIVI,3) -// tag : V6_vsubbq -def int_hexagon_V6_vsubbq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubbq">; +def int_hexagon_V6_vaddb : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddb">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubbq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vsubbq_128B -def int_hexagon_V6_vsubbq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubbq_128B">; +def int_hexagon_V6_vaddb_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddbnq,VI_ftype_QVVIVI,3) -// tag : V6_vaddbnq -def int_hexagon_V6_vaddbnq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddbnq">; +def int_hexagon_V6_vaddwq : +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddwq">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddbnq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vaddbnq_128B -def int_hexagon_V6_vaddbnq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddbnq_128B">; +def int_hexagon_V6_vaddwq_128B : +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddwq_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubbnq,VI_ftype_QVVIVI,3) -// tag : V6_vsubbnq -def int_hexagon_V6_vsubbnq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubbnq">; +def int_hexagon_V6_vasrhubrndsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrhubrndsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubbnq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vsubbnq_128B -def int_hexagon_V6_vsubbnq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubbnq_128B">; +def int_hexagon_V6_vasrhubrndsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrhubrndsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhq,VI_ftype_QVVIVI,3) -// tag : V6_vaddhq -def int_hexagon_V6_vaddhq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddhq">; +def int_hexagon_V6_vasrhubsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrhubsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vaddhq_128B -def int_hexagon_V6_vaddhq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddhq_128B">; +def int_hexagon_V6_vasrhubsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrhubsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubhq,VI_ftype_QVVIVI,3) -// tag : V6_vsubhq -def int_hexagon_V6_vsubhq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubhq">; +def int_hexagon_V6_vshufoeb : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshufoeb">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubhq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vsubhq_128B -def int_hexagon_V6_vsubhq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubhq_128B">; +def int_hexagon_V6_vshufoeb_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshufoeb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhnq,VI_ftype_QVVIVI,3) -// tag : V6_vaddhnq -def int_hexagon_V6_vaddhnq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddhnq">; +def int_hexagon_V6_vpackhub_sat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackhub_sat">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhnq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vaddhnq_128B -def int_hexagon_V6_vaddhnq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddhnq_128B">; +def int_hexagon_V6_vpackhub_sat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackhub_sat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubhnq,VI_ftype_QVVIVI,3) -// tag : V6_vsubhnq -def int_hexagon_V6_vsubhnq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubhnq">; +def int_hexagon_V6_vmpyiwh_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwh_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubhnq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vsubhnq_128B -def int_hexagon_V6_vsubhnq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubhnq_128B">; +def int_hexagon_V6_vmpyiwh_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwh_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddwq,VI_ftype_QVVIVI,3) -// tag : V6_vaddwq -def int_hexagon_V6_vaddwq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddwq">; +def int_hexagon_V6_vtmpyb : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vtmpyb">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddwq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vaddwq_128B -def int_hexagon_V6_vaddwq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddwq_128B">; +def int_hexagon_V6_vtmpyb_128B : +Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vtmpyb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubwq,VI_ftype_QVVIVI,3) -// tag : V6_vsubwq -def int_hexagon_V6_vsubwq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubwq">; +def int_hexagon_V6_vmpabusv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpabusv">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubwq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vsubwq_128B -def int_hexagon_V6_vsubwq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubwq_128B">; +def int_hexagon_V6_vmpabusv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vmpabusv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddwnq,VI_ftype_QVVIVI,3) -// tag : V6_vaddwnq -def int_hexagon_V6_vaddwnq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vaddwnq">; +def int_hexagon_V6_pred_and : +Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_pred_and">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddwnq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vaddwnq_128B -def int_hexagon_V6_vaddwnq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vaddwnq_128B">; +def int_hexagon_V6_pred_and_128B : +Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_pred_and_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubwnq,VI_ftype_QVVIVI,3) -// tag : V6_vsubwnq def int_hexagon_V6_vsubwnq : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vsubwnq">; +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubwnq">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubwnq_128B,VI_ftype_QVVIVI,3) -// tag : V6_vsubwnq_128B def int_hexagon_V6_vsubwnq_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vsubwnq_128B">; +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubwnq_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsh,VI_ftype_VI,1) -// tag : V6_vabsh -def int_hexagon_V6_vabsh : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vabsh">; +def int_hexagon_V6_vpackwuh_sat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackwuh_sat">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsh_128B,VI_ftype_VI,1) -// tag : V6_vabsh_128B -def int_hexagon_V6_vabsh_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vabsh_128B">; +def int_hexagon_V6_vpackwuh_sat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackwuh_sat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsh_sat,VI_ftype_VI,1) -// tag : V6_vabsh_sat -def int_hexagon_V6_vabsh_sat : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vabsh_sat">; +def int_hexagon_V6_vswap : +Hexagon_v32i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vswap">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsh_sat_128B,VI_ftype_VI,1) -// tag : V6_vabsh_sat_128B -def int_hexagon_V6_vabsh_sat_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vabsh_sat_128B">; +def int_hexagon_V6_vswap_128B : +Hexagon_v64i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vswap_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsw,VI_ftype_VI,1) -// tag : V6_vabsw -def int_hexagon_V6_vabsw : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vabsw">; +def int_hexagon_V6_vrmpyubv_acc : +Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpyubv_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsw_128B,VI_ftype_VI,1) -// tag : V6_vabsw_128B -def int_hexagon_V6_vabsw_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vabsw_128B">; +def int_hexagon_V6_vrmpyubv_acc_128B : +Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpyubv_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsw_sat,VI_ftype_VI,1) -// tag : V6_vabsw_sat -def int_hexagon_V6_vabsw_sat : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vabsw_sat">; +def int_hexagon_V6_vgtb_and : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtb_and">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsw_sat_128B,VI_ftype_VI,1) -// tag : V6_vabsw_sat_128B -def int_hexagon_V6_vabsw_sat_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vabsw_sat_128B">; +def int_hexagon_V6_vgtb_and_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtb_and_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpybv,VD_ftype_VIVI,2) -// tag : V6_vmpybv -def int_hexagon_V6_vmpybv : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpybv">; +def int_hexagon_V6_vaslw : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vaslw">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpybv_128B,VD_ftype_VIVI,2) -// tag : V6_vmpybv_128B -def int_hexagon_V6_vmpybv_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpybv_128B">; +def int_hexagon_V6_vaslw_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vaslw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpybv_acc,VD_ftype_VDVIVI,3) -// tag : V6_vmpybv_acc -def int_hexagon_V6_vmpybv_acc : -Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpybv_acc">; +def int_hexagon_V6_vpackhb_sat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackhb_sat">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpybv_acc_128B,VD_ftype_VDVIVI,3) -// tag : V6_vmpybv_acc_128B -def int_hexagon_V6_vmpybv_acc_128B : -Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpybv_acc_128B">; +def int_hexagon_V6_vpackhb_sat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackhb_sat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyubv,VD_ftype_VIVI,2) -// tag : V6_vmpyubv -def int_hexagon_V6_vmpyubv : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyubv">; +def int_hexagon_V6_vmpyih_acc : +Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyih_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyubv_128B,VD_ftype_VIVI,2) -// tag : V6_vmpyubv_128B -def int_hexagon_V6_vmpyubv_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyubv_128B">; +def int_hexagon_V6_vmpyih_acc_128B : +Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyih_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyubv_acc,VD_ftype_VDVIVI,3) -// tag : V6_vmpyubv_acc -def int_hexagon_V6_vmpyubv_acc : -Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyubv_acc">; +def int_hexagon_V6_vshuffvdd : +Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vshuffvdd">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyubv_acc_128B,VD_ftype_VDVIVI,3) -// tag : V6_vmpyubv_acc_128B -def int_hexagon_V6_vmpyubv_acc_128B : -Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyubv_acc_128B">; +def int_hexagon_V6_vshuffvdd_128B : +Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vshuffvdd_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpybusv,VD_ftype_VIVI,2) -// tag : V6_vmpybusv -def int_hexagon_V6_vmpybusv : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpybusv">; +def int_hexagon_V6_vaddb_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddb_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpybusv_128B,VD_ftype_VIVI,2) -// tag : V6_vmpybusv_128B -def int_hexagon_V6_vmpybusv_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpybusv_128B">; +def int_hexagon_V6_vaddb_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddb_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpybusv_acc,VD_ftype_VDVIVI,3) -// tag : V6_vmpybusv_acc -def int_hexagon_V6_vmpybusv_acc : -Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpybusv_acc">; +def int_hexagon_V6_vunpackub : +Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vunpackub">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpybusv_acc_128B,VD_ftype_VDVIVI,3) -// tag : V6_vmpybusv_acc_128B -def int_hexagon_V6_vmpybusv_acc_128B : -Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpybusv_acc_128B">; +def int_hexagon_V6_vunpackub_128B : +Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vunpackub_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabusv,VD_ftype_VDVD,2) -// tag : V6_vmpabusv -def int_hexagon_V6_vmpabusv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpabusv">; +def int_hexagon_V6_vgtuw : +Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuw">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabusv_128B,VD_ftype_VDVD,2) -// tag : V6_vmpabusv_128B -def int_hexagon_V6_vmpabusv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vmpabusv_128B">; +def int_hexagon_V6_vgtuw_128B : +Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabuuv,VD_ftype_VDVD,2) -// tag : V6_vmpabuuv -def int_hexagon_V6_vmpabuuv : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpabuuv">; +def int_hexagon_V6_vlutvwh : +Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwh">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabuuv_128B,VD_ftype_VDVD,2) -// tag : V6_vmpabuuv_128B -def int_hexagon_V6_vmpabuuv_128B : -Hexagon_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vmpabuuv_128B">; +def int_hexagon_V6_vlutvwh_128B : +Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhv,VD_ftype_VIVI,2) -// tag : V6_vmpyhv -def int_hexagon_V6_vmpyhv : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyhv">; +def int_hexagon_V6_vgtub : +Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtub">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhv_128B,VD_ftype_VIVI,2) -// tag : V6_vmpyhv_128B -def int_hexagon_V6_vmpyhv_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyhv_128B">; +def int_hexagon_V6_vgtub_128B : +Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtub_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhv_acc,VD_ftype_VDVIVI,3) -// tag : V6_vmpyhv_acc -def int_hexagon_V6_vmpyhv_acc : -Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyhv_acc">; +def int_hexagon_V6_vmpyowh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyowh">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhv_acc_128B,VD_ftype_VDVIVI,3) -// tag : V6_vmpyhv_acc_128B -def int_hexagon_V6_vmpyhv_acc_128B : -Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyhv_acc_128B">; +def int_hexagon_V6_vmpyowh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyowh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuhv,VD_ftype_VIVI,2) -// tag : V6_vmpyuhv -def int_hexagon_V6_vmpyuhv : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyuhv">; +def int_hexagon_V6_vmpyieoh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyieoh">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuhv_128B,VD_ftype_VIVI,2) -// tag : V6_vmpyuhv_128B -def int_hexagon_V6_vmpyuhv_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyuhv_128B">; +def int_hexagon_V6_vmpyieoh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyieoh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuhv_acc,VD_ftype_VDVIVI,3) -// tag : V6_vmpyuhv_acc -def int_hexagon_V6_vmpyuhv_acc : -Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyuhv_acc">; +def int_hexagon_V6_extractw : +Hexagon_i32_v16i32i32_Intrinsic<"HEXAGON_V6_extractw">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuhv_acc_128B,VD_ftype_VDVIVI,3) -// tag : V6_vmpyuhv_acc_128B -def int_hexagon_V6_vmpyuhv_acc_128B : -Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyuhv_acc_128B">; +def int_hexagon_V6_extractw_128B : +Hexagon_i32_v32i32i32_Intrinsic<"HEXAGON_V6_extractw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhvsrs,VI_ftype_VIVI,2) -// tag : V6_vmpyhvsrs -def int_hexagon_V6_vmpyhvsrs : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyhvsrs">; +def int_hexagon_V6_vavgwrnd : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgwrnd">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhvsrs_128B,VI_ftype_VIVI,2) -// tag : V6_vmpyhvsrs_128B -def int_hexagon_V6_vmpyhvsrs_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyhvsrs_128B">; +def int_hexagon_V6_vavgwrnd_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgwrnd_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhus,VD_ftype_VIVI,2) -// tag : V6_vmpyhus -def int_hexagon_V6_vmpyhus : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyhus">; +def int_hexagon_V6_vdmpyhsat_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsat_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhus_128B,VD_ftype_VIVI,2) -// tag : V6_vmpyhus_128B -def int_hexagon_V6_vmpyhus_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyhus_128B">; +def int_hexagon_V6_vdmpyhsat_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsat_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhus_acc,VD_ftype_VDVIVI,3) -// tag : V6_vmpyhus_acc -def int_hexagon_V6_vmpyhus_acc : -Hexagon_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyhus_acc">; +def int_hexagon_V6_vgtub_xor : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtub_xor">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhus_acc_128B,VD_ftype_VDVIVI,3) -// tag : V6_vmpyhus_acc_128B -def int_hexagon_V6_vmpyhus_acc_128B : -Hexagon_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyhus_acc_128B">; +def int_hexagon_V6_vgtub_xor_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtub_xor_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyih,VI_ftype_VIVI,2) -// tag : V6_vmpyih -def int_hexagon_V6_vmpyih : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyih">; +def int_hexagon_V6_vmpyub : +Hexagon_v32i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyub">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyih_128B,VI_ftype_VIVI,2) -// tag : V6_vmpyih_128B -def int_hexagon_V6_vmpyih_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyih_128B">; +def int_hexagon_V6_vmpyub_128B : +Hexagon_v64i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyub_128B">; + +def int_hexagon_V6_vmpyuh : +Hexagon_v32i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyuh">; + +def int_hexagon_V6_vmpyuh_128B : +Hexagon_v64i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyuh_128B">; + +def int_hexagon_V6_vunpackob : +Hexagon_v32i32_v32i32v16i32_Intrinsic<"HEXAGON_V6_vunpackob">; + +def int_hexagon_V6_vunpackob_128B : +Hexagon_v64i32_v64i32v32i32_Intrinsic<"HEXAGON_V6_vunpackob_128B">; + +def int_hexagon_V6_vmpahb : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpahb">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyih_acc,VI_ftype_VIVIVI,3) -// tag : V6_vmpyih_acc -def int_hexagon_V6_vmpyih_acc : -Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vmpyih_acc">; +def int_hexagon_V6_vmpahb_128B : +Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vmpahb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyih_acc_128B,VI_ftype_VIVIVI,3) -// tag : V6_vmpyih_acc_128B -def int_hexagon_V6_vmpyih_acc_128B : -Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyih_acc_128B">; +def int_hexagon_V6_veqw_or : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqw_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyewuh,VI_ftype_VIVI,2) -// tag : V6_vmpyewuh -def int_hexagon_V6_vmpyewuh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyewuh">; +def int_hexagon_V6_veqw_or_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqw_or_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyewuh_128B,VI_ftype_VIVI,2) -// tag : V6_vmpyewuh_128B -def int_hexagon_V6_vmpyewuh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyewuh_128B">; +def int_hexagon_V6_vandqrt : +Hexagon_v16i32_v512i1i32_Intrinsic<"HEXAGON_V6_vandqrt">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyowh,VI_ftype_VIVI,2) -// tag : V6_vmpyowh -def int_hexagon_V6_vmpyowh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyowh">; +def int_hexagon_V6_vandqrt_128B : +Hexagon_v32i32_v1024i1i32_Intrinsic<"HEXAGON_V6_vandqrt_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyowh_128B,VI_ftype_VIVI,2) -// tag : V6_vmpyowh_128B -def int_hexagon_V6_vmpyowh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyowh_128B">; +def int_hexagon_V6_vxor : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vxor">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyowh_rnd,VI_ftype_VIVI,2) -// tag : V6_vmpyowh_rnd -def int_hexagon_V6_vmpyowh_rnd : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyowh_rnd">; +def int_hexagon_V6_vxor_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vxor_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyowh_rnd_128B,VI_ftype_VIVI,2) -// tag : V6_vmpyowh_rnd_128B -def int_hexagon_V6_vmpyowh_rnd_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_128B">; +def int_hexagon_V6_vasrwhrndsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrwhrndsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyowh_sacc,VI_ftype_VIVIVI,3) -// tag : V6_vmpyowh_sacc -def int_hexagon_V6_vmpyowh_sacc : -Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vmpyowh_sacc">; +def int_hexagon_V6_vasrwhrndsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrwhrndsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyowh_sacc_128B,VI_ftype_VIVIVI,3) -// tag : V6_vmpyowh_sacc_128B -def int_hexagon_V6_vmpyowh_sacc_128B : -Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyowh_sacc_128B">; +def int_hexagon_V6_vmpyhsat_acc : +Hexagon_v32i32_v32i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyhsat_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyowh_rnd_sacc,VI_ftype_VIVIVI,3) -// tag : V6_vmpyowh_rnd_sacc -def int_hexagon_V6_vmpyowh_rnd_sacc : -Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_sacc">; +def int_hexagon_V6_vmpyhsat_acc_128B : +Hexagon_v64i32_v64i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyhsat_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyowh_rnd_sacc_128B,VI_ftype_VIVIVI,3) -// tag : V6_vmpyowh_rnd_sacc_128B -def int_hexagon_V6_vmpyowh_rnd_sacc_128B : -Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_sacc_128B">; +def int_hexagon_V6_vrmpybus_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vrmpybus_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyieoh,VI_ftype_VIVI,2) -// tag : V6_vmpyieoh -def int_hexagon_V6_vmpyieoh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyieoh">; +def int_hexagon_V6_vrmpybus_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vrmpybus_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyieoh_128B,VI_ftype_VIVI,2) -// tag : V6_vmpyieoh_128B -def int_hexagon_V6_vmpyieoh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyieoh_128B">; +def int_hexagon_V6_vsubhw : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubhw">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiewuh,VI_ftype_VIVI,2) -// tag : V6_vmpyiewuh -def int_hexagon_V6_vmpyiewuh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyiewuh">; +def int_hexagon_V6_vsubhw_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubhw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiewuh_128B,VI_ftype_VIVI,2) -// tag : V6_vmpyiewuh_128B -def int_hexagon_V6_vmpyiewuh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyiewuh_128B">; +def int_hexagon_V6_vdealb4w : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vdealb4w">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiowh,VI_ftype_VIVI,2) -// tag : V6_vmpyiowh -def int_hexagon_V6_vmpyiowh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmpyiowh">; +def int_hexagon_V6_vdealb4w_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vdealb4w_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiowh_128B,VI_ftype_VIVI,2) -// tag : V6_vmpyiowh_128B -def int_hexagon_V6_vmpyiowh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyiowh_128B">; +def int_hexagon_V6_vmpyowh_sacc : +Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyowh_sacc">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiewh_acc,VI_ftype_VIVIVI,3) -// tag : V6_vmpyiewh_acc -def int_hexagon_V6_vmpyiewh_acc : -Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vmpyiewh_acc">; +def int_hexagon_V6_vmpyowh_sacc_128B : +Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyowh_sacc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiewh_acc_128B,VI_ftype_VIVIVI,3) -// tag : V6_vmpyiewh_acc_128B -def int_hexagon_V6_vmpyiewh_acc_128B : -Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyiewh_acc_128B">; +def int_hexagon_V6_vmpybv : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpybv">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiewuh_acc,VI_ftype_VIVIVI,3) -// tag : V6_vmpyiewuh_acc -def int_hexagon_V6_vmpyiewuh_acc : -Hexagon_v512v512v512v512_Intrinsic<"HEXAGON_V6_vmpyiewuh_acc">; +def int_hexagon_V6_vmpybv_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpybv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiewuh_acc_128B,VI_ftype_VIVIVI,3) -// tag : V6_vmpyiewuh_acc_128B -def int_hexagon_V6_vmpyiewuh_acc_128B : -Hexagon_v1024v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmpyiewuh_acc_128B">; +def int_hexagon_V6_vabsdiffh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vabsdiffh">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyub,VD_ftype_VISI,2) -// tag : V6_vmpyub -def int_hexagon_V6_vmpyub : -Hexagon_v1024v512i_Intrinsic<"HEXAGON_V6_vmpyub">; +def int_hexagon_V6_vabsdiffh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vabsdiffh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyub_128B,VD_ftype_VISI,2) -// tag : V6_vmpyub_128B -def int_hexagon_V6_vmpyub_128B : -Hexagon_v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyub_128B">; +def int_hexagon_V6_vshuffob : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshuffob">; + +def int_hexagon_V6_vshuffob_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshuffob_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyub_acc,VD_ftype_VDVISI,3) -// tag : V6_vmpyub_acc def int_hexagon_V6_vmpyub_acc : -Hexagon_v1024v1024v512i_Intrinsic<"HEXAGON_V6_vmpyub_acc">; +Hexagon_v32i32_v32i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyub_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyub_acc_128B,VD_ftype_VDVISI,3) -// tag : V6_vmpyub_acc_128B def int_hexagon_V6_vmpyub_acc_128B : -Hexagon_v2048v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyub_acc_128B">; - -// -// BUILTIN_INFO(HEXAGON.V6_vmpybus,VD_ftype_VISI,2) -// tag : V6_vmpybus -def int_hexagon_V6_vmpybus : -Hexagon_v1024v512i_Intrinsic<"HEXAGON_V6_vmpybus">; - -// -// BUILTIN_INFO(HEXAGON.V6_vmpybus_128B,VD_ftype_VISI,2) -// tag : V6_vmpybus_128B -def int_hexagon_V6_vmpybus_128B : -Hexagon_v2048v1024i_Intrinsic<"HEXAGON_V6_vmpybus_128B">; +Hexagon_v64i32_v64i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyub_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpybus_acc,VD_ftype_VDVISI,3) -// tag : V6_vmpybus_acc -def int_hexagon_V6_vmpybus_acc : -Hexagon_v1024v1024v512i_Intrinsic<"HEXAGON_V6_vmpybus_acc">; +def int_hexagon_V6_vnormamtw : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vnormamtw">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpybus_acc_128B,VD_ftype_VDVISI,3) -// tag : V6_vmpybus_acc_128B -def int_hexagon_V6_vmpybus_acc_128B : -Hexagon_v2048v2048v1024i_Intrinsic<"HEXAGON_V6_vmpybus_acc_128B">; +def int_hexagon_V6_vnormamtw_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vnormamtw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabus,VD_ftype_VDSI,2) -// tag : V6_vmpabus -def int_hexagon_V6_vmpabus : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpabus">; +def int_hexagon_V6_vunpackuh : +Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vunpackuh">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabus_128B,VD_ftype_VDSI,2) -// tag : V6_vmpabus_128B -def int_hexagon_V6_vmpabus_128B : -Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vmpabus_128B">; +def int_hexagon_V6_vunpackuh_128B : +Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vunpackuh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabus_acc,VD_ftype_VDVDSI,3) -// tag : V6_vmpabus_acc -def int_hexagon_V6_vmpabus_acc : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpabus_acc">; +def int_hexagon_V6_vgtuh_or : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuh_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabus_acc_128B,VD_ftype_VDVDSI,3) -// tag : V6_vmpabus_acc_128B -def int_hexagon_V6_vmpabus_acc_128B : -Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vmpabus_acc_128B">; +def int_hexagon_V6_vgtuh_or_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuh_or_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpahb,VD_ftype_VDSI,2) -// tag : V6_vmpahb -def int_hexagon_V6_vmpahb : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpahb">; +def int_hexagon_V6_vmpyiewuh_acc : +Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyiewuh_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpahb_128B,VD_ftype_VDSI,2) -// tag : V6_vmpahb_128B -def int_hexagon_V6_vmpahb_128B : -Hexagon_v2048v2048i_Intrinsic<"HEXAGON_V6_vmpahb_128B">; +def int_hexagon_V6_vmpyiewuh_acc_128B : +Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyiewuh_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpahb_acc,VD_ftype_VDVDSI,3) -// tag : V6_vmpahb_acc -def int_hexagon_V6_vmpahb_acc : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpahb_acc">; +def int_hexagon_V6_vunpackoh : +Hexagon_v32i32_v32i32v16i32_Intrinsic<"HEXAGON_V6_vunpackoh">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpahb_acc_128B,VD_ftype_VDVDSI,3) -// tag : V6_vmpahb_acc_128B -def int_hexagon_V6_vmpahb_acc_128B : -Hexagon_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vmpahb_acc_128B">; +def int_hexagon_V6_vunpackoh_128B : +Hexagon_v64i32_v64i32v32i32_Intrinsic<"HEXAGON_V6_vunpackoh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyh,VD_ftype_VISI,2) -// tag : V6_vmpyh -def int_hexagon_V6_vmpyh : -Hexagon_v1024v512i_Intrinsic<"HEXAGON_V6_vmpyh">; +def int_hexagon_V6_vdmpyhsat : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyh_128B,VD_ftype_VISI,2) -// tag : V6_vmpyh_128B -def int_hexagon_V6_vmpyh_128B : -Hexagon_v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyh_128B">; +def int_hexagon_V6_vdmpyhsat_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhsat_acc,VD_ftype_VDVISI,3) -// tag : V6_vmpyhsat_acc -def int_hexagon_V6_vmpyhsat_acc : -Hexagon_v1024v1024v512i_Intrinsic<"HEXAGON_V6_vmpyhsat_acc">; +def int_hexagon_V6_vmpyubv : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyubv">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhsat_acc_128B,VD_ftype_VDVISI,3) -// tag : V6_vmpyhsat_acc_128B -def int_hexagon_V6_vmpyhsat_acc_128B : -Hexagon_v2048v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyhsat_acc_128B">; +def int_hexagon_V6_vmpyubv_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyubv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhss,VI_ftype_VISI,2) -// tag : V6_vmpyhss def int_hexagon_V6_vmpyhss : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vmpyhss">; +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyhss">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhss_128B,VI_ftype_VISI,2) -// tag : V6_vmpyhss_128B def int_hexagon_V6_vmpyhss_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyhss_128B">; +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyhss_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhsrs,VI_ftype_VISI,2) -// tag : V6_vmpyhsrs -def int_hexagon_V6_vmpyhsrs : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vmpyhsrs">; +def int_hexagon_V6_hi : +Hexagon_v16i32_v32i32_Intrinsic<"HEXAGON_V6_hi">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyhsrs_128B,VI_ftype_VISI,2) -// tag : V6_vmpyhsrs_128B -def int_hexagon_V6_vmpyhsrs_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyhsrs_128B">; +def int_hexagon_V6_hi_128B : +Hexagon_v32i32_v64i32_Intrinsic<"HEXAGON_V6_hi_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuh,VD_ftype_VISI,2) -// tag : V6_vmpyuh -def int_hexagon_V6_vmpyuh : -Hexagon_v1024v512i_Intrinsic<"HEXAGON_V6_vmpyuh">; +def int_hexagon_V6_vasrwuhsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrwuhsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuh_128B,VD_ftype_VISI,2) -// tag : V6_vmpyuh_128B -def int_hexagon_V6_vmpyuh_128B : -Hexagon_v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyuh_128B">; +def int_hexagon_V6_vasrwuhsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrwuhsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuh_acc,VD_ftype_VDVISI,3) -// tag : V6_vmpyuh_acc -def int_hexagon_V6_vmpyuh_acc : -Hexagon_v1024v1024v512i_Intrinsic<"HEXAGON_V6_vmpyuh_acc">; +def int_hexagon_V6_veqw : +Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqw">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuh_acc_128B,VD_ftype_VDVISI,3) -// tag : V6_vmpyuh_acc_128B -def int_hexagon_V6_vmpyuh_acc_128B : -Hexagon_v2048v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyuh_acc_128B">; +def int_hexagon_V6_veqw_128B : +Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyihb,VI_ftype_VISI,2) -// tag : V6_vmpyihb -def int_hexagon_V6_vmpyihb : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vmpyihb">; +def int_hexagon_V6_vdsaduh : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdsaduh">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyihb_128B,VI_ftype_VISI,2) -// tag : V6_vmpyihb_128B -def int_hexagon_V6_vmpyihb_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyihb_128B">; +def int_hexagon_V6_vdsaduh_128B : +Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vdsaduh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyihb_acc,VI_ftype_VIVISI,3) -// tag : V6_vmpyihb_acc -def int_hexagon_V6_vmpyihb_acc : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vmpyihb_acc">; +def int_hexagon_V6_vsubw : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubw">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyihb_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vmpyihb_acc_128B -def int_hexagon_V6_vmpyihb_acc_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyihb_acc_128B">; +def int_hexagon_V6_vsubw_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwb,VI_ftype_VISI,2) -// tag : V6_vmpyiwb -def int_hexagon_V6_vmpyiwb : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwb">; +def int_hexagon_V6_vsubw_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubw_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwb_128B,VI_ftype_VISI,2) -// tag : V6_vmpyiwb_128B -def int_hexagon_V6_vmpyiwb_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwb_128B">; +def int_hexagon_V6_vsubw_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubw_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwb_acc,VI_ftype_VIVISI,3) -// tag : V6_vmpyiwb_acc -def int_hexagon_V6_vmpyiwb_acc : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwb_acc">; +def int_hexagon_V6_veqb_and : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqb_and">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwb_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vmpyiwb_acc_128B -def int_hexagon_V6_vmpyiwb_acc_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwb_acc_128B">; +def int_hexagon_V6_veqb_and_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqb_and_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwh,VI_ftype_VISI,2) -// tag : V6_vmpyiwh -def int_hexagon_V6_vmpyiwh : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwh">; +def int_hexagon_V6_vmpyih : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyih">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwh_128B,VI_ftype_VISI,2) -// tag : V6_vmpyiwh_128B -def int_hexagon_V6_vmpyiwh_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwh_128B">; +def int_hexagon_V6_vmpyih_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyih_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwh_acc,VI_ftype_VIVISI,3) -// tag : V6_vmpyiwh_acc -def int_hexagon_V6_vmpyiwh_acc : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwh_acc">; +def int_hexagon_V6_vtmpyb_acc : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vtmpyb_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwh_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vmpyiwh_acc_128B -def int_hexagon_V6_vmpyiwh_acc_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwh_acc_128B">; +def int_hexagon_V6_vtmpyb_acc_128B : +Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vtmpyb_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vand,VI_ftype_VIVI,2) -// tag : V6_vand -def int_hexagon_V6_vand : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vand">; +def int_hexagon_V6_vrmpybus : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vrmpybus">; -// -// BUILTIN_INFO(HEXAGON.V6_vand_128B,VI_ftype_VIVI,2) -// tag : V6_vand_128B -def int_hexagon_V6_vand_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vand_128B">; +def int_hexagon_V6_vrmpybus_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vrmpybus_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vor,VI_ftype_VIVI,2) -// tag : V6_vor -def int_hexagon_V6_vor : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vor">; +def int_hexagon_V6_vmpybus_acc : +Hexagon_v32i32_v32i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpybus_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vor_128B,VI_ftype_VIVI,2) -// tag : V6_vor_128B -def int_hexagon_V6_vor_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vor_128B">; +def int_hexagon_V6_vmpybus_acc_128B : +Hexagon_v64i32_v64i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpybus_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vxor,VI_ftype_VIVI,2) -// tag : V6_vxor -def int_hexagon_V6_vxor : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vxor">; +def int_hexagon_V6_vgth_xor : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgth_xor">; -// -// BUILTIN_INFO(HEXAGON.V6_vxor_128B,VI_ftype_VIVI,2) -// tag : V6_vxor_128B -def int_hexagon_V6_vxor_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vxor_128B">; +def int_hexagon_V6_vgth_xor_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgth_xor_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vnot,VI_ftype_VI,1) -// tag : V6_vnot -def int_hexagon_V6_vnot : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vnot">; +def int_hexagon_V6_vsubhsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubhsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vnot_128B,VI_ftype_VI,1) -// tag : V6_vnot_128B -def int_hexagon_V6_vnot_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vnot_128B">; +def int_hexagon_V6_vsubhsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubhsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vandqrt,VI_ftype_QVSI,2) -// tag : V6_vandqrt -def int_hexagon_V6_vandqrt : -Hexagon_v512v64ii_Intrinsic<"HEXAGON_V6_vandqrt">; +def int_hexagon_V6_vrmpyubi_acc : +Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vandqrt_128B,VI_ftype_QVSI,2) -// tag : V6_vandqrt_128B -def int_hexagon_V6_vandqrt_128B : -Hexagon_v1024v128ii_Intrinsic<"HEXAGON_V6_vandqrt_128B">; +def int_hexagon_V6_vrmpyubi_acc_128B : +Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vandqrt_acc,VI_ftype_VIQVSI,3) -// tag : V6_vandqrt_acc -def int_hexagon_V6_vandqrt_acc : -Hexagon_v512v512v64ii_Intrinsic<"HEXAGON_V6_vandqrt_acc">; +def int_hexagon_V6_vabsw : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsw">; -// -// BUILTIN_INFO(HEXAGON.V6_vandqrt_acc_128B,VI_ftype_VIQVSI,3) -// tag : V6_vandqrt_acc_128B -def int_hexagon_V6_vandqrt_acc_128B : -Hexagon_v1024v1024v128ii_Intrinsic<"HEXAGON_V6_vandqrt_acc_128B">; +def int_hexagon_V6_vabsw_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vandvrt,QV_ftype_VISI,2) -// tag : V6_vandvrt -def int_hexagon_V6_vandvrt : -Hexagon_v64iv512i_Intrinsic<"HEXAGON_V6_vandvrt">; +def int_hexagon_V6_vaddwsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddwsat_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vandvrt_128B,QV_ftype_VISI,2) -// tag : V6_vandvrt_128B -def int_hexagon_V6_vandvrt_128B : -Hexagon_v128iv1024i_Intrinsic<"HEXAGON_V6_vandvrt_128B">; +def int_hexagon_V6_vaddwsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddwsat_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vandvrt_acc,QV_ftype_QVVISI,3) -// tag : V6_vandvrt_acc -def int_hexagon_V6_vandvrt_acc : -Hexagon_v64iv64iv512i_Intrinsic<"HEXAGON_V6_vandvrt_acc">; +def int_hexagon_V6_vlsrw : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vlsrw">; -// -// BUILTIN_INFO(HEXAGON.V6_vandvrt_acc_128B,QV_ftype_QVVISI,3) -// tag : V6_vandvrt_acc_128B -def int_hexagon_V6_vandvrt_acc_128B : -Hexagon_v128iv128iv1024i_Intrinsic<"HEXAGON_V6_vandvrt_acc_128B">; +def int_hexagon_V6_vlsrw_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vlsrw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtw,QV_ftype_VIVI,2) -// tag : V6_vgtw -def int_hexagon_V6_vgtw : -Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgtw">; +def int_hexagon_V6_vabsh : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsh">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtw_128B,QV_ftype_VIVI,2) -// tag : V6_vgtw_128B -def int_hexagon_V6_vgtw_128B : -Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtw_128B">; +def int_hexagon_V6_vabsh_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtw_and,QV_ftype_QVVIVI,3) -// tag : V6_vgtw_and -def int_hexagon_V6_vgtw_and : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtw_and">; +def int_hexagon_V6_vlsrh : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vlsrh">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtw_and_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtw_and_128B -def int_hexagon_V6_vgtw_and_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtw_and_128B">; +def int_hexagon_V6_vlsrh_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vlsrh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtw_or,QV_ftype_QVVIVI,3) -// tag : V6_vgtw_or -def int_hexagon_V6_vgtw_or : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtw_or">; +def int_hexagon_V6_valignb : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_valignb">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtw_or_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtw_or_128B -def int_hexagon_V6_vgtw_or_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtw_or_128B">; +def int_hexagon_V6_valignb_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_valignb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtw_xor,QV_ftype_QVVIVI,3) -// tag : V6_vgtw_xor -def int_hexagon_V6_vgtw_xor : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtw_xor">; +def int_hexagon_V6_vsubhq : +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubhq">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtw_xor_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtw_xor_128B -def int_hexagon_V6_vgtw_xor_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtw_xor_128B">; +def int_hexagon_V6_vsubhq_128B : +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubhq_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqw,QV_ftype_VIVI,2) -// tag : V6_veqw -def int_hexagon_V6_veqw : -Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_veqw">; +def int_hexagon_V6_vpackoh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackoh">; -// -// BUILTIN_INFO(HEXAGON.V6_veqw_128B,QV_ftype_VIVI,2) -// tag : V6_veqw_128B -def int_hexagon_V6_veqw_128B : -Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_veqw_128B">; +def int_hexagon_V6_vpackoh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackoh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqw_and,QV_ftype_QVVIVI,3) -// tag : V6_veqw_and -def int_hexagon_V6_veqw_and : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqw_and">; +def int_hexagon_V6_vdmpybus_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_veqw_and_128B,QV_ftype_QVVIVI,3) -// tag : V6_veqw_and_128B -def int_hexagon_V6_veqw_and_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqw_and_128B">; +def int_hexagon_V6_vdmpybus_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqw_or,QV_ftype_QVVIVI,3) -// tag : V6_veqw_or -def int_hexagon_V6_veqw_or : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqw_or">; +def int_hexagon_V6_vdmpyhvsat_acc : +Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vdmpyhvsat_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_veqw_or_128B,QV_ftype_QVVIVI,3) -// tag : V6_veqw_or_128B -def int_hexagon_V6_veqw_or_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqw_or_128B">; +def int_hexagon_V6_vdmpyhvsat_acc_128B : +Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vdmpyhvsat_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqw_xor,QV_ftype_QVVIVI,3) -// tag : V6_veqw_xor -def int_hexagon_V6_veqw_xor : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqw_xor">; +def int_hexagon_V6_vrmpybv_acc : +Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpybv_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_veqw_xor_128B,QV_ftype_QVVIVI,3) -// tag : V6_veqw_xor_128B -def int_hexagon_V6_veqw_xor_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqw_xor_128B">; +def int_hexagon_V6_vrmpybv_acc_128B : +Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpybv_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgth,QV_ftype_VIVI,2) -// tag : V6_vgth -def int_hexagon_V6_vgth : -Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgth">; +def int_hexagon_V6_vaddhsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddhsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vgth_128B,QV_ftype_VIVI,2) -// tag : V6_vgth_128B -def int_hexagon_V6_vgth_128B : -Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgth_128B">; +def int_hexagon_V6_vaddhsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgth_and,QV_ftype_QVVIVI,3) -// tag : V6_vgth_and -def int_hexagon_V6_vgth_and : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgth_and">; +def int_hexagon_V6_vcombine : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vcombine">; -// -// BUILTIN_INFO(HEXAGON.V6_vgth_and_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgth_and_128B -def int_hexagon_V6_vgth_and_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgth_and_128B">; +def int_hexagon_V6_vcombine_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vcombine_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgth_or,QV_ftype_QVVIVI,3) -// tag : V6_vgth_or -def int_hexagon_V6_vgth_or : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgth_or">; +def int_hexagon_V6_vandqrt_acc : +Hexagon_v16i32_v16i32v512i1i32_Intrinsic<"HEXAGON_V6_vandqrt_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vgth_or_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgth_or_128B -def int_hexagon_V6_vgth_or_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgth_or_128B">; +def int_hexagon_V6_vandqrt_acc_128B : +Hexagon_v32i32_v32i32v1024i1i32_Intrinsic<"HEXAGON_V6_vandqrt_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgth_xor,QV_ftype_QVVIVI,3) -// tag : V6_vgth_xor -def int_hexagon_V6_vgth_xor : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgth_xor">; +def int_hexagon_V6_vaslhv : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaslhv">; -// -// BUILTIN_INFO(HEXAGON.V6_vgth_xor_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgth_xor_128B -def int_hexagon_V6_vgth_xor_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgth_xor_128B">; +def int_hexagon_V6_vaslhv_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaslhv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqh,QV_ftype_VIVI,2) -// tag : V6_veqh -def int_hexagon_V6_veqh : -Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_veqh">; +def int_hexagon_V6_vinsertwr : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vinsertwr">; -// -// BUILTIN_INFO(HEXAGON.V6_veqh_128B,QV_ftype_VIVI,2) -// tag : V6_veqh_128B -def int_hexagon_V6_veqh_128B : -Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_veqh_128B">; +def int_hexagon_V6_vinsertwr_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vinsertwr_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqh_and,QV_ftype_QVVIVI,3) -// tag : V6_veqh_and -def int_hexagon_V6_veqh_and : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqh_and">; +def int_hexagon_V6_vsubh_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubh_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_veqh_and_128B,QV_ftype_QVVIVI,3) -// tag : V6_veqh_and_128B -def int_hexagon_V6_veqh_and_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqh_and_128B">; +def int_hexagon_V6_vsubh_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubh_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqh_or,QV_ftype_QVVIVI,3) -// tag : V6_veqh_or -def int_hexagon_V6_veqh_or : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqh_or">; +def int_hexagon_V6_vshuffb : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vshuffb">; -// -// BUILTIN_INFO(HEXAGON.V6_veqh_or_128B,QV_ftype_QVVIVI,3) -// tag : V6_veqh_or_128B -def int_hexagon_V6_veqh_or_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqh_or_128B">; +def int_hexagon_V6_vshuffb_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vshuffb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqh_xor,QV_ftype_QVVIVI,3) -// tag : V6_veqh_xor -def int_hexagon_V6_veqh_xor : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqh_xor">; +def int_hexagon_V6_vand : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vand">; -// -// BUILTIN_INFO(HEXAGON.V6_veqh_xor_128B,QV_ftype_QVVIVI,3) -// tag : V6_veqh_xor_128B -def int_hexagon_V6_veqh_xor_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqh_xor_128B">; +def int_hexagon_V6_vand_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vand_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtb,QV_ftype_VIVI,2) -// tag : V6_vgtb -def int_hexagon_V6_vgtb : -Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgtb">; +def int_hexagon_V6_vmpyhv : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyhv">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtb_128B,QV_ftype_VIVI,2) -// tag : V6_vgtb_128B -def int_hexagon_V6_vgtb_128B : -Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtb_128B">; +def int_hexagon_V6_vmpyhv_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyhv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtb_and,QV_ftype_QVVIVI,3) -// tag : V6_vgtb_and -def int_hexagon_V6_vgtb_and : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtb_and">; +def int_hexagon_V6_vdmpyhsuisat_acc : +Hexagon_v16i32_v16i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtb_and_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtb_and_128B -def int_hexagon_V6_vgtb_and_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtb_and_128B">; +def int_hexagon_V6_vdmpyhsuisat_acc_128B : +Hexagon_v32i32_v32i32v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsuisat_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtb_or,QV_ftype_QVVIVI,3) -// tag : V6_vgtb_or -def int_hexagon_V6_vgtb_or : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtb_or">; +def int_hexagon_V6_vsububsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsububsat_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtb_or_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtb_or_128B -def int_hexagon_V6_vgtb_or_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtb_or_128B">; +def int_hexagon_V6_vsububsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsububsat_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtb_xor,QV_ftype_QVVIVI,3) -// tag : V6_vgtb_xor def int_hexagon_V6_vgtb_xor : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtb_xor">; +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtb_xor">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtb_xor_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtb_xor_128B def int_hexagon_V6_vgtb_xor_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtb_xor_128B">; +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtb_xor_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqb,QV_ftype_VIVI,2) -// tag : V6_veqb -def int_hexagon_V6_veqb : -Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_veqb">; +def int_hexagon_V6_vdsaduh_acc : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdsaduh_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_veqb_128B,QV_ftype_VIVI,2) -// tag : V6_veqb_128B -def int_hexagon_V6_veqb_128B : -Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_veqb_128B">; +def int_hexagon_V6_vdsaduh_acc_128B : +Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vdsaduh_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqb_and,QV_ftype_QVVIVI,3) -// tag : V6_veqb_and -def int_hexagon_V6_veqb_and : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqb_and">; +def int_hexagon_V6_vrmpyub : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vrmpyub">; -// -// BUILTIN_INFO(HEXAGON.V6_veqb_and_128B,QV_ftype_QVVIVI,3) -// tag : V6_veqb_and_128B -def int_hexagon_V6_veqb_and_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqb_and_128B">; +def int_hexagon_V6_vrmpyub_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vrmpyub_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqb_or,QV_ftype_QVVIVI,3) -// tag : V6_veqb_or -def int_hexagon_V6_veqb_or : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqb_or">; +def int_hexagon_V6_vmpyuh_acc : +Hexagon_v32i32_v32i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyuh_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_veqb_or_128B,QV_ftype_QVVIVI,3) -// tag : V6_veqb_or_128B -def int_hexagon_V6_veqb_or_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqb_or_128B">; +def int_hexagon_V6_vmpyuh_acc_128B : +Hexagon_v64i32_v64i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyuh_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_veqb_xor,QV_ftype_QVVIVI,3) -// tag : V6_veqb_xor -def int_hexagon_V6_veqb_xor : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_veqb_xor">; +def int_hexagon_V6_vcl0h : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vcl0h">; -// -// BUILTIN_INFO(HEXAGON.V6_veqb_xor_128B,QV_ftype_QVVIVI,3) -// tag : V6_veqb_xor_128B -def int_hexagon_V6_veqb_xor_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_veqb_xor_128B">; +def int_hexagon_V6_vcl0h_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vcl0h_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuw,QV_ftype_VIVI,2) -// tag : V6_vgtuw -def int_hexagon_V6_vgtuw : -Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgtuw">; +def int_hexagon_V6_vmpyhus_acc : +Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyhus_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuw_128B,QV_ftype_VIVI,2) -// tag : V6_vgtuw_128B -def int_hexagon_V6_vgtuw_128B : -Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuw_128B">; +def int_hexagon_V6_vmpyhus_acc_128B : +Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyhus_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuw_and,QV_ftype_QVVIVI,3) -// tag : V6_vgtuw_and -def int_hexagon_V6_vgtuw_and : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuw_and">; +def int_hexagon_V6_vmpybv_acc : +Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpybv_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuw_and_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtuw_and_128B -def int_hexagon_V6_vgtuw_and_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuw_and_128B">; +def int_hexagon_V6_vmpybv_acc_128B : +Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpybv_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuw_or,QV_ftype_QVVIVI,3) -// tag : V6_vgtuw_or -def int_hexagon_V6_vgtuw_or : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuw_or">; +def int_hexagon_V6_vrsadubi : +Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuw_or_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtuw_or_128B -def int_hexagon_V6_vgtuw_or_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuw_or_128B">; +def int_hexagon_V6_vrsadubi_128B : +Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuw_xor,QV_ftype_QVVIVI,3) -// tag : V6_vgtuw_xor -def int_hexagon_V6_vgtuw_xor : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuw_xor">; +def int_hexagon_V6_vdmpyhb_dv_acc : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuw_xor_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtuw_xor_128B -def int_hexagon_V6_vgtuw_xor_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuw_xor_128B">; +def int_hexagon_V6_vdmpyhb_dv_acc_128B : +Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuh,QV_ftype_VIVI,2) -// tag : V6_vgtuh -def int_hexagon_V6_vgtuh : -Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgtuh">; +def int_hexagon_V6_vshufeh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshufeh">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuh_128B,QV_ftype_VIVI,2) -// tag : V6_vgtuh_128B -def int_hexagon_V6_vgtuh_128B : -Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuh_128B">; +def int_hexagon_V6_vshufeh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshufeh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuh_and,QV_ftype_QVVIVI,3) -// tag : V6_vgtuh_and -def int_hexagon_V6_vgtuh_and : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuh_and">; +def int_hexagon_V6_vmpyewuh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyewuh">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuh_and_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtuh_and_128B -def int_hexagon_V6_vgtuh_and_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuh_and_128B">; +def int_hexagon_V6_vmpyewuh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyewuh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuh_or,QV_ftype_QVVIVI,3) -// tag : V6_vgtuh_or -def int_hexagon_V6_vgtuh_or : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuh_or">; +def int_hexagon_V6_vmpyhsrs : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyhsrs">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuh_or_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtuh_or_128B -def int_hexagon_V6_vgtuh_or_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuh_or_128B">; +def int_hexagon_V6_vmpyhsrs_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyhsrs_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuh_xor,QV_ftype_QVVIVI,3) -// tag : V6_vgtuh_xor -def int_hexagon_V6_vgtuh_xor : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtuh_xor">; +def int_hexagon_V6_vdmpybus_dv_acc : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_dv_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtuh_xor_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtuh_xor_128B -def int_hexagon_V6_vgtuh_xor_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtuh_xor_128B">; +def int_hexagon_V6_vdmpybus_dv_acc_128B : +Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_dv_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtub,QV_ftype_VIVI,2) -// tag : V6_vgtub -def int_hexagon_V6_vgtub : -Hexagon_v64iv512v512_Intrinsic<"HEXAGON_V6_vgtub">; +def int_hexagon_V6_vaddubh : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddubh">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtub_128B,QV_ftype_VIVI,2) -// tag : V6_vgtub_128B -def int_hexagon_V6_vgtub_128B : -Hexagon_v128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtub_128B">; +def int_hexagon_V6_vaddubh_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddubh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtub_and,QV_ftype_QVVIVI,3) -// tag : V6_vgtub_and -def int_hexagon_V6_vgtub_and : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtub_and">; +def int_hexagon_V6_vasrwh : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrwh">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtub_and_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtub_and_128B -def int_hexagon_V6_vgtub_and_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtub_and_128B">; +def int_hexagon_V6_vasrwh_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrwh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtub_or,QV_ftype_QVVIVI,3) -// tag : V6_vgtub_or -def int_hexagon_V6_vgtub_or : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtub_or">; +def int_hexagon_V6_ld0 : +Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_ld0">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtub_or_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtub_or_128B -def int_hexagon_V6_vgtub_or_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtub_or_128B">; +def int_hexagon_V6_ld0_128B : +Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_ld0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtub_xor,QV_ftype_QVVIVI,3) -// tag : V6_vgtub_xor -def int_hexagon_V6_vgtub_xor : -Hexagon_v64iv64iv512v512_Intrinsic<"HEXAGON_V6_vgtub_xor">; +def int_hexagon_V6_vpopcounth : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vpopcounth">; -// -// BUILTIN_INFO(HEXAGON.V6_vgtub_xor_128B,QV_ftype_QVVIVI,3) -// tag : V6_vgtub_xor_128B -def int_hexagon_V6_vgtub_xor_128B : -Hexagon_v128iv128iv1024v1024_Intrinsic<"HEXAGON_V6_vgtub_xor_128B">; +def int_hexagon_V6_vpopcounth_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vpopcounth_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_or,QV_ftype_QVQV,2) -// tag : V6_pred_or -def int_hexagon_V6_pred_or : -Hexagon_v64iv64iv64i_Intrinsic<"HEXAGON_V6_pred_or">; +def int_hexagon_V6_ldnt0 : +Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_ldnt0">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_or_128B,QV_ftype_QVQV,2) -// tag : V6_pred_or_128B -def int_hexagon_V6_pred_or_128B : -Hexagon_v128iv128iv128i_Intrinsic<"HEXAGON_V6_pred_or_128B">; +def int_hexagon_V6_ldnt0_128B : +Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_ldnt0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_and,QV_ftype_QVQV,2) -// tag : V6_pred_and -def int_hexagon_V6_pred_and : -Hexagon_v64iv64iv64i_Intrinsic<"HEXAGON_V6_pred_and">; +def int_hexagon_V6_vgth_and : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgth_and">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_and_128B,QV_ftype_QVQV,2) -// tag : V6_pred_and_128B -def int_hexagon_V6_pred_and_128B : -Hexagon_v128iv128iv128i_Intrinsic<"HEXAGON_V6_pred_and_128B">; +def int_hexagon_V6_vgth_and_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgth_and_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_not,QV_ftype_QV,1) -// tag : V6_pred_not -def int_hexagon_V6_pred_not : -Hexagon_v64iv64i_Intrinsic<"HEXAGON_V6_pred_not">; +def int_hexagon_V6_vaddubsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddubsat_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_not_128B,QV_ftype_QV,1) -// tag : V6_pred_not_128B -def int_hexagon_V6_pred_not_128B : -Hexagon_v128iv128i_Intrinsic<"HEXAGON_V6_pred_not_128B">; +def int_hexagon_V6_vaddubsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddubsat_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_xor,QV_ftype_QVQV,2) -// tag : V6_pred_xor -def int_hexagon_V6_pred_xor : -Hexagon_v64iv64iv64i_Intrinsic<"HEXAGON_V6_pred_xor">; +def int_hexagon_V6_vpackeh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackeh">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_xor_128B,QV_ftype_QVQV,2) -// tag : V6_pred_xor_128B -def int_hexagon_V6_pred_xor_128B : -Hexagon_v128iv128iv128i_Intrinsic<"HEXAGON_V6_pred_xor_128B">; +def int_hexagon_V6_vpackeh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackeh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_and_n,QV_ftype_QVQV,2) -// tag : V6_pred_and_n -def int_hexagon_V6_pred_and_n : -Hexagon_v64iv64iv64i_Intrinsic<"HEXAGON_V6_pred_and_n">; +def int_hexagon_V6_vmpyh : +Hexagon_v32i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyh">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_and_n_128B,QV_ftype_QVQV,2) -// tag : V6_pred_and_n_128B -def int_hexagon_V6_pred_and_n_128B : -Hexagon_v128iv128iv128i_Intrinsic<"HEXAGON_V6_pred_and_n_128B">; +def int_hexagon_V6_vmpyh_128B : +Hexagon_v64i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_or_n,QV_ftype_QVQV,2) -// tag : V6_pred_or_n -def int_hexagon_V6_pred_or_n : -Hexagon_v64iv64iv64i_Intrinsic<"HEXAGON_V6_pred_or_n">; +def int_hexagon_V6_vminh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vminh">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_or_n_128B,QV_ftype_QVQV,2) -// tag : V6_pred_or_n_128B -def int_hexagon_V6_pred_or_n_128B : -Hexagon_v128iv128iv128i_Intrinsic<"HEXAGON_V6_pred_or_n_128B">; +def int_hexagon_V6_vminh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vminh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_scalar2,QV_ftype_SI,1) -// tag : V6_pred_scalar2 def int_hexagon_V6_pred_scalar2 : -Hexagon_v64ii_Intrinsic<"HEXAGON_V6_pred_scalar2">; +Hexagon_v512i1_i32_Intrinsic<"HEXAGON_V6_pred_scalar2">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_scalar2_128B,QV_ftype_SI,1) -// tag : V6_pred_scalar2_128B def int_hexagon_V6_pred_scalar2_128B : -Hexagon_v128ii_Intrinsic<"HEXAGON_V6_pred_scalar2_128B">; - -// -// BUILTIN_INFO(HEXAGON.V6_vmux,VI_ftype_QVVIVI,3) -// tag : V6_vmux -def int_hexagon_V6_vmux : -Hexagon_v512v64iv512v512_Intrinsic<"HEXAGON_V6_vmux">; +Hexagon_v1024i1_i32_Intrinsic<"HEXAGON_V6_pred_scalar2_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmux_128B,VI_ftype_QVVIVI,3) -// tag : V6_vmux_128B -def int_hexagon_V6_vmux_128B : -Hexagon_v1024v128iv1024v1024_Intrinsic<"HEXAGON_V6_vmux_128B">; +def int_hexagon_V6_vdealh : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vdealh">; -// -// BUILTIN_INFO(HEXAGON.V6_vswap,VD_ftype_QVVIVI,3) -// tag : V6_vswap -def int_hexagon_V6_vswap : -Hexagon_v1024v64iv512v512_Intrinsic<"HEXAGON_V6_vswap">; +def int_hexagon_V6_vdealh_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vdealh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vswap_128B,VD_ftype_QVVIVI,3) -// tag : V6_vswap_128B -def int_hexagon_V6_vswap_128B : -Hexagon_v2048v128iv1024v1024_Intrinsic<"HEXAGON_V6_vswap_128B">; +def int_hexagon_V6_vpackwh_sat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vpackwh_sat">; -// -// BUILTIN_INFO(HEXAGON.V6_vmaxub,VI_ftype_VIVI,2) -// tag : V6_vmaxub -def int_hexagon_V6_vmaxub : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmaxub">; +def int_hexagon_V6_vpackwh_sat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vpackwh_sat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmaxub_128B,VI_ftype_VIVI,2) -// tag : V6_vmaxub_128B -def int_hexagon_V6_vmaxub_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmaxub_128B">; +def int_hexagon_V6_vaslh : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vaslh">; -// -// BUILTIN_INFO(HEXAGON.V6_vminub,VI_ftype_VIVI,2) -// tag : V6_vminub -def int_hexagon_V6_vminub : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vminub">; +def int_hexagon_V6_vaslh_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vaslh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vminub_128B,VI_ftype_VIVI,2) -// tag : V6_vminub_128B -def int_hexagon_V6_vminub_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vminub_128B">; +def int_hexagon_V6_vgtuw_and : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuw_and">; -// -// BUILTIN_INFO(HEXAGON.V6_vmaxuh,VI_ftype_VIVI,2) -// tag : V6_vmaxuh -def int_hexagon_V6_vmaxuh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmaxuh">; +def int_hexagon_V6_vgtuw_and_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuw_and_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmaxuh_128B,VI_ftype_VIVI,2) -// tag : V6_vmaxuh_128B -def int_hexagon_V6_vmaxuh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmaxuh_128B">; +def int_hexagon_V6_vor : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vor">; -// -// BUILTIN_INFO(HEXAGON.V6_vminuh,VI_ftype_VIVI,2) -// tag : V6_vminuh -def int_hexagon_V6_vminuh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vminuh">; +def int_hexagon_V6_vor_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vor_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vminuh_128B,VI_ftype_VIVI,2) -// tag : V6_vminuh_128B -def int_hexagon_V6_vminuh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vminuh_128B">; +def int_hexagon_V6_vlutvvb : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvb">; -// -// BUILTIN_INFO(HEXAGON.V6_vmaxh,VI_ftype_VIVI,2) -// tag : V6_vmaxh -def int_hexagon_V6_vmaxh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmaxh">; +def int_hexagon_V6_vlutvvb_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmaxh_128B,VI_ftype_VIVI,2) -// tag : V6_vmaxh_128B -def int_hexagon_V6_vmaxh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmaxh_128B">; +def int_hexagon_V6_vmpyiowh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyiowh">; -// -// BUILTIN_INFO(HEXAGON.V6_vminh,VI_ftype_VIVI,2) -// tag : V6_vminh -def int_hexagon_V6_vminh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vminh">; +def int_hexagon_V6_vmpyiowh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyiowh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vminh_128B,VI_ftype_VIVI,2) -// tag : V6_vminh_128B -def int_hexagon_V6_vminh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vminh_128B">; +def int_hexagon_V6_vlutvvb_oracc : +Hexagon_v16i32_v16i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracc">; -// -// BUILTIN_INFO(HEXAGON.V6_vmaxw,VI_ftype_VIVI,2) -// tag : V6_vmaxw -def int_hexagon_V6_vmaxw : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vmaxw">; +def int_hexagon_V6_vlutvvb_oracc_128B : +Hexagon_v32i32_v32i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmaxw_128B,VI_ftype_VIVI,2) -// tag : V6_vmaxw_128B -def int_hexagon_V6_vmaxw_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmaxw_128B">; +def int_hexagon_V6_vandvrt : +Hexagon_v512i1_v16i32i32_Intrinsic<"HEXAGON_V6_vandvrt">; -// -// BUILTIN_INFO(HEXAGON.V6_vminw,VI_ftype_VIVI,2) -// tag : V6_vminw -def int_hexagon_V6_vminw : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vminw">; +def int_hexagon_V6_vandvrt_128B : +Hexagon_v1024i1_v32i32i32_Intrinsic<"HEXAGON_V6_vandvrt_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vminw_128B,VI_ftype_VIVI,2) -// tag : V6_vminw_128B -def int_hexagon_V6_vminw_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vminw_128B">; +def int_hexagon_V6_veqh_xor : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqh_xor">; -// -// BUILTIN_INFO(HEXAGON.V6_vsathub,VI_ftype_VIVI,2) -// tag : V6_vsathub -def int_hexagon_V6_vsathub : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsathub">; +def int_hexagon_V6_veqh_xor_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqh_xor_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsathub_128B,VI_ftype_VIVI,2) -// tag : V6_vsathub_128B -def int_hexagon_V6_vsathub_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsathub_128B">; +def int_hexagon_V6_vadduhw : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vadduhw">; -// -// BUILTIN_INFO(HEXAGON.V6_vsatwh,VI_ftype_VIVI,2) -// tag : V6_vsatwh -def int_hexagon_V6_vsatwh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vsatwh">; +def int_hexagon_V6_vadduhw_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduhw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsatwh_128B,VI_ftype_VIVI,2) -// tag : V6_vsatwh_128B -def int_hexagon_V6_vsatwh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsatwh_128B">; +def int_hexagon_V6_vcl0w : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vcl0w">; -// -// BUILTIN_INFO(HEXAGON.V6_vshuffeb,VI_ftype_VIVI,2) -// tag : V6_vshuffeb -def int_hexagon_V6_vshuffeb : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vshuffeb">; +def int_hexagon_V6_vcl0w_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vcl0w_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vshuffeb_128B,VI_ftype_VIVI,2) -// tag : V6_vshuffeb_128B -def int_hexagon_V6_vshuffeb_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vshuffeb_128B">; +def int_hexagon_V6_vmpyihb : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyihb">; -// -// BUILTIN_INFO(HEXAGON.V6_vshuffob,VI_ftype_VIVI,2) -// tag : V6_vshuffob -def int_hexagon_V6_vshuffob : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vshuffob">; +def int_hexagon_V6_vmpyihb_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyihb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vshuffob_128B,VI_ftype_VIVI,2) -// tag : V6_vshuffob_128B -def int_hexagon_V6_vshuffob_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vshuffob_128B">; +def int_hexagon_V6_vtmpybus : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vtmpybus">; -// -// BUILTIN_INFO(HEXAGON.V6_vshufeh,VI_ftype_VIVI,2) -// tag : V6_vshufeh -def int_hexagon_V6_vshufeh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vshufeh">; +def int_hexagon_V6_vtmpybus_128B : +Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vtmpybus_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vshufeh_128B,VI_ftype_VIVI,2) -// tag : V6_vshufeh_128B -def int_hexagon_V6_vshufeh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vshufeh_128B">; +def int_hexagon_V6_vd0 : +Hexagon_v16i32__Intrinsic<"HEXAGON_V6_vd0">; -// -// BUILTIN_INFO(HEXAGON.V6_vshufoh,VI_ftype_VIVI,2) -// tag : V6_vshufoh -def int_hexagon_V6_vshufoh : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vshufoh">; +def int_hexagon_V6_vd0_128B : +Hexagon_v32i32__Intrinsic<"HEXAGON_V6_vd0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vshufoh_128B,VI_ftype_VIVI,2) -// tag : V6_vshufoh_128B -def int_hexagon_V6_vshufoh_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vshufoh_128B">; +def int_hexagon_V6_veqh_or : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqh_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vshuffvdd,VD_ftype_VIVISI,3) -// tag : V6_vshuffvdd -def int_hexagon_V6_vshuffvdd : -Hexagon_v1024v512v512i_Intrinsic<"HEXAGON_V6_vshuffvdd">; +def int_hexagon_V6_veqh_or_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqh_or_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vshuffvdd_128B,VD_ftype_VIVISI,3) -// tag : V6_vshuffvdd_128B -def int_hexagon_V6_vshuffvdd_128B : -Hexagon_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vshuffvdd_128B">; +def int_hexagon_V6_vgtw_or : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtw_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vdealvdd,VD_ftype_VIVISI,3) -// tag : V6_vdealvdd -def int_hexagon_V6_vdealvdd : -Hexagon_v1024v512v512i_Intrinsic<"HEXAGON_V6_vdealvdd">; +def int_hexagon_V6_vgtw_or_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtw_or_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vdealvdd_128B,VD_ftype_VIVISI,3) -// tag : V6_vdealvdd_128B -def int_hexagon_V6_vdealvdd_128B : -Hexagon_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vdealvdd_128B">; +def int_hexagon_V6_vdmpybus : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vdmpybus">; -// -// BUILTIN_INFO(HEXAGON.V6_vshufoeh,VD_ftype_VIVI,2) -// tag : V6_vshufoeh -def int_hexagon_V6_vshufoeh : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vshufoeh">; +def int_hexagon_V6_vdmpybus_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpybus_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vshufoeh_128B,VD_ftype_VIVI,2) -// tag : V6_vshufoeh_128B -def int_hexagon_V6_vshufoeh_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vshufoeh_128B">; +def int_hexagon_V6_vgtub_or : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtub_or">; -// -// BUILTIN_INFO(HEXAGON.V6_vshufoeb,VD_ftype_VIVI,2) -// tag : V6_vshufoeb -def int_hexagon_V6_vshufoeb : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vshufoeb">; +def int_hexagon_V6_vgtub_or_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtub_or_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vshufoeb_128B,VD_ftype_VIVI,2) -// tag : V6_vshufoeb_128B -def int_hexagon_V6_vshufoeb_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vshufoeb_128B">; +def int_hexagon_V6_vmpybus : +Hexagon_v32i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpybus">; -// -// BUILTIN_INFO(HEXAGON.V6_vdealh,VI_ftype_VI,1) -// tag : V6_vdealh -def int_hexagon_V6_vdealh : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vdealh">; +def int_hexagon_V6_vmpybus_128B : +Hexagon_v64i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpybus_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vdealh_128B,VI_ftype_VI,1) -// tag : V6_vdealh_128B -def int_hexagon_V6_vdealh_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vdealh_128B">; +def int_hexagon_V6_vdmpyhb_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vdealb,VI_ftype_VI,1) -// tag : V6_vdealb -def int_hexagon_V6_vdealb : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vdealb">; +def int_hexagon_V6_vdmpyhb_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vdealb_128B,VI_ftype_VI,1) -// tag : V6_vdealb_128B -def int_hexagon_V6_vdealb_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vdealb_128B">; +def int_hexagon_V6_vandvrt_acc : +Hexagon_v512i1_v512i1v16i32i32_Intrinsic<"HEXAGON_V6_vandvrt_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vdealb4w,VI_ftype_VIVI,2) -// tag : V6_vdealb4w -def int_hexagon_V6_vdealb4w : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vdealb4w">; +def int_hexagon_V6_vandvrt_acc_128B : +Hexagon_v1024i1_v1024i1v32i32i32_Intrinsic<"HEXAGON_V6_vandvrt_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vdealb4w_128B,VI_ftype_VIVI,2) -// tag : V6_vdealb4w_128B -def int_hexagon_V6_vdealb4w_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vdealb4w_128B">; +def int_hexagon_V6_vassign : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vassign">; -// -// BUILTIN_INFO(HEXAGON.V6_vshuffh,VI_ftype_VI,1) -// tag : V6_vshuffh -def int_hexagon_V6_vshuffh : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vshuffh">; +def int_hexagon_V6_vassign_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vassign_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vshuffh_128B,VI_ftype_VI,1) -// tag : V6_vshuffh_128B -def int_hexagon_V6_vshuffh_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vshuffh_128B">; +def int_hexagon_V6_vaddwnq : +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddwnq">; -// -// BUILTIN_INFO(HEXAGON.V6_vshuffb,VI_ftype_VI,1) -// tag : V6_vshuffb -def int_hexagon_V6_vshuffb : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vshuffb">; +def int_hexagon_V6_vaddwnq_128B : +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddwnq_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vshuffb_128B,VI_ftype_VI,1) -// tag : V6_vshuffb_128B -def int_hexagon_V6_vshuffb_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vshuffb_128B">; +def int_hexagon_V6_vgtub_and : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtub_and">; -// -// BUILTIN_INFO(HEXAGON.V6_extractw,SI_ftype_VISI,2) -// tag : V6_extractw -def int_hexagon_V6_extractw : -Hexagon_iv512i_Intrinsic<"HEXAGON_V6_extractw">; +def int_hexagon_V6_vgtub_and_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtub_and_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_extractw_128B,SI_ftype_VISI,2) -// tag : V6_extractw_128B -def int_hexagon_V6_extractw_128B : -Hexagon_iv1024i_Intrinsic<"HEXAGON_V6_extractw_128B">; +def int_hexagon_V6_vdmpyhb_dv : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vinsertwr,VI_ftype_VISI,2) -// tag : V6_vinsertwr -def int_hexagon_V6_vinsertwr : -Hexagon_v512v512i_Intrinsic<"HEXAGON_V6_vinsertwr">; +def int_hexagon_V6_vdmpyhb_dv_128B : +Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhb_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vinsertwr_128B,VI_ftype_VISI,2) -// tag : V6_vinsertwr_128B -def int_hexagon_V6_vinsertwr_128B : -Hexagon_v1024v1024i_Intrinsic<"HEXAGON_V6_vinsertwr_128B">; +def int_hexagon_V6_vunpackb : +Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vunpackb">; -// -// BUILTIN_INFO(HEXAGON.V6_lvsplatw,VI_ftype_SI,1) -// tag : V6_lvsplatw -def int_hexagon_V6_lvsplatw : -Hexagon_v512i_Intrinsic<"HEXAGON_V6_lvsplatw">; +def int_hexagon_V6_vunpackb_128B : +Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vunpackb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_lvsplatw_128B,VI_ftype_SI,1) -// tag : V6_lvsplatw_128B -def int_hexagon_V6_lvsplatw_128B : -Hexagon_v1024i_Intrinsic<"HEXAGON_V6_lvsplatw_128B">; +def int_hexagon_V6_vunpackh : +Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vunpackh">; -// -// BUILTIN_INFO(HEXAGON.V6_vassign,VI_ftype_VI,1) -// tag : V6_vassign -def int_hexagon_V6_vassign : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vassign">; +def int_hexagon_V6_vunpackh_128B : +Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vunpackh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vassign_128B,VI_ftype_VI,1) -// tag : V6_vassign_128B -def int_hexagon_V6_vassign_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vassign_128B">; +def int_hexagon_V6_vmpahb_acc : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpahb_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vcombine,VD_ftype_VIVI,2) -// tag : V6_vcombine -def int_hexagon_V6_vcombine : -Hexagon_v1024v512v512_Intrinsic<"HEXAGON_V6_vcombine">; +def int_hexagon_V6_vmpahb_acc_128B : +Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vmpahb_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vcombine_128B,VD_ftype_VIVI,2) -// tag : V6_vcombine_128B -def int_hexagon_V6_vcombine_128B : -Hexagon_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vcombine_128B">; +def int_hexagon_V6_vaddbnq : +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddbnq">; -// -// BUILTIN_INFO(HEXAGON.V6_vdelta,VI_ftype_VIVI,2) -// tag : V6_vdelta -def int_hexagon_V6_vdelta : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vdelta">; +def int_hexagon_V6_vaddbnq_128B : +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddbnq_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vdelta_128B,VI_ftype_VIVI,2) -// tag : V6_vdelta_128B -def int_hexagon_V6_vdelta_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vdelta_128B">; +def int_hexagon_V6_vlalignbi : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlalignbi">; -// -// BUILTIN_INFO(HEXAGON.V6_vrdelta,VI_ftype_VIVI,2) -// tag : V6_vrdelta -def int_hexagon_V6_vrdelta : -Hexagon_v512v512v512_Intrinsic<"HEXAGON_V6_vrdelta">; +def int_hexagon_V6_vlalignbi_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlalignbi_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrdelta_128B,VI_ftype_VIVI,2) -// tag : V6_vrdelta_128B -def int_hexagon_V6_vrdelta_128B : -Hexagon_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrdelta_128B">; +def int_hexagon_V6_vsatwh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsatwh">; -// -// BUILTIN_INFO(HEXAGON.V6_vcl0w,VI_ftype_VI,1) -// tag : V6_vcl0w -def int_hexagon_V6_vcl0w : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vcl0w">; +def int_hexagon_V6_vsatwh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsatwh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vcl0w_128B,VI_ftype_VI,1) -// tag : V6_vcl0w_128B -def int_hexagon_V6_vcl0w_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vcl0w_128B">; +def int_hexagon_V6_vgtuh : +Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuh">; -// -// BUILTIN_INFO(HEXAGON.V6_vcl0h,VI_ftype_VI,1) -// tag : V6_vcl0h -def int_hexagon_V6_vcl0h : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vcl0h">; +def int_hexagon_V6_vgtuh_128B : +Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vcl0h_128B,VI_ftype_VI,1) -// tag : V6_vcl0h_128B -def int_hexagon_V6_vcl0h_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vcl0h_128B">; +def int_hexagon_V6_vmpyihb_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyihb_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vnormamtw,VI_ftype_VI,1) -// tag : V6_vnormamtw -def int_hexagon_V6_vnormamtw : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vnormamtw">; +def int_hexagon_V6_vmpyihb_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyihb_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vnormamtw_128B,VI_ftype_VI,1) -// tag : V6_vnormamtw_128B -def int_hexagon_V6_vnormamtw_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vnormamtw_128B">; +def int_hexagon_V6_vrmpybusv_acc : +Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpybusv_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vnormamth,VI_ftype_VI,1) -// tag : V6_vnormamth -def int_hexagon_V6_vnormamth : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vnormamth">; +def int_hexagon_V6_vrmpybusv_acc_128B : +Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpybusv_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vnormamth_128B,VI_ftype_VI,1) -// tag : V6_vnormamth_128B -def int_hexagon_V6_vnormamth_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vnormamth_128B">; +def int_hexagon_V6_vrdelta : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrdelta">; -// -// BUILTIN_INFO(HEXAGON.V6_vpopcounth,VI_ftype_VI,1) -// tag : V6_vpopcounth -def int_hexagon_V6_vpopcounth : -Hexagon_v512v512_Intrinsic<"HEXAGON_V6_vpopcounth">; +def int_hexagon_V6_vrdelta_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrdelta_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vpopcounth_128B,VI_ftype_VI,1) -// tag : V6_vpopcounth_128B -def int_hexagon_V6_vpopcounth_128B : -Hexagon_v1024v1024_Intrinsic<"HEXAGON_V6_vpopcounth_128B">; +def int_hexagon_V6_vroundwh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vroundwh">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvvb,VI_ftype_VIVISI,3) -// tag : V6_vlutvvb -def int_hexagon_V6_vlutvvb : -Hexagon_v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvb">; +def int_hexagon_V6_vroundwh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vroundwh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvvb_128B,VI_ftype_VIVISI,3) -// tag : V6_vlutvvb_128B -def int_hexagon_V6_vlutvvb_128B : -Hexagon_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvb_128B">; +def int_hexagon_V6_vaddw_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddw_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvvb_oracc,VI_ftype_VIVIVISI,4) -// tag : V6_vlutvvb_oracc -def int_hexagon_V6_vlutvvb_oracc : -Hexagon_v512v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvb_oracc">; +def int_hexagon_V6_vaddw_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddw_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvvb_oracc_128B,VI_ftype_VIVIVISI,4) -// tag : V6_vlutvvb_oracc_128B -def int_hexagon_V6_vlutvvb_oracc_128B : -Hexagon_v1024v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvb_oracc_128B">; +def int_hexagon_V6_vmpyiwb_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwb_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvwh,VD_ftype_VIVISI,3) -// tag : V6_vlutvwh -def int_hexagon_V6_vlutvwh : -Hexagon_v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwh">; +def int_hexagon_V6_vmpyiwb_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwb_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvwh_128B,VD_ftype_VIVISI,3) -// tag : V6_vlutvwh_128B -def int_hexagon_V6_vlutvwh_128B : -Hexagon_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwh_128B">; +def int_hexagon_V6_vsubbq : +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubbq">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvwh_oracc,VD_ftype_VDVIVISI,4) -// tag : V6_vlutvwh_oracc -def int_hexagon_V6_vlutvwh_oracc : -Hexagon_v1024v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwh_oracc">; +def int_hexagon_V6_vsubbq_128B : +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubbq_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvwh_oracc_128B,VD_ftype_VDVIVISI,4) -// tag : V6_vlutvwh_oracc_128B -def int_hexagon_V6_vlutvwh_oracc_128B : -Hexagon_v2048v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwh_oracc_128B">; +def int_hexagon_V6_veqh_and : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqh_and">; -// -// Masked vector stores -// -def int_hexagon_V6_vS32b_qpred_ai : -Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_qpred_ai">; +def int_hexagon_V6_veqh_and_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqh_and_128B">; -def int_hexagon_V6_vS32b_nqpred_ai : -Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nqpred_ai">; +def int_hexagon_V6_valignbi : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_valignbi">; -def int_hexagon_V6_vS32b_nt_qpred_ai : -Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nt_qpred_ai">; +def int_hexagon_V6_valignbi_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_valignbi_128B">; -def int_hexagon_V6_vS32b_nt_nqpred_ai : -Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vS32b_nt_nqpred_ai">; +def int_hexagon_V6_vaddwsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddwsat">; -def int_hexagon_V6_vS32b_qpred_ai_128B : -Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_qpred_ai_128B">; +def int_hexagon_V6_vaddwsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddwsat_128B">; -def int_hexagon_V6_vS32b_nqpred_ai_128B : -Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nqpred_ai_128B">; +def int_hexagon_V6_veqw_and : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqw_and">; -def int_hexagon_V6_vS32b_nt_qpred_ai_128B : -Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nt_qpred_ai_128B">; +def int_hexagon_V6_veqw_and_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqw_and_128B">; -def int_hexagon_V6_vS32b_nt_nqpred_ai_128B : -Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vS32b_nt_nqpred_ai_128B">; +def int_hexagon_V6_vabsdiffub : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vabsdiffub">; -def int_hexagon_V6_vmaskedstoreq : -Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstoreq">; +def int_hexagon_V6_vabsdiffub_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vabsdiffub_128B">; -def int_hexagon_V6_vmaskedstorenq : -Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorenq">; +def int_hexagon_V6_vshuffeb : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vshuffeb">; -def int_hexagon_V6_vmaskedstorentq : -Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorentq">; +def int_hexagon_V6_vshuffeb_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vshuffeb_128B">; -def int_hexagon_V6_vmaskedstorentnq : -Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorentnq">; +def int_hexagon_V6_vabsdiffuh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vabsdiffuh">; -def int_hexagon_V6_vmaskedstoreq_128B : -Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstoreq_128B">; +def int_hexagon_V6_vabsdiffuh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vabsdiffuh_128B">; -def int_hexagon_V6_vmaskedstorenq_128B : -Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorenq_128B">; +def int_hexagon_V6_veqw_xor : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqw_xor">; -def int_hexagon_V6_vmaskedstorentq_128B : -Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentq_128B">; +def int_hexagon_V6_veqw_xor_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqw_xor_128B">; -def int_hexagon_V6_vmaskedstorentnq_128B : -Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentnq_128B">; +def int_hexagon_V6_vgth : +Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgth">; -multiclass Hexagon_custom_circ_ld_Intrinsic { - def NAME#_pci : Hexagon_NonGCC_Intrinsic< - [ElTy, llvm_ptr_ty], - [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], - [IntrArgMemOnly, NoCapture<3>]>; - def NAME#_pcr : Hexagon_NonGCC_Intrinsic< - [ElTy, llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_ptr_ty], - [IntrArgMemOnly, NoCapture<2>]>; -} +def int_hexagon_V6_vgth_128B : +Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgth_128B">; -defm int_hexagon_L2_loadrub : Hexagon_custom_circ_ld_Intrinsic; -defm int_hexagon_L2_loadrb : Hexagon_custom_circ_ld_Intrinsic; -defm int_hexagon_L2_loadruh : Hexagon_custom_circ_ld_Intrinsic; -defm int_hexagon_L2_loadrh : Hexagon_custom_circ_ld_Intrinsic; -defm int_hexagon_L2_loadri : Hexagon_custom_circ_ld_Intrinsic; -defm int_hexagon_L2_loadrd : Hexagon_custom_circ_ld_Intrinsic; +def int_hexagon_V6_vgtuw_xor : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuw_xor">; -multiclass Hexagon_custom_circ_st_Intrinsic { - def NAME#_pci : Hexagon_NonGCC_Intrinsic< - [llvm_ptr_ty], - [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ElTy, llvm_ptr_ty], - [IntrArgMemOnly, NoCapture<4>]>; - def NAME#_pcr : Hexagon_NonGCC_Intrinsic< - [llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, ElTy, llvm_ptr_ty], - [IntrArgMemOnly, NoCapture<3>]>; -} +def int_hexagon_V6_vgtuw_xor_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuw_xor_128B">; -defm int_hexagon_S2_storerb : Hexagon_custom_circ_st_Intrinsic; -defm int_hexagon_S2_storerh : Hexagon_custom_circ_st_Intrinsic; -defm int_hexagon_S2_storerf : Hexagon_custom_circ_st_Intrinsic; -defm int_hexagon_S2_storeri : Hexagon_custom_circ_st_Intrinsic; -defm int_hexagon_S2_storerd : Hexagon_custom_circ_st_Intrinsic; +def int_hexagon_V6_vgtb : +Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtb">; -// The front-end emits the intrinsic call with only two arguments. The third -// argument from the builtin is already used by front-end to write to memory -// by generating a store. -class Hexagon_custom_brev_ld_Intrinsic - : Hexagon_NonGCC_Intrinsic< - [ElTy, llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty], - [IntrReadMem]>; +def int_hexagon_V6_vgtb_128B : +Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtb_128B">; -def int_hexagon_L2_loadrub_pbr : Hexagon_custom_brev_ld_Intrinsic; -def int_hexagon_L2_loadrb_pbr : Hexagon_custom_brev_ld_Intrinsic; -def int_hexagon_L2_loadruh_pbr : Hexagon_custom_brev_ld_Intrinsic; -def int_hexagon_L2_loadrh_pbr : Hexagon_custom_brev_ld_Intrinsic; -def int_hexagon_L2_loadri_pbr : Hexagon_custom_brev_ld_Intrinsic; -def int_hexagon_L2_loadrd_pbr : Hexagon_custom_brev_ld_Intrinsic; +def int_hexagon_V6_vgtw : +Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtw">; -def int_hexagon_S2_storerb_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_stb">; -def int_hexagon_S2_storerh_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_sth">; -def int_hexagon_S2_storerf_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_sthhi">; -def int_hexagon_S2_storeri_pbr : Hexagon_mem_memsisi_Intrinsic<"brev_stw">; -def int_hexagon_S2_storerd_pbr : Hexagon_mem_memdisi_Intrinsic<"brev_std">; +def int_hexagon_V6_vgtw_128B : +Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtw_128B">; +def int_hexagon_V6_vsubwq : +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubwq">; -/// -/// HexagonV62 intrinsics -/// +def int_hexagon_V6_vsubwq_128B : +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubwq_128B">; -// -// Hexagon_LLiLLiLLi_Intrinsic -// tag : M6_vabsdiffb -class Hexagon_LLiLLiLLi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vnot : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vnot">; -// -// Hexagon_LLii_Intrinsic -// tag : S6_vsplatrbp -class Hexagon_LLii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vnot_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vnot_128B">; -// -// Hexagon_V62_v512v512i_Intrinsic -// tag : V6_vlsrb -class Hexagon_V62_v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vgtb_or : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtb_or">; -// -// Hexagon_V62_v1024v1024i_Intrinsic -// tag : V6_vlsrb_128B -class Hexagon_V62_v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vgtb_or_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtb_or_128B">; -// -// Hexagon_V62_v512v512v512i_Intrinsic -// tag : V6_vasrwuhrndsat -class Hexagon_V62_v512v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vgtuw_or : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtuw_or">; -// -// Hexagon_V62_v1024v1024v1024i_Intrinsic -// tag : V6_vasrwuhrndsat_128B -class Hexagon_V62_v1024v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vgtuw_or_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtuw_or_128B">; -// -// Hexagon_V62_v512v512v512_Intrinsic -// tag : V6_vrounduwuh -class Hexagon_V62_v512v512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vaddubsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddubsat">; -// -// Hexagon_V62_v1024v1024v1024_Intrinsic -// tag : V6_vrounduwuh_128B -class Hexagon_V62_v1024v1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vaddubsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddubsat_128B">; -// -// Hexagon_V62_v2048v2048v2048_Intrinsic -// tag : V6_vadduwsat_dv_128B -class Hexagon_V62_v2048v2048v2048_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vmaxw : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmaxw">; -// -// Hexagon_V62_v1024v1024v512v512_Intrinsic -// tag : V6_vaddhw_acc -class Hexagon_V62_v1024v1024v512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vmaxw_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmaxw_128B">; -// -// Hexagon_V62_v2048v2048v1024v1024_Intrinsic -// tag : V6_vaddhw_acc_128B -class Hexagon_V62_v2048v2048v1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vaslwv : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaslwv">; -// -// Hexagon_V62_v1024v512v512_Intrinsic -// tag : V6_vmpyewuh_64 -class Hexagon_V62_v1024v512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vaslwv_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaslwv_128B">; -// -// Hexagon_V62_v2048v1024v1024_Intrinsic -// tag : V6_vmpyewuh_64_128B -class Hexagon_V62_v2048v1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vabsw_sat : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsw_sat">; -// -// Hexagon_V62_v2048v2048i_Intrinsic -// tag : V6_vmpauhb_128B -class Hexagon_V62_v2048v2048i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vabsw_sat_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsw_sat_128B">; -// -// Hexagon_V62_v2048v2048v2048i_Intrinsic -// tag : V6_vmpauhb_acc_128B -class Hexagon_V62_v2048v2048v2048i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vsubwsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubwsat_dv">; -// -// Hexagon_V62_v512v64ii_Intrinsic -// tag : V6_vandnqrt -class Hexagon_V62_v512v64ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vsubwsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubwsat_dv_128B">; -// -// Hexagon_V62_v1024v128ii_Intrinsic -// tag : V6_vandnqrt_128B -class Hexagon_V62_v1024v128ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vroundhub : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vroundhub">; -// -// Hexagon_V62_v512v512v64ii_Intrinsic -// tag : V6_vandnqrt_acc -class Hexagon_V62_v512v512v64ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vroundhub_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vroundhub_128B">; -// -// Hexagon_V62_v1024v1024v128ii_Intrinsic -// tag : V6_vandnqrt_acc_128B -class Hexagon_V62_v1024v1024v128ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vdmpyhisat_acc : +Hexagon_v16i32_v16i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhisat_acc">; -// -// Hexagon_V62_v512v64iv512_Intrinsic -// tag : V6_vandvqv -class Hexagon_V62_v512v64iv512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vdmpyhisat_acc_128B : +Hexagon_v32i32_v32i32v64i32i32_Intrinsic<"HEXAGON_V6_vdmpyhisat_acc_128B">; -// -// Hexagon_V62_v1024v128iv1024_Intrinsic -// tag : V6_vandvqv_128B -class Hexagon_V62_v1024v128iv1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vmpabus : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpabus">; -// -// Hexagon_V62_v64ii_Intrinsic -// tag : V6_pred_scalar2v2 -class Hexagon_V62_v64ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vmpabus_128B : +Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vmpabus_128B">; -// -// Hexagon_V62_v128ii_Intrinsic -// tag : V6_pred_scalar2v2_128B -class Hexagon_V62_v128ii_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vassignp : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vassignp">; -// -// Hexagon_V62_v64iv64iv64i_Intrinsic -// tag : V6_shuffeqw -class Hexagon_V62_v64iv64iv64i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vassignp_128B : +Hexagon_v64i32_v64i32_Intrinsic<"HEXAGON_V6_vassignp_128B">; -// -// Hexagon_V62_v128iv128iv128i_Intrinsic -// tag : V6_shuffeqw_128B -class Hexagon_V62_v128iv128iv128i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_veqb : +Hexagon_v512i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqb">; -// -// Hexagon_V62_v512i_Intrinsic -// tag : V6_lvsplath -class Hexagon_V62_v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_veqb_128B : +Hexagon_v1024i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqb_128B">; -// -// Hexagon_V62_v1024i_Intrinsic -// tag : V6_lvsplath_128B -class Hexagon_V62_v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vsububh : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsububh">; -// -// Hexagon_V62_v512v512v512v512i_Intrinsic -// tag : V6_vlutvvb_oracci -class Hexagon_V62_v512v512v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vsububh_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsububh_128B">; -// -// Hexagon_V62_v1024v1024v1024v1024i_Intrinsic -// tag : V6_vlutvvb_oracci_128B -class Hexagon_V62_v1024v1024v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_lvsplatw : +Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_lvsplatw">; -// -// Hexagon_V62_v1024v512v512i_Intrinsic -// tag : V6_vlutvwhi -class Hexagon_V62_v1024v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_lvsplatw_128B : +Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_lvsplatw_128B">; -// -// Hexagon_V62_v2048v1024v1024i_Intrinsic -// tag : V6_vlutvwhi_128B -class Hexagon_V62_v2048v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vaddhnq : +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddhnq">; -// -// Hexagon_V62_v1024v1024v512v512i_Intrinsic -// tag : V6_vlutvwh_oracci -class Hexagon_V62_v1024v1024v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vaddhnq_128B : +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhnq_128B">; -// -// Hexagon_V62_v2048v2048v1024v1024i_Intrinsic -// tag : V6_vlutvwh_oracci_128B -class Hexagon_V62_v2048v2048v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vdmpyhsusat : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsusat">; -// Hexagon_v512v64iv512v512v64i_Intrinsic -// tag: V6_vaddcarry -class Hexagon_v512v64iv512v512v64i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vdmpyhsusat_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsusat_128B">; -// Hexagon_v1024v128iv1024v1024v128i_Intrinsic -// tag: V6_vaddcarry_128B -class Hexagon_v1024v128iv1024v1024v128i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_pred_not : +Hexagon_v512i1_v512i1_Intrinsic<"HEXAGON_V6_pred_not">; +def int_hexagon_V6_pred_not_128B : +Hexagon_v1024i1_v1024i1_Intrinsic<"HEXAGON_V6_pred_not_128B">; -// -// BUILTIN_INFO(HEXAGON.M6_vabsdiffb,DI_ftype_DIDI,2) -// tag : M6_vabsdiffb -def int_hexagon_M6_vabsdiffb : -Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_M6_vabsdiffb">; +def int_hexagon_V6_vlutvwh_oracc : +Hexagon_v32i32_v32i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracc">; -// -// BUILTIN_INFO(HEXAGON.M6_vabsdiffub,DI_ftype_DIDI,2) -// tag : M6_vabsdiffub -def int_hexagon_M6_vabsdiffub : -Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_M6_vabsdiffub">; +def int_hexagon_V6_vlutvwh_oracc_128B : +Hexagon_v64i32_v64i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracc_128B">; -// -// BUILTIN_INFO(HEXAGON.S6_vtrunehb_ppp,DI_ftype_DIDI,2) -// tag : S6_vtrunehb_ppp -def int_hexagon_S6_vtrunehb_ppp : -Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_S6_vtrunehb_ppp">; +def int_hexagon_V6_vmpyiewh_acc : +Hexagon_v16i32_v16i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyiewh_acc">; -// -// BUILTIN_INFO(HEXAGON.S6_vtrunohb_ppp,DI_ftype_DIDI,2) -// tag : S6_vtrunohb_ppp -def int_hexagon_S6_vtrunohb_ppp : -Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_S6_vtrunohb_ppp">; +def int_hexagon_V6_vmpyiewh_acc_128B : +Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyiewh_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.S6_vsplatrbp,DI_ftype_SI,1) -// tag : S6_vsplatrbp -def int_hexagon_S6_vsplatrbp : -Hexagon_LLii_Intrinsic<"HEXAGON_S6_vsplatrbp">; +def int_hexagon_V6_vdealvdd : +Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vdealvdd">; -// -// BUILTIN_INFO(HEXAGON.V6_vlsrb,VI_ftype_VISI,2) -// tag : V6_vlsrb -def int_hexagon_V6_vlsrb : -Hexagon_V62_v512v512i_Intrinsic<"HEXAGON_V6_vlsrb">; +def int_hexagon_V6_vdealvdd_128B : +Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdealvdd_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlsrb_128B,VI_ftype_VISI,2) -// tag : V6_vlsrb_128B -def int_hexagon_V6_vlsrb_128B : -Hexagon_V62_v1024v1024i_Intrinsic<"HEXAGON_V6_vlsrb_128B">; +def int_hexagon_V6_vavgw : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgw">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwuhrndsat,VI_ftype_VIVISI,3) -// tag : V6_vasrwuhrndsat -def int_hexagon_V6_vasrwuhrndsat : -Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrwuhrndsat">; +def int_hexagon_V6_vavgw_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrwuhrndsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasrwuhrndsat_128B -def int_hexagon_V6_vasrwuhrndsat_128B : -Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrwuhrndsat_128B">; +def int_hexagon_V6_vdmpyhsusat_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsusat_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vasruwuhrndsat,VI_ftype_VIVISI,3) -// tag : V6_vasruwuhrndsat -def int_hexagon_V6_vasruwuhrndsat : -Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vasruwuhrndsat">; +def int_hexagon_V6_vdmpyhsusat_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vdmpyhsusat_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasruwuhrndsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasruwuhrndsat_128B -def int_hexagon_V6_vasruwuhrndsat_128B : -Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasruwuhrndsat_128B">; +def int_hexagon_V6_vgtw_xor : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vgtw_xor">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrhbsat,VI_ftype_VIVISI,3) -// tag : V6_vasrhbsat -def int_hexagon_V6_vasrhbsat : -Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrhbsat">; +def int_hexagon_V6_vgtw_xor_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vgtw_xor_128B">; + +def int_hexagon_V6_vtmpyhb_acc : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vtmpyhb_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrhbsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasrhbsat_128B -def int_hexagon_V6_vasrhbsat_128B : -Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrhbsat_128B">; +def int_hexagon_V6_vtmpyhb_acc_128B : +Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vtmpyhb_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrounduwuh,VI_ftype_VIVI,2) -// tag : V6_vrounduwuh -def int_hexagon_V6_vrounduwuh : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vrounduwuh">; +def int_hexagon_V6_vaddhw : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddhw">; -// -// BUILTIN_INFO(HEXAGON.V6_vrounduwuh_128B,VI_ftype_VIVI,2) -// tag : V6_vrounduwuh_128B -def int_hexagon_V6_vrounduwuh_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrounduwuh_128B">; +def int_hexagon_V6_vaddhw_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrounduhub,VI_ftype_VIVI,2) -// tag : V6_vrounduhub -def int_hexagon_V6_vrounduhub : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vrounduhub">; +def int_hexagon_V6_vaddhq : +Hexagon_v16i32_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddhq">; -// -// BUILTIN_INFO(HEXAGON.V6_vrounduhub_128B,VI_ftype_VIVI,2) -// tag : V6_vrounduhub_128B -def int_hexagon_V6_vrounduhub_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrounduhub_128B">; +def int_hexagon_V6_vaddhq_128B : +Hexagon_v32i32_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhq_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduwsat,VI_ftype_VIVI,2) -// tag : V6_vadduwsat -def int_hexagon_V6_vadduwsat : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vadduwsat">; +def int_hexagon_V6_vrmpyubv : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpyubv">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduwsat_128B,VI_ftype_VIVI,2) -// tag : V6_vadduwsat_128B -def int_hexagon_V6_vadduwsat_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vadduwsat_128B">; +def int_hexagon_V6_vrmpyubv_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpyubv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduwsat_dv,VD_ftype_VDVD,2) -// tag : V6_vadduwsat_dv -def int_hexagon_V6_vadduwsat_dv : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vadduwsat_dv">; +def int_hexagon_V6_vsubh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubh">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduwsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vadduwsat_dv_128B -def int_hexagon_V6_vadduwsat_dv_128B : -Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vadduwsat_dv_128B">; +def int_hexagon_V6_vsubh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubuwsat,VI_ftype_VIVI,2) -// tag : V6_vsubuwsat -def int_hexagon_V6_vsubuwsat : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsubuwsat">; +def int_hexagon_V6_vrmpyubi : +Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubuwsat_128B,VI_ftype_VIVI,2) -// tag : V6_vsubuwsat_128B -def int_hexagon_V6_vsubuwsat_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubuwsat_128B">; +def int_hexagon_V6_vrmpyubi_128B : +Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubuwsat_dv,VD_ftype_VDVD,2) -// tag : V6_vsubuwsat_dv -def int_hexagon_V6_vsubuwsat_dv : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubuwsat_dv">; +def int_hexagon_V6_vminw : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vminw">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubuwsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vsubuwsat_dv_128B -def int_hexagon_V6_vsubuwsat_dv_128B : -Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubuwsat_dv_128B">; +def int_hexagon_V6_vminw_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vminw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddbsat,VI_ftype_VIVI,2) -// tag : V6_vaddbsat -def int_hexagon_V6_vaddbsat : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddbsat">; +def int_hexagon_V6_vmpyubv_acc : +Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyubv_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddbsat_128B,VI_ftype_VIVI,2) -// tag : V6_vaddbsat_128B -def int_hexagon_V6_vaddbsat_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddbsat_128B">; +def int_hexagon_V6_vmpyubv_acc_128B : +Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyubv_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddbsat_dv,VD_ftype_VDVD,2) -// tag : V6_vaddbsat_dv -def int_hexagon_V6_vaddbsat_dv : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddbsat_dv">; +def int_hexagon_V6_pred_xor : +Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_pred_xor">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddbsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vaddbsat_dv_128B -def int_hexagon_V6_vaddbsat_dv_128B : -Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddbsat_dv_128B">; +def int_hexagon_V6_pred_xor_128B : +Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_pred_xor_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubbsat,VI_ftype_VIVI,2) -// tag : V6_vsubbsat -def int_hexagon_V6_vsubbsat : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsubbsat">; +def int_hexagon_V6_veqb_xor : +Hexagon_v512i1_v512i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqb_xor">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubbsat_128B,VI_ftype_VIVI,2) -// tag : V6_vsubbsat_128B -def int_hexagon_V6_vsubbsat_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubbsat_128B">; +def int_hexagon_V6_veqb_xor_128B : +Hexagon_v1024i1_v1024i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqb_xor_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubbsat_dv,VD_ftype_VDVD,2) -// tag : V6_vsubbsat_dv -def int_hexagon_V6_vsubbsat_dv : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubbsat_dv">; +def int_hexagon_V6_vmpyiewuh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyiewuh">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubbsat_dv_128B,VD_ftype_VDVD,2) -// tag : V6_vsubbsat_dv_128B -def int_hexagon_V6_vsubbsat_dv_128B : -Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubbsat_dv_128B">; +def int_hexagon_V6_vmpyiewuh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyiewuh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddububb_sat,VI_ftype_VIVI,2) -// tag : V6_vaddububb_sat -def int_hexagon_V6_vaddububb_sat : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddububb_sat">; +def int_hexagon_V6_vmpybusv_acc : +Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpybusv_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddububb_sat_128B,VI_ftype_VIVI,2) -// tag : V6_vaddububb_sat_128B -def int_hexagon_V6_vaddububb_sat_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddububb_sat_128B">; +def int_hexagon_V6_vmpybusv_acc_128B : +Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpybusv_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubububb_sat,VI_ftype_VIVI,2) -// tag : V6_vsubububb_sat -def int_hexagon_V6_vsubububb_sat : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsubububb_sat">; +def int_hexagon_V6_vavguhrnd : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavguhrnd">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubububb_sat_128B,VI_ftype_VIVI,2) -// tag : V6_vsubububb_sat_128B -def int_hexagon_V6_vsubububb_sat_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubububb_sat_128B">; +def int_hexagon_V6_vavguhrnd_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavguhrnd_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhw_acc,VD_ftype_VDVIVI,3) -// tag : V6_vaddhw_acc -def int_hexagon_V6_vaddhw_acc : -Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vaddhw_acc">; +def int_hexagon_V6_vmpyowh_rnd : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyowh_rnd">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddhw_acc_128B,VD_ftype_VDVIVI,3) -// tag : V6_vaddhw_acc_128B -def int_hexagon_V6_vaddhw_acc_128B : -Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vaddhw_acc_128B">; +def int_hexagon_V6_vmpyowh_rnd_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyowh_rnd_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduhw_acc,VD_ftype_VDVIVI,3) -// tag : V6_vadduhw_acc -def int_hexagon_V6_vadduhw_acc : -Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vadduhw_acc">; +def int_hexagon_V6_vsubwsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubwsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vadduhw_acc_128B,VD_ftype_VDVIVI,3) -// tag : V6_vadduhw_acc_128B -def int_hexagon_V6_vadduhw_acc_128B : -Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vadduhw_acc_128B">; +def int_hexagon_V6_vsubwsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubwsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddubh_acc,VD_ftype_VDVIVI,3) -// tag : V6_vaddubh_acc -def int_hexagon_V6_vaddubh_acc : -Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vaddubh_acc">; +def int_hexagon_V6_vsubuhw : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubuhw">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddubh_acc_128B,VD_ftype_VDVIVI,3) -// tag : V6_vaddubh_acc_128B -def int_hexagon_V6_vaddubh_acc_128B : -Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vaddubh_acc_128B">; +def int_hexagon_V6_vsubuhw_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubuhw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyewuh_64,VD_ftype_VIVI,2) -// tag : V6_vmpyewuh_64 -def int_hexagon_V6_vmpyewuh_64 : -Hexagon_V62_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyewuh_64">; +def int_hexagon_V6_vrmpybusi_acc : +Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyewuh_64_128B,VD_ftype_VIVI,2) -// tag : V6_vmpyewuh_64_128B -def int_hexagon_V6_vmpyewuh_64_128B : -Hexagon_V62_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyewuh_64_128B">; +def int_hexagon_V6_vrmpybusi_acc_128B : +Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyowh_64_acc,VD_ftype_VDVIVI,3) -// tag : V6_vmpyowh_64_acc -def int_hexagon_V6_vmpyowh_64_acc : -Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyowh_64_acc">; +def int_hexagon_V6_vasrw : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vasrw">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyowh_64_acc_128B,VD_ftype_VDVIVI,3) -// tag : V6_vmpyowh_64_acc_128B -def int_hexagon_V6_vmpyowh_64_acc_128B : -Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyowh_64_acc_128B">; +def int_hexagon_V6_vasrw_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vasrw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpauhb,VD_ftype_VDSI,2) -// tag : V6_vmpauhb -def int_hexagon_V6_vmpauhb : -Hexagon_V62_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpauhb">; +def int_hexagon_V6_vasrh : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vasrh">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpauhb_128B,VD_ftype_VDSI,2) -// tag : V6_vmpauhb_128B -def int_hexagon_V6_vmpauhb_128B : -Hexagon_V62_v2048v2048i_Intrinsic<"HEXAGON_V6_vmpauhb_128B">; +def int_hexagon_V6_vasrh_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vasrh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpauhb_acc,VD_ftype_VDVDSI,3) -// tag : V6_vmpauhb_acc -def int_hexagon_V6_vmpauhb_acc : -Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpauhb_acc">; +def int_hexagon_V6_vmpyuhv : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyuhv">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpauhb_acc_128B,VD_ftype_VDVDSI,3) -// tag : V6_vmpauhb_acc_128B -def int_hexagon_V6_vmpauhb_acc_128B : -Hexagon_V62_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vmpauhb_acc_128B">; +def int_hexagon_V6_vmpyuhv_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyuhv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwub,VI_ftype_VISI,2) -// tag : V6_vmpyiwub -def int_hexagon_V6_vmpyiwub : -Hexagon_V62_v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwub">; +def int_hexagon_V6_vasrhbrndsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrhbrndsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwub_128B,VI_ftype_VISI,2) -// tag : V6_vmpyiwub_128B -def int_hexagon_V6_vmpyiwub_128B : -Hexagon_V62_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwub_128B">; +def int_hexagon_V6_vasrhbrndsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrhbrndsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwub_acc,VI_ftype_VIVISI,3) -// tag : V6_vmpyiwub_acc -def int_hexagon_V6_vmpyiwub_acc : -Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwub_acc">; +def int_hexagon_V6_vsubuhsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubuhsat_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyiwub_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vmpyiwub_acc_128B -def int_hexagon_V6_vmpyiwub_acc_128B : -Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwub_acc_128B">; +def int_hexagon_V6_vsubuhsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubuhsat_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vandnqrt,VI_ftype_QVSI,2) -// tag : V6_vandnqrt -def int_hexagon_V6_vandnqrt : -Hexagon_V62_v512v64ii_Intrinsic<"HEXAGON_V6_vandnqrt">; +def int_hexagon_V6_vabsdiffw : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vabsdiffw">; -// -// BUILTIN_INFO(HEXAGON.V6_vandnqrt_128B,VI_ftype_QVSI,2) -// tag : V6_vandnqrt_128B -def int_hexagon_V6_vandnqrt_128B : -Hexagon_V62_v1024v128ii_Intrinsic<"HEXAGON_V6_vandnqrt_128B">; +def int_hexagon_V6_vabsdiffw_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vabsdiffw_128B">; + +// V62 HVX Instructions. -// -// BUILTIN_INFO(HEXAGON.V6_vandnqrt_acc,VI_ftype_VIQVSI,3) -// tag : V6_vandnqrt_acc def int_hexagon_V6_vandnqrt_acc : -Hexagon_V62_v512v512v64ii_Intrinsic<"HEXAGON_V6_vandnqrt_acc">; +Hexagon_v16i32_v16i32v512i1i32_Intrinsic<"HEXAGON_V6_vandnqrt_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vandnqrt_acc_128B,VI_ftype_VIQVSI,3) -// tag : V6_vandnqrt_acc_128B def int_hexagon_V6_vandnqrt_acc_128B : -Hexagon_V62_v1024v1024v128ii_Intrinsic<"HEXAGON_V6_vandnqrt_acc_128B">; +Hexagon_v32i32_v32i32v1024i1i32_Intrinsic<"HEXAGON_V6_vandnqrt_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vandvqv,VI_ftype_QVVI,2) -// tag : V6_vandvqv -def int_hexagon_V6_vandvqv : -Hexagon_V62_v512v64iv512_Intrinsic<"HEXAGON_V6_vandvqv">; +def int_hexagon_V6_vaddclbh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddclbh">; -// -// BUILTIN_INFO(HEXAGON.V6_vandvqv_128B,VI_ftype_QVVI,2) -// tag : V6_vandvqv_128B -def int_hexagon_V6_vandvqv_128B : -Hexagon_V62_v1024v128iv1024_Intrinsic<"HEXAGON_V6_vandvqv_128B">; +def int_hexagon_V6_vaddclbh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddclbh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vandvnqv,VI_ftype_QVVI,2) -// tag : V6_vandvnqv -def int_hexagon_V6_vandvnqv : -Hexagon_V62_v512v64iv512_Intrinsic<"HEXAGON_V6_vandvnqv">; +def int_hexagon_V6_vmpyowh_64_acc : +Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyowh_64_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vandvnqv_128B,VI_ftype_QVVI,2) -// tag : V6_vandvnqv_128B -def int_hexagon_V6_vandvnqv_128B : -Hexagon_V62_v1024v128iv1024_Intrinsic<"HEXAGON_V6_vandvnqv_128B">; +def int_hexagon_V6_vmpyowh_64_acc_128B : +Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyowh_64_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_scalar2v2,QV_ftype_SI,1) -// tag : V6_pred_scalar2v2 -def int_hexagon_V6_pred_scalar2v2 : -Hexagon_V62_v64ii_Intrinsic<"HEXAGON_V6_pred_scalar2v2">; +def int_hexagon_V6_vmpyewuh_64 : +Hexagon_v32i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmpyewuh_64">; -// -// BUILTIN_INFO(HEXAGON.V6_pred_scalar2v2_128B,QV_ftype_SI,1) -// tag : V6_pred_scalar2v2_128B -def int_hexagon_V6_pred_scalar2v2_128B : -Hexagon_V62_v128ii_Intrinsic<"HEXAGON_V6_pred_scalar2v2_128B">; +def int_hexagon_V6_vmpyewuh_64_128B : +Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmpyewuh_64_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_shuffeqw,QV_ftype_QVQV,2) -// tag : V6_shuffeqw -def int_hexagon_V6_shuffeqw : -Hexagon_V62_v64iv64iv64i_Intrinsic<"HEXAGON_V6_shuffeqw">; +def int_hexagon_V6_vsatuwuh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsatuwuh">; -// -// BUILTIN_INFO(HEXAGON.V6_shuffeqw_128B,QV_ftype_QVQV,2) -// tag : V6_shuffeqw_128B -def int_hexagon_V6_shuffeqw_128B : -Hexagon_V62_v128iv128iv128i_Intrinsic<"HEXAGON_V6_shuffeqw_128B">; +def int_hexagon_V6_vsatuwuh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsatuwuh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_shuffeqh,QV_ftype_QVQV,2) -// tag : V6_shuffeqh def int_hexagon_V6_shuffeqh : -Hexagon_V62_v64iv64iv64i_Intrinsic<"HEXAGON_V6_shuffeqh">; +Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_shuffeqh">; -// -// BUILTIN_INFO(HEXAGON.V6_shuffeqh_128B,QV_ftype_QVQV,2) -// tag : V6_shuffeqh_128B def int_hexagon_V6_shuffeqh_128B : -Hexagon_V62_v128iv128iv128i_Intrinsic<"HEXAGON_V6_shuffeqh_128B">; +Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_shuffeqh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmaxb,VI_ftype_VIVI,2) -// tag : V6_vmaxb -def int_hexagon_V6_vmaxb : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vmaxb">; +def int_hexagon_V6_shuffeqw : +Hexagon_v512i1_v512i1v512i1_Intrinsic<"HEXAGON_V6_shuffeqw">; -// -// BUILTIN_INFO(HEXAGON.V6_vmaxb_128B,VI_ftype_VIVI,2) -// tag : V6_vmaxb_128B -def int_hexagon_V6_vmaxb_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmaxb_128B">; +def int_hexagon_V6_shuffeqw_128B : +Hexagon_v1024i1_v1024i1v1024i1_Intrinsic<"HEXAGON_V6_shuffeqw_128B">; + +def int_hexagon_V6_ldcnpnt0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldcnpnt0">; + +def int_hexagon_V6_ldcnpnt0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldcnpnt0_128B">; + +def int_hexagon_V6_vsubcarry : +Hexagon_custom_v16i32v512i1_v16i32v16i32v512i1_Intrinsic; + +def int_hexagon_V6_vsubcarry_128B : +Hexagon_custom_v32i32v1024i1_v32i32v32i32v1024i1_Intrinsic_128B; + +def int_hexagon_V6_vasrhbsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrhbsat">; + +def int_hexagon_V6_vasrhbsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrhbsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vminb,VI_ftype_VIVI,2) -// tag : V6_vminb def int_hexagon_V6_vminb : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vminb">; +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vminb">; -// -// BUILTIN_INFO(HEXAGON.V6_vminb_128B,VI_ftype_VIVI,2) -// tag : V6_vminb_128B def int_hexagon_V6_vminb_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vminb_128B">; +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vminb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsatuwuh,VI_ftype_VIVI,2) -// tag : V6_vsatuwuh -def int_hexagon_V6_vsatuwuh : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsatuwuh">; +def int_hexagon_V6_vmpauhb_acc : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpauhb_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vsatuwuh_128B,VI_ftype_VIVI,2) -// tag : V6_vsatuwuh_128B -def int_hexagon_V6_vsatuwuh_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsatuwuh_128B">; +def int_hexagon_V6_vmpauhb_acc_128B : +Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vmpauhb_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_lvsplath,VI_ftype_SI,1) -// tag : V6_lvsplath -def int_hexagon_V6_lvsplath : -Hexagon_V62_v512i_Intrinsic<"HEXAGON_V6_lvsplath">; +def int_hexagon_V6_vaddhw_acc : +Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddhw_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_lvsplath_128B,VI_ftype_SI,1) -// tag : V6_lvsplath_128B -def int_hexagon_V6_lvsplath_128B : -Hexagon_V62_v1024i_Intrinsic<"HEXAGON_V6_lvsplath_128B">; +def int_hexagon_V6_vaddhw_acc_128B : +Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddhw_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_lvsplatb,VI_ftype_SI,1) -// tag : V6_lvsplatb -def int_hexagon_V6_lvsplatb : -Hexagon_V62_v512i_Intrinsic<"HEXAGON_V6_lvsplatb">; +def int_hexagon_V6_vlsrb : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vlsrb">; -// -// BUILTIN_INFO(HEXAGON.V6_lvsplatb_128B,VI_ftype_SI,1) -// tag : V6_lvsplatb_128B -def int_hexagon_V6_lvsplatb_128B : -Hexagon_V62_v1024i_Intrinsic<"HEXAGON_V6_lvsplatb_128B">; +def int_hexagon_V6_vlsrb_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vlsrb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddclbw,VI_ftype_VIVI,2) -// tag : V6_vaddclbw -def int_hexagon_V6_vaddclbw : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddclbw">; +def int_hexagon_V6_vlutvwhi : +Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwhi">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddclbw_128B,VI_ftype_VIVI,2) -// tag : V6_vaddclbw_128B -def int_hexagon_V6_vaddclbw_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddclbw_128B">; +def int_hexagon_V6_vlutvwhi_128B : +Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwhi_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddclbh,VI_ftype_VIVI,2) -// tag : V6_vaddclbh -def int_hexagon_V6_vaddclbh : -Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddclbh">; +def int_hexagon_V6_vaddububb_sat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddububb_sat">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddclbh_128B,VI_ftype_VIVI,2) -// tag : V6_vaddclbh_128B -def int_hexagon_V6_vaddclbh_128B : -Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddclbh_128B">; +def int_hexagon_V6_vaddububb_sat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddububb_sat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvvbi,VI_ftype_VIVISI,3) -// tag : V6_vlutvvbi -def int_hexagon_V6_vlutvvbi : -Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvbi">; +def int_hexagon_V6_vsubbsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubbsat_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvvbi_128B,VI_ftype_VIVISI,3) -// tag : V6_vlutvvbi_128B -def int_hexagon_V6_vlutvvbi_128B : -Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvbi_128B">; +def int_hexagon_V6_vsubbsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubbsat_dv_128B">; + +def int_hexagon_V6_ldtp0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldtp0">; + +def int_hexagon_V6_ldtp0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldtp0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvvb_oracci,VI_ftype_VIVIVISI,4) -// tag : V6_vlutvvb_oracci def int_hexagon_V6_vlutvvb_oracci : -Hexagon_V62_v512v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvb_oracci">; +Hexagon_v16i32_v16i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracci">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvvb_oracci_128B,VI_ftype_VIVIVISI,4) -// tag : V6_vlutvvb_oracci_128B def int_hexagon_V6_vlutvvb_oracci_128B : -Hexagon_V62_v1024v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvb_oracci_128B">; +Hexagon_v32i32_v32i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracci_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvwhi,VD_ftype_VIVISI,3) -// tag : V6_vlutvwhi -def int_hexagon_V6_vlutvwhi : -Hexagon_V62_v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwhi">; +def int_hexagon_V6_vsubuwsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubuwsat_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvwhi_128B,VD_ftype_VIVISI,3) -// tag : V6_vlutvwhi_128B -def int_hexagon_V6_vlutvwhi_128B : -Hexagon_V62_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwhi_128B">; +def int_hexagon_V6_vsubuwsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vsubuwsat_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvwh_oracci,VD_ftype_VDVIVISI,4) -// tag : V6_vlutvwh_oracci -def int_hexagon_V6_vlutvwh_oracci : -Hexagon_V62_v1024v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwh_oracci">; +def int_hexagon_V6_ldpnt0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldpnt0">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvwh_oracci_128B,VD_ftype_VDVIVISI,4) -// tag : V6_vlutvwh_oracci_128B -def int_hexagon_V6_vlutvwh_oracci_128B : -Hexagon_V62_v2048v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwh_oracci_128B">; +def int_hexagon_V6_ldpnt0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldpnt0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvvb_nm,VI_ftype_VIVISI,3) -// tag : V6_vlutvvb_nm -def int_hexagon_V6_vlutvvb_nm : -Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvb_nm">; +def int_hexagon_V6_vandvnqv : +Hexagon_v16i32_v512i1v16i32_Intrinsic<"HEXAGON_V6_vandvnqv">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvvb_nm_128B,VI_ftype_VIVISI,3) -// tag : V6_vlutvvb_nm_128B -def int_hexagon_V6_vlutvvb_nm_128B : -Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvb_nm_128B">; +def int_hexagon_V6_vandvnqv_128B : +Hexagon_v32i32_v1024i1v32i32_Intrinsic<"HEXAGON_V6_vandvnqv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvwh_nm,VD_ftype_VIVISI,3) -// tag : V6_vlutvwh_nm -def int_hexagon_V6_vlutvwh_nm : -Hexagon_V62_v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwh_nm">; +def int_hexagon_V6_lvsplatb : +Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_lvsplatb">; -// -// BUILTIN_INFO(HEXAGON.V6_vlutvwh_nm_128B,VD_ftype_VIVISI,3) -// tag : V6_vlutvwh_nm_128B -def int_hexagon_V6_vlutvwh_nm_128B : -Hexagon_V62_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwh_nm_128B">; +def int_hexagon_V6_lvsplatb_128B : +Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_lvsplatb_128B">; + +def int_hexagon_V6_lvsplath : +Hexagon_v16i32_i32_Intrinsic<"HEXAGON_V6_lvsplath">; + +def int_hexagon_V6_lvsplath_128B : +Hexagon_v32i32_i32_Intrinsic<"HEXAGON_V6_lvsplath_128B">; + +def int_hexagon_V6_ldtpnt0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldtpnt0">; + +def int_hexagon_V6_ldtpnt0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldtpnt0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddcarry,VI_ftype_VIVIQV,3) -// tag: V6_vaddcarry -def int_hexagon_V6_vaddcarry : -Hexagon_v512v64iv512v512v64i_Intrinsic<"HEXAGON_v6_vaddcarry">; +def int_hexagon_V6_vlutvwh_nm : +Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_nm">; -// -// BUILTIN_INFO(HEXAGON.V6_vaddcarry_128B,VI_ftype_VIVIQV,3) -// tag: V6_vaddcarry_128B -def int_hexagon_V6_vaddcarry_128B : -Hexagon_v1024v128iv1024v1024v128i_Intrinsic<"HEXAGON_v6_vaddcarry_128B">; +def int_hexagon_V6_vlutvwh_nm_128B : +Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_nm_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubcarry,VI_ftype_VIVIQV,3) -// tag: V6_vsubcarry -def int_hexagon_V6_vsubcarry : -Hexagon_v512v64iv512v512v64i_Intrinsic<"HEXAGON_v6_vsubcarry">; +def int_hexagon_V6_ldnpnt0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldnpnt0">; -// -// BUILTIN_INFO(HEXAGON.V6_vsubcarry_128B,VI_ftype_VIVIQV,3) -// tag: V6_vsubcarry_128B -def int_hexagon_V6_vsubcarry_128B : -Hexagon_v1024v128iv1024v1024v128i_Intrinsic<"HEXAGON_v6_vsubcarry_128B">; +def int_hexagon_V6_ldnpnt0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldnpnt0_128B">; +def int_hexagon_V6_vmpauhb : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpauhb">; -/// -/// HexagonV65 intrinsics -/// +def int_hexagon_V6_vmpauhb_128B : +Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vmpauhb_128B">; -// -// Hexagon_V65_iLLiLLi_Intrinsic -// tag : A6_vcmpbeq_notany -class Hexagon_V65_iLLiLLi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_ldtnp0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldtnp0">; -// -// Hexagon_V65_v1024v512LLi_Intrinsic -// tag : V6_vrmpyub_rtt -class Hexagon_V65_v1024v512LLi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_ldtnp0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldtnp0_128B">; -// -// Hexagon_V65_v2048v1024LLi_Intrinsic -// tag : V6_vrmpyub_rtt_128B -class Hexagon_V65_v2048v1024LLi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vrounduhub : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrounduhub">; -// -// Hexagon_V65_v1024v1024v512LLi_Intrinsic -// tag : V6_vrmpyub_rtt_acc -class Hexagon_V65_v1024v1024v512LLi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vrounduhub_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrounduhub_128B">; -// -// Hexagon_V65_v2048v2048v1024LLi_Intrinsic -// tag : V6_vrmpyub_rtt_acc_128B -class Hexagon_V65_v2048v2048v1024LLi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vadduhw_acc : +Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vadduhw_acc">; -// -// Hexagon_V65_v512v512v512i_Intrinsic -// tag : V6_vasruwuhsat -class Hexagon_V65_v512v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vadduhw_acc_128B : +Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduhw_acc_128B">; -// -// Hexagon_V65_v1024v1024v1024i_Intrinsic -// tag : V6_vasruwuhsat_128B -class Hexagon_V65_v1024v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_ldcp0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldcp0">; -// -// Hexagon_V65_v512v512v512_Intrinsic -// tag : V6_vavguw -class Hexagon_V65_v512v512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_ldcp0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldcp0_128B">; -// -// Hexagon_V65_v1024v1024v1024_Intrinsic -// tag : V6_vavguw_128B -class Hexagon_V65_v1024v1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vadduwsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vadduwsat">; -// -// Hexagon_V65_v512v512_Intrinsic -// tag : V6_vabsb -class Hexagon_V65_v512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vadduwsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduwsat_128B">; -// -// Hexagon_V65_v1024v1024_Intrinsic -// tag : V6_vabsb_128B -class Hexagon_V65_v1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_ldtnpnt0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldtnpnt0">; -// -// Hexagon_V65_v1024v1024i_Intrinsic -// tag : V6_vmpabuu -class Hexagon_V65_v1024v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_ldtnpnt0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldtnpnt0_128B">; -// -// Hexagon_V65_v2048v2048i_Intrinsic -// tag : V6_vmpabuu_128B -class Hexagon_V65_v2048v2048i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vaddbsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddbsat">; -// -// Hexagon_V65_v2048v2048v2048i_Intrinsic -// tag : V6_vmpabuu_acc_128B -class Hexagon_V65_v2048v2048v2048i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vaddbsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddbsat_128B">; -// -// Hexagon_V65_v1024v1024v512i_Intrinsic -// tag : V6_vmpyh_acc -class Hexagon_V65_v1024v1024v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vandnqrt : +Hexagon_v16i32_v512i1i32_Intrinsic<"HEXAGON_V6_vandnqrt">; -// -// Hexagon_V65_v2048v2048v1024i_Intrinsic -// tag : V6_vmpyh_acc_128B -class Hexagon_V65_v2048v2048v1024i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vandnqrt_128B : +Hexagon_v32i32_v1024i1i32_Intrinsic<"HEXAGON_V6_vandnqrt_128B">; -// -// Hexagon_V65_v512v512v512LLi_Intrinsic -// tag : V6_vmpahhsat -class Hexagon_V65_v512v512v512LLi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vmpyiwub_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwub_acc">; -// -// Hexagon_V65_v1024v1024v1024LLi_Intrinsic -// tag : V6_vmpahhsat_128B -class Hexagon_V65_v1024v1024v1024LLi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vmpyiwub_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwub_acc_128B">; -// -// Hexagon_V65_v512v512LLi_Intrinsic -// tag : V6_vlut4 -class Hexagon_V65_v512v512LLi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vmaxb : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vmaxb">; -// -// Hexagon_V65_v1024v1024LLi_Intrinsic -// tag : V6_vlut4_128B -class Hexagon_V65_v1024v1024LLi_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vmaxb_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vmaxb_128B">; -// -// Hexagon_V65_v512v512i_Intrinsic -// tag : V6_vmpyuhe -class Hexagon_V65_v512v512i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vandvqv : +Hexagon_v16i32_v512i1v16i32_Intrinsic<"HEXAGON_V6_vandvqv">; -// -// Hexagon_V65_v512v64i_Intrinsic -// tag : V6_vprefixqb -class Hexagon_V65_v512v64i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vandvqv_128B : +Hexagon_v32i32_v1024i1v32i32_Intrinsic<"HEXAGON_V6_vandvqv_128B">; -// -// Hexagon_V65_v1024v128i_Intrinsic -// tag : V6_vprefixqb_128B -class Hexagon_V65_v1024v128i_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vaddcarry : +Hexagon_custom_v16i32v512i1_v16i32v16i32v512i1_Intrinsic; -// -// BUILTIN_INFO(HEXAGON.A6_vcmpbeq_notany,QI_ftype_DIDI,2) -// tag : A6_vcmpbeq_notany -def int_hexagon_A6_vcmpbeq_notany : -Hexagon_V65_iLLiLLi_Intrinsic<"HEXAGON_A6_vcmpbeq_notany">; +def int_hexagon_V6_vaddcarry_128B : +Hexagon_custom_v32i32v1024i1_v32i32v32i32v1024i1_Intrinsic_128B; -// -// BUILTIN_INFO(HEXAGON.A6_vcmpbeq_notany_128B,QI_ftype_DIDI,2) -// tag : A6_vcmpbeq_notany_128B -def int_hexagon_A6_vcmpbeq_notany_128B : -Hexagon_V65_iLLiLLi_Intrinsic<"HEXAGON_A6_vcmpbeq_notany_128B">; +def int_hexagon_V6_vasrwuhrndsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrwuhrndsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyub_rtt,VD_ftype_VIDI,2) -// tag : V6_vrmpyub_rtt -def int_hexagon_V6_vrmpyub_rtt : -Hexagon_V65_v1024v512LLi_Intrinsic<"HEXAGON_V6_vrmpyub_rtt">; +def int_hexagon_V6_vasrwuhrndsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrwuhrndsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyub_rtt_128B,VD_ftype_VIDI,2) -// tag : V6_vrmpyub_rtt_128B -def int_hexagon_V6_vrmpyub_rtt_128B : -Hexagon_V65_v2048v1024LLi_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_128B">; +def int_hexagon_V6_vlutvvbi : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvbi">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyub_rtt_acc,VD_ftype_VDVIDI,3) -// tag : V6_vrmpyub_rtt_acc -def int_hexagon_V6_vrmpyub_rtt_acc : -Hexagon_V65_v1024v1024v512LLi_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_acc">; +def int_hexagon_V6_vlutvvbi_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvbi_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpyub_rtt_acc_128B,VD_ftype_VDVIDI,3) -// tag : V6_vrmpyub_rtt_acc_128B -def int_hexagon_V6_vrmpyub_rtt_acc_128B : -Hexagon_V65_v2048v2048v1024LLi_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_acc_128B">; +def int_hexagon_V6_vsubuwsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubuwsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybub_rtt,VD_ftype_VIDI,2) -// tag : V6_vrmpybub_rtt -def int_hexagon_V6_vrmpybub_rtt : -Hexagon_V65_v1024v512LLi_Intrinsic<"HEXAGON_V6_vrmpybub_rtt">; +def int_hexagon_V6_vsubuwsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubuwsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybub_rtt_128B,VD_ftype_VIDI,2) -// tag : V6_vrmpybub_rtt_128B -def int_hexagon_V6_vrmpybub_rtt_128B : -Hexagon_V65_v2048v1024LLi_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_128B">; +def int_hexagon_V6_vaddbsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddbsat_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybub_rtt_acc,VD_ftype_VDVIDI,3) -// tag : V6_vrmpybub_rtt_acc -def int_hexagon_V6_vrmpybub_rtt_acc : -Hexagon_V65_v1024v1024v512LLi_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_acc">; +def int_hexagon_V6_vaddbsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vaddbsat_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vrmpybub_rtt_acc_128B,VD_ftype_VDVIDI,3) -// tag : V6_vrmpybub_rtt_acc_128B -def int_hexagon_V6_vrmpybub_rtt_acc_128B : -Hexagon_V65_v2048v2048v1024LLi_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_acc_128B">; +def int_hexagon_V6_ldnp0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldnp0">; -// -// BUILTIN_INFO(HEXAGON.V6_vasruwuhsat,VI_ftype_VIVISI,3) -// tag : V6_vasruwuhsat -def int_hexagon_V6_vasruwuhsat : -Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vasruwuhsat">; +def int_hexagon_V6_ldnp0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldnp0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasruwuhsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasruwuhsat_128B -def int_hexagon_V6_vasruwuhsat_128B : -Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasruwuhsat_128B">; +def int_hexagon_V6_vasruwuhrndsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasruwuhrndsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vasruhubsat,VI_ftype_VIVISI,3) -// tag : V6_vasruhubsat -def int_hexagon_V6_vasruhubsat : -Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vasruhubsat">; +def int_hexagon_V6_vasruwuhrndsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasruwuhrndsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasruhubsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasruhubsat_128B -def int_hexagon_V6_vasruhubsat_128B : -Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasruhubsat_128B">; +def int_hexagon_V6_vrounduwuh : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrounduwuh">; -// -// BUILTIN_INFO(HEXAGON.V6_vasruhubrndsat,VI_ftype_VIVISI,3) -// tag : V6_vasruhubrndsat -def int_hexagon_V6_vasruhubrndsat : -Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vasruhubrndsat">; +def int_hexagon_V6_vrounduwuh_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrounduwuh_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasruhubrndsat_128B,VI_ftype_VIVISI,3) -// tag : V6_vasruhubrndsat_128B -def int_hexagon_V6_vasruhubrndsat_128B : -Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasruhubrndsat_128B">; +def int_hexagon_V6_vlutvvb_nm : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_nm">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslh_acc,VI_ftype_VIVISI,3) -// tag : V6_vaslh_acc -def int_hexagon_V6_vaslh_acc : -Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vaslh_acc">; +def int_hexagon_V6_vlutvvb_nm_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_nm_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vaslh_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vaslh_acc_128B -def int_hexagon_V6_vaslh_acc_128B : -Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vaslh_acc_128B">; +def int_hexagon_V6_pred_scalar2v2 : +Hexagon_v512i1_i32_Intrinsic<"HEXAGON_V6_pred_scalar2v2">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrh_acc,VI_ftype_VIVISI,3) -// tag : V6_vasrh_acc -def int_hexagon_V6_vasrh_acc : -Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrh_acc">; +def int_hexagon_V6_pred_scalar2v2_128B : +Hexagon_v1024i1_i32_Intrinsic<"HEXAGON_V6_pred_scalar2v2_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vasrh_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vasrh_acc_128B -def int_hexagon_V6_vasrh_acc_128B : -Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrh_acc_128B">; +def int_hexagon_V6_ldp0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldp0">; -// -// BUILTIN_INFO(HEXAGON.V6_vavguw,VI_ftype_VIVI,2) -// tag : V6_vavguw -def int_hexagon_V6_vavguw : -Hexagon_V65_v512v512v512_Intrinsic<"HEXAGON_V6_vavguw">; +def int_hexagon_V6_ldp0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldp0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavguw_128B,VI_ftype_VIVI,2) -// tag : V6_vavguw_128B -def int_hexagon_V6_vavguw_128B : -Hexagon_V65_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavguw_128B">; +def int_hexagon_V6_vaddubh_acc : +Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddubh_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vavguwrnd,VI_ftype_VIVI,2) -// tag : V6_vavguwrnd -def int_hexagon_V6_vavguwrnd : -Hexagon_V65_v512v512v512_Intrinsic<"HEXAGON_V6_vavguwrnd">; +def int_hexagon_V6_vaddubh_acc_128B : +Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddubh_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavguwrnd_128B,VI_ftype_VIVI,2) -// tag : V6_vavguwrnd_128B -def int_hexagon_V6_vavguwrnd_128B : -Hexagon_V65_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavguwrnd_128B">; +def int_hexagon_V6_vaddclbw : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vaddclbw">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgb,VI_ftype_VIVI,2) -// tag : V6_vavgb -def int_hexagon_V6_vavgb : -Hexagon_V65_v512v512v512_Intrinsic<"HEXAGON_V6_vavgb">; +def int_hexagon_V6_vaddclbw_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddclbw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgb_128B,VI_ftype_VIVI,2) -// tag : V6_vavgb_128B -def int_hexagon_V6_vavgb_128B : -Hexagon_V65_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgb_128B">; +def int_hexagon_V6_ldcpnt0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldcpnt0">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgbrnd,VI_ftype_VIVI,2) -// tag : V6_vavgbrnd -def int_hexagon_V6_vavgbrnd : -Hexagon_V65_v512v512v512_Intrinsic<"HEXAGON_V6_vavgbrnd">; +def int_hexagon_V6_ldcpnt0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldcpnt0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vavgbrnd_128B,VI_ftype_VIVI,2) -// tag : V6_vavgbrnd_128B -def int_hexagon_V6_vavgbrnd_128B : -Hexagon_V65_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vavgbrnd_128B">; +def int_hexagon_V6_vadduwsat_dv : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vadduwsat_dv">; -// -// BUILTIN_INFO(HEXAGON.V6_vnavgb,VI_ftype_VIVI,2) -// tag : V6_vnavgb -def int_hexagon_V6_vnavgb : -Hexagon_V65_v512v512v512_Intrinsic<"HEXAGON_V6_vnavgb">; +def int_hexagon_V6_vadduwsat_dv_128B : +Hexagon_v64i32_v64i32v64i32_Intrinsic<"HEXAGON_V6_vadduwsat_dv_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vnavgb_128B,VI_ftype_VIVI,2) -// tag : V6_vnavgb_128B -def int_hexagon_V6_vnavgb_128B : -Hexagon_V65_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vnavgb_128B">; +def int_hexagon_V6_vmpyiwub : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyiwub">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsb,VI_ftype_VI,1) -// tag : V6_vabsb -def int_hexagon_V6_vabsb : -Hexagon_V65_v512v512_Intrinsic<"HEXAGON_V6_vabsb">; +def int_hexagon_V6_vmpyiwub_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyiwub_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsb_128B,VI_ftype_VI,1) -// tag : V6_vabsb_128B -def int_hexagon_V6_vabsb_128B : -Hexagon_V65_v1024v1024_Intrinsic<"HEXAGON_V6_vabsb_128B">; +def int_hexagon_V6_vsubububb_sat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubububb_sat">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsb_sat,VI_ftype_VI,1) -// tag : V6_vabsb_sat -def int_hexagon_V6_vabsb_sat : -Hexagon_V65_v512v512_Intrinsic<"HEXAGON_V6_vabsb_sat">; +def int_hexagon_V6_vsubububb_sat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubububb_sat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vabsb_sat_128B,VI_ftype_VI,1) -// tag : V6_vabsb_sat_128B -def int_hexagon_V6_vabsb_sat_128B : -Hexagon_V65_v1024v1024_Intrinsic<"HEXAGON_V6_vabsb_sat_128B">; +def int_hexagon_V6_ldcnp0 : +Hexagon_v16i32_i32i32_Intrinsic<"HEXAGON_V6_ldcnp0">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabuu,VD_ftype_VDSI,2) -// tag : V6_vmpabuu -def int_hexagon_V6_vmpabuu : -Hexagon_V65_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpabuu">; +def int_hexagon_V6_ldcnp0_128B : +Hexagon_v32i32_i32i32_Intrinsic<"HEXAGON_V6_ldcnp0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabuu_128B,VD_ftype_VDSI,2) -// tag : V6_vmpabuu_128B -def int_hexagon_V6_vmpabuu_128B : -Hexagon_V65_v2048v2048i_Intrinsic<"HEXAGON_V6_vmpabuu_128B">; +def int_hexagon_V6_vlutvwh_oracci : +Hexagon_v32i32_v32i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracci">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabuu_acc,VD_ftype_VDVDSI,3) -// tag : V6_vmpabuu_acc -def int_hexagon_V6_vmpabuu_acc : -Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpabuu_acc">; +def int_hexagon_V6_vlutvwh_oracci_128B : +Hexagon_v64i32_v64i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracci_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpabuu_acc_128B,VD_ftype_VDVDSI,3) -// tag : V6_vmpabuu_acc_128B -def int_hexagon_V6_vmpabuu_acc_128B : -Hexagon_V65_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vmpabuu_acc_128B">; +def int_hexagon_V6_vsubbsat : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsubbsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyh_acc,VD_ftype_VDVISI,3) -// tag : V6_vmpyh_acc -def int_hexagon_V6_vmpyh_acc : -Hexagon_V65_v1024v1024v512i_Intrinsic<"HEXAGON_V6_vmpyh_acc">; +def int_hexagon_V6_vsubbsat_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsubbsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyh_acc_128B,VD_ftype_VDVISI,3) -// tag : V6_vmpyh_acc_128B -def int_hexagon_V6_vmpyh_acc_128B : -Hexagon_V65_v2048v2048v1024i_Intrinsic<"HEXAGON_V6_vmpyh_acc_128B">; +// V65 HVX Instructions. -// -// BUILTIN_INFO(HEXAGON.V6_vmpahhsat,VI_ftype_VIVIDI,3) -// tag : V6_vmpahhsat -def int_hexagon_V6_vmpahhsat : -Hexagon_V65_v512v512v512LLi_Intrinsic<"HEXAGON_V6_vmpahhsat">; +def int_hexagon_V6_vasruhubrndsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasruhubrndsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpahhsat_128B,VI_ftype_VIVIDI,3) -// tag : V6_vmpahhsat_128B -def int_hexagon_V6_vmpahhsat_128B : -Hexagon_V65_v1024v1024v1024LLi_Intrinsic<"HEXAGON_V6_vmpahhsat_128B">; +def int_hexagon_V6_vasruhubrndsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasruhubrndsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpauhuhsat,VI_ftype_VIVIDI,3) -// tag : V6_vmpauhuhsat -def int_hexagon_V6_vmpauhuhsat : -Hexagon_V65_v512v512v512LLi_Intrinsic<"HEXAGON_V6_vmpauhuhsat">; +def int_hexagon_V6_vrmpybub_rtt : +Hexagon_v32i32_v16i32i64_Intrinsic<"HEXAGON_V6_vrmpybub_rtt">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpauhuhsat_128B,VI_ftype_VIVIDI,3) -// tag : V6_vmpauhuhsat_128B -def int_hexagon_V6_vmpauhuhsat_128B : -Hexagon_V65_v1024v1024v1024LLi_Intrinsic<"HEXAGON_V6_vmpauhuhsat_128B">; +def int_hexagon_V6_vrmpybub_rtt_128B : +Hexagon_v64i32_v32i32i64_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpsuhuhsat,VI_ftype_VIVIDI,3) -// tag : V6_vmpsuhuhsat -def int_hexagon_V6_vmpsuhuhsat : -Hexagon_V65_v512v512v512LLi_Intrinsic<"HEXAGON_V6_vmpsuhuhsat">; +def int_hexagon_V6_vmpahhsat : +Hexagon_v16i32_v16i32v16i32i64_Intrinsic<"HEXAGON_V6_vmpahhsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpsuhuhsat_128B,VI_ftype_VIVIDI,3) -// tag : V6_vmpsuhuhsat_128B -def int_hexagon_V6_vmpsuhuhsat_128B : -Hexagon_V65_v1024v1024v1024LLi_Intrinsic<"HEXAGON_V6_vmpsuhuhsat_128B">; +def int_hexagon_V6_vmpahhsat_128B : +Hexagon_v32i32_v32i32v32i32i64_Intrinsic<"HEXAGON_V6_vmpahhsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vlut4,VI_ftype_VIDI,2) -// tag : V6_vlut4 -def int_hexagon_V6_vlut4 : -Hexagon_V65_v512v512LLi_Intrinsic<"HEXAGON_V6_vlut4">; +def int_hexagon_V6_vavguwrnd : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavguwrnd">; -// -// BUILTIN_INFO(HEXAGON.V6_vlut4_128B,VI_ftype_VIDI,2) -// tag : V6_vlut4_128B -def int_hexagon_V6_vlut4_128B : -Hexagon_V65_v1024v1024LLi_Intrinsic<"HEXAGON_V6_vlut4_128B">; +def int_hexagon_V6_vavguwrnd_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavguwrnd_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuhe,VI_ftype_VISI,2) -// tag : V6_vmpyuhe -def int_hexagon_V6_vmpyuhe : -Hexagon_V65_v512v512i_Intrinsic<"HEXAGON_V6_vmpyuhe">; +def int_hexagon_V6_vnavgb : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vnavgb">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuhe_128B,VI_ftype_VISI,2) -// tag : V6_vmpyuhe_128B -def int_hexagon_V6_vmpyuhe_128B : -Hexagon_V65_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyuhe_128B">; +def int_hexagon_V6_vnavgb_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vnavgb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuhe_acc,VI_ftype_VIVISI,3) -// tag : V6_vmpyuhe_acc -def int_hexagon_V6_vmpyuhe_acc : -Hexagon_V65_v512v512v512i_Intrinsic<"HEXAGON_V6_vmpyuhe_acc">; +def int_hexagon_V6_vasrh_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasrh_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vmpyuhe_acc_128B,VI_ftype_VIVISI,3) -// tag : V6_vmpyuhe_acc_128B -def int_hexagon_V6_vmpyuhe_acc_128B : -Hexagon_V65_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyuhe_acc_128B">; +def int_hexagon_V6_vasrh_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasrh_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vprefixqb,VI_ftype_QV,1) -// tag : V6_vprefixqb -def int_hexagon_V6_vprefixqb : -Hexagon_V65_v512v64i_Intrinsic<"HEXAGON_V6_vprefixqb">; +def int_hexagon_V6_vmpauhuhsat : +Hexagon_v16i32_v16i32v16i32i64_Intrinsic<"HEXAGON_V6_vmpauhuhsat">; -// -// BUILTIN_INFO(HEXAGON.V6_vprefixqb_128B,VI_ftype_QV,1) -// tag : V6_vprefixqb_128B -def int_hexagon_V6_vprefixqb_128B : -Hexagon_V65_v1024v128i_Intrinsic<"HEXAGON_V6_vprefixqb_128B">; +def int_hexagon_V6_vmpauhuhsat_128B : +Hexagon_v32i32_v32i32v32i32i64_Intrinsic<"HEXAGON_V6_vmpauhuhsat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vprefixqh,VI_ftype_QV,1) -// tag : V6_vprefixqh -def int_hexagon_V6_vprefixqh : -Hexagon_V65_v512v64i_Intrinsic<"HEXAGON_V6_vprefixqh">; +def int_hexagon_V6_vmpyh_acc : +Hexagon_v32i32_v32i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyh_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vprefixqh_128B,VI_ftype_QV,1) -// tag : V6_vprefixqh_128B -def int_hexagon_V6_vprefixqh_128B : -Hexagon_V65_v1024v128i_Intrinsic<"HEXAGON_V6_vprefixqh_128B">; +def int_hexagon_V6_vmpyh_acc_128B : +Hexagon_v64i32_v64i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyh_acc_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vprefixqw,VI_ftype_QV,1) -// tag : V6_vprefixqw -def int_hexagon_V6_vprefixqw : -Hexagon_V65_v512v64i_Intrinsic<"HEXAGON_V6_vprefixqw">; +def int_hexagon_V6_vrmpybub_rtt_acc : +Hexagon_v32i32_v32i32v16i32i64_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_acc">; -// -// BUILTIN_INFO(HEXAGON.V6_vprefixqw_128B,VI_ftype_QV,1) -// tag : V6_vprefixqw_128B -def int_hexagon_V6_vprefixqw_128B : -Hexagon_V65_v1024v128i_Intrinsic<"HEXAGON_V6_vprefixqw_128B">; +def int_hexagon_V6_vrmpybub_rtt_acc_128B : +Hexagon_v64i32_v64i32v32i32i64_Intrinsic<"HEXAGON_V6_vrmpybub_rtt_acc_128B">; +def int_hexagon_V6_vavgb : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgb">; -// The scatter/gather ones below will not be generated from iset.py. Make sure -// you don't overwrite these. -class Hexagon_V65_vvmemiiv512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vavgb_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgb_128B">; -class Hexagon_V65_vvmemiiv1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vaslh_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vaslh_acc">; -class Hexagon_V65_vvmemiiv2048_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vaslh_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vaslh_acc_128B">; -class Hexagon_V65_vvmemv64iiiv512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vavguw : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavguw">; -class Hexagon_V65_vvmemv128iiiv1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vavguw_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavguw_128B">; -class Hexagon_V65_vvmemv64iiiv1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vlut4 : +Hexagon_v16i32_v16i32i64_Intrinsic<"HEXAGON_V6_vlut4">; -class Hexagon_V65_vvmemv128iiiv2048_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vlut4_128B : +Hexagon_v32i32_v32i32i64_Intrinsic<"HEXAGON_V6_vlut4_128B">; -def int_hexagon_V6_vgathermw : -Hexagon_V65_vvmemiiv512_Intrinsic<"HEXAGON_V6_vgathermw">; +def int_hexagon_V6_vmpyuhe_acc : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vmpyuhe_acc">; -def int_hexagon_V6_vgathermw_128B : -Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermw_128B">; +def int_hexagon_V6_vmpyuhe_acc_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpyuhe_acc_128B">; -def int_hexagon_V6_vgathermh : -Hexagon_V65_vvmemiiv512_Intrinsic<"HEXAGON_V6_vgathermh">; +def int_hexagon_V6_vrmpyub_rtt : +Hexagon_v32i32_v16i32i64_Intrinsic<"HEXAGON_V6_vrmpyub_rtt">; -def int_hexagon_V6_vgathermh_128B : -Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermh_128B">; +def int_hexagon_V6_vrmpyub_rtt_128B : +Hexagon_v64i32_v32i32i64_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_128B">; -def int_hexagon_V6_vgathermhw : -Hexagon_V65_vvmemiiv1024_Intrinsic<"HEXAGON_V6_vgathermhw">; +def int_hexagon_V6_vmpsuhuhsat : +Hexagon_v16i32_v16i32v16i32i64_Intrinsic<"HEXAGON_V6_vmpsuhuhsat">; -def int_hexagon_V6_vgathermhw_128B : -Hexagon_V65_vvmemiiv2048_Intrinsic<"HEXAGON_V6_vgathermhw_128B">; +def int_hexagon_V6_vmpsuhuhsat_128B : +Hexagon_v32i32_v32i32v32i32i64_Intrinsic<"HEXAGON_V6_vmpsuhuhsat_128B">; -def int_hexagon_V6_vgathermwq : -Hexagon_V65_vvmemv64iiiv512_Intrinsic<"HEXAGON_V6_vgathermwq">; +def int_hexagon_V6_vasruhubsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasruhubsat">; -def int_hexagon_V6_vgathermwq_128B : -Hexagon_V65_vvmemv128iiiv1024_Intrinsic<"HEXAGON_V6_vgathermwq_128B">; +def int_hexagon_V6_vasruhubsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasruhubsat_128B">; -def int_hexagon_V6_vgathermhq : -Hexagon_V65_vvmemv64iiiv512_Intrinsic<"HEXAGON_V6_vgathermhq">; +def int_hexagon_V6_vmpyuhe : +Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vmpyuhe">; -def int_hexagon_V6_vgathermhq_128B : -Hexagon_V65_vvmemv128iiiv1024_Intrinsic<"HEXAGON_V6_vgathermhq_128B">; +def int_hexagon_V6_vmpyuhe_128B : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpyuhe_128B">; -def int_hexagon_V6_vgathermhwq : -Hexagon_V65_vvmemv64iiiv1024_Intrinsic<"HEXAGON_V6_vgathermhwq">; +def int_hexagon_V6_vrmpyub_rtt_acc : +Hexagon_v32i32_v32i32v16i32i64_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_acc">; -def int_hexagon_V6_vgathermhwq_128B : -Hexagon_V65_vvmemv128iiiv2048_Intrinsic<"HEXAGON_V6_vgathermhwq_128B">; +def int_hexagon_V6_vrmpyub_rtt_acc_128B : +Hexagon_v64i32_v64i32v32i32i64_Intrinsic<"HEXAGON_V6_vrmpyub_rtt_acc_128B">; -class Hexagon_V65_viiv512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vasruwuhsat : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vasruwuhsat">; -class Hexagon_V65_viiv1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vasruwuhsat_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vasruwuhsat_128B">; -class Hexagon_V65_vv64iiiv512v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vmpabuu_acc : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vmpabuu_acc">; -class Hexagon_V65_vv128iiiv1024v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vmpabuu_acc_128B : +Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vmpabuu_acc_128B">; -class Hexagon_V65_viiv1024v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vprefixqw : +Hexagon_v16i32_v512i1_Intrinsic<"HEXAGON_V6_vprefixqw">; -class Hexagon_V65_viiv2048v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vprefixqw_128B : +Hexagon_v32i32_v1024i1_Intrinsic<"HEXAGON_V6_vprefixqw_128B">; -class Hexagon_V65_vv64iiiv1024v512_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vprefixqh : +Hexagon_v16i32_v512i1_Intrinsic<"HEXAGON_V6_vprefixqh">; -class Hexagon_V65_vv128iiiv2048v1024_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vprefixqh_128B : +Hexagon_v32i32_v1024i1_Intrinsic<"HEXAGON_V6_vprefixqh_128B">; -class Hexagon_V65_v2048_Intrinsic - : Hexagon_Intrinsic; +def int_hexagon_V6_vprefixqb : +Hexagon_v16i32_v512i1_Intrinsic<"HEXAGON_V6_vprefixqb">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermw,v_ftype_SISIVIVI,4) -// tag : V6_vscattermw -def int_hexagon_V6_vscattermw : -Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermw">; +def int_hexagon_V6_vprefixqb_128B : +Hexagon_v32i32_v1024i1_Intrinsic<"HEXAGON_V6_vprefixqb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermw_128B,v_ftype_SISIVIVI,4) -// tag : V6_vscattermw_128B -def int_hexagon_V6_vscattermw_128B : -Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermw_128B">; +def int_hexagon_V6_vabsb : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsb">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermh,v_ftype_SISIVIVI,4) -// tag : V6_vscattermh -def int_hexagon_V6_vscattermh : -Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermh">; +def int_hexagon_V6_vabsb_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsb_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermh_128B,v_ftype_SISIVIVI,4) -// tag : V6_vscattermh_128B -def int_hexagon_V6_vscattermh_128B : -Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermh_128B">; +def int_hexagon_V6_vavgbrnd : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vavgbrnd">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermw_add,v_ftype_SISIVIVI,4) -// tag : V6_vscattermw_add -def int_hexagon_V6_vscattermw_add : -Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermw_add">; +def int_hexagon_V6_vavgbrnd_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vavgbrnd_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermw_add_128B,v_ftype_SISIVIVI,4) -// tag : V6_vscattermw_add_128B -def int_hexagon_V6_vscattermw_add_128B : -Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermw_add_128B">; +def int_hexagon_V6_vdd0 : +Hexagon_v32i32__Intrinsic<"HEXAGON_V6_vdd0">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermh_add,v_ftype_SISIVIVI,4) -// tag : V6_vscattermh_add -def int_hexagon_V6_vscattermh_add : -Hexagon_V65_viiv512v512_Intrinsic<"HEXAGON_V6_vscattermh_add">; +def int_hexagon_V6_vdd0_128B : +Hexagon_v64i32__Intrinsic<"HEXAGON_V6_vdd0_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermh_add_128B,v_ftype_SISIVIVI,4) -// tag : V6_vscattermh_add_128B -def int_hexagon_V6_vscattermh_add_128B : -Hexagon_V65_viiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermh_add_128B">; +def int_hexagon_V6_vmpabuu : +Hexagon_v32i32_v32i32i32_Intrinsic<"HEXAGON_V6_vmpabuu">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermwq,v_ftype_QVSISIVIVI,5) -// tag : V6_vscattermwq -def int_hexagon_V6_vscattermwq : -Hexagon_V65_vv64iiiv512v512_Intrinsic<"HEXAGON_V6_vscattermwq">; +def int_hexagon_V6_vmpabuu_128B : +Hexagon_v64i32_v64i32i32_Intrinsic<"HEXAGON_V6_vmpabuu_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermwq_128B,v_ftype_QVSISIVIVI,5) -// tag : V6_vscattermwq_128B -def int_hexagon_V6_vscattermwq_128B : -Hexagon_V65_vv128iiiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermwq_128B">; +def int_hexagon_V6_vabsb_sat : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabsb_sat">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermhq,v_ftype_QVSISIVIVI,5) -// tag : V6_vscattermhq -def int_hexagon_V6_vscattermhq : -Hexagon_V65_vv64iiiv512v512_Intrinsic<"HEXAGON_V6_vscattermhq">; +def int_hexagon_V6_vabsb_sat_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabsb_sat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermhq_128B,v_ftype_QVSISIVIVI,5) -// tag : V6_vscattermhq_128B -def int_hexagon_V6_vscattermhq_128B : -Hexagon_V65_vv128iiiv1024v1024_Intrinsic<"HEXAGON_V6_vscattermhq_128B">; +// V66 HVX Instructions. -// -// BUILTIN_INFO(HEXAGON.V6_vscattermhw,v_ftype_SISIVDVI,4) -// tag : V6_vscattermhw -def int_hexagon_V6_vscattermhw : -Hexagon_V65_viiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhw">; +def int_hexagon_V6_vaddcarrysat : +Hexagon_v16i32_v16i32v16i32v512i1_Intrinsic<"HEXAGON_V6_vaddcarrysat">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermhw_128B,v_ftype_SISIVDVI,4) -// tag : V6_vscattermhw_128B -def int_hexagon_V6_vscattermhw_128B : -Hexagon_V65_viiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhw_128B">; +def int_hexagon_V6_vaddcarrysat_128B : +Hexagon_v32i32_v32i32v32i32v1024i1_Intrinsic<"HEXAGON_V6_vaddcarrysat_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermhwq,v_ftype_QVSISIVDVI,5) -// tag : V6_vscattermhwq -def int_hexagon_V6_vscattermhwq : -Hexagon_V65_vv64iiiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhwq">; +def int_hexagon_V6_vasr_into : +Hexagon_v32i32_v32i32v16i32v16i32_Intrinsic<"HEXAGON_V6_vasr_into">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermhwq_128B,v_ftype_QVSISIVDVI,5) -// tag : V6_vscattermhwq_128B -def int_hexagon_V6_vscattermhwq_128B : -Hexagon_V65_vv128iiiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhwq_128B">; +def int_hexagon_V6_vasr_into_128B : +Hexagon_v64i32_v64i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vasr_into_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermhw_add,v_ftype_SISIVDVI,4) -// tag : V6_vscattermhw_add -def int_hexagon_V6_vscattermhw_add : -Hexagon_V65_viiv1024v512_Intrinsic<"HEXAGON_V6_vscattermhw_add">; +def int_hexagon_V6_vsatdw : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsatdw">; -// -// BUILTIN_INFO(HEXAGON.V6_vscattermhw_add_128B,v_ftype_SISIVDVI,4) -// tag : V6_vscattermhw_add_128B -def int_hexagon_V6_vscattermhw_add_128B : -Hexagon_V65_viiv2048v1024_Intrinsic<"HEXAGON_V6_vscattermhw_add_128B">; +def int_hexagon_V6_vsatdw_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsatdw_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vdd0,VD_ftype_,0) -// tag : V6_vdd0 -def int_hexagon_V6_vdd0 : -Hexagon_v1024_Intrinsic<"HEXAGON_V6_vdd0">; +def int_hexagon_V6_vrotr : +Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrotr">; + +def int_hexagon_V6_vrotr_128B : +Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vrotr_128B">; -// -// BUILTIN_INFO(HEXAGON.V6_vdd0_128B,VD_ftype_,0) -// tag : V6_vdd0_128B -def int_hexagon_V6_vdd0_128B : -Hexagon_V65_v2048_Intrinsic<"HEXAGON_V6_vdd0_128B">; diff --git a/include/llvm/IR/IntrinsicsRISCV.td b/include/llvm/IR/IntrinsicsRISCV.td index b656622b3632d0f509b977ab1576d5b70af9c8f9..0ac7348b56dbf87d1aecaa33510732b544afcea8 100644 --- a/include/llvm/IR/IntrinsicsRISCV.td +++ b/include/llvm/IR/IntrinsicsRISCV.td @@ -36,4 +36,9 @@ def int_riscv_masked_atomicrmw_min_i32 : MaskedAtomicRMW32WithSextIntrinsic; def int_riscv_masked_atomicrmw_umax_i32 : MaskedAtomicRMW32Intrinsic; def int_riscv_masked_atomicrmw_umin_i32 : MaskedAtomicRMW32Intrinsic; +def int_riscv_masked_cmpxchg_i32 + : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], + [IntrArgMemOnly, NoCapture<0>]>; + } // TargetPrefix = "riscv" diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 53a1e214ae9625bccfe684ff4a0bb6ea84d4c2fb..a59dbe7ea4a42150b375e7d6e88ee8538e4cfc13 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -2726,22 +2726,16 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // ADX let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_addcarryx_u32: + def int_x86_addcarry_32: Intrinsic<[llvm_i8_ty, llvm_i32_ty], [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_addcarryx_u64: + def int_x86_addcarry_64: Intrinsic<[llvm_i8_ty, llvm_i64_ty], [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_addcarry_u32: + def int_x86_subborrow_32: Intrinsic<[llvm_i8_ty, llvm_i32_ty], [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_addcarry_u64: - Intrinsic<[llvm_i8_ty, llvm_i64_ty], - [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_subborrow_u32: - Intrinsic<[llvm_i8_ty, llvm_i32_ty], - [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_subborrow_u64: + def int_x86_subborrow_64: Intrinsic<[llvm_i8_ty, llvm_i64_ty], [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; } diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index 91e44addbd606b72e6325ec3ddd74b89ae5b99b7..ef4a4a9840b2f5799930edd61322829f430d5ca1 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -49,6 +49,7 @@ class MemoryBuffer; class RandomNumberGenerator; template class SmallPtrSetImpl; class StructType; +class VersionTuple; /// A Module instance is used to store all the information related to an /// LLVM module. Modules are the top level container of all other LLVM @@ -868,6 +869,17 @@ public: /// Set that PLT should be avoid for RTLib calls. void setRtLibUseGOT(); + /// @name Utility functions for querying and setting the build SDK version + /// @{ + + /// Attach a build SDK version metadata to this module. + void setSDKVersion(const VersionTuple &V); + + /// Get the build SDK version metadata. + /// + /// An empty version is returned if no such metadata is attached. + VersionTuple getSDKVersion() const; + /// @} /// Take ownership of the given memory buffer. void setOwnedMemoryBuffer(std::unique_ptr MB); diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h index 9a456acf96652e8cdd2e429fe602d87a6b0299fc..6653795d5034b84377a328525cde071012d1b38c 100644 --- a/include/llvm/IR/ModuleSummaryIndex.h +++ b/include/llvm/IR/ModuleSummaryIndex.h @@ -163,13 +163,13 @@ using GlobalValueSummaryMapTy = /// Struct that holds a reference to a particular GUID in a global value /// summary. struct ValueInfo { - PointerIntPair - RefAndFlag; + PointerIntPair + RefAndFlags; ValueInfo() = default; ValueInfo(bool HaveGVs, const GlobalValueSummaryMapTy::value_type *R) { - RefAndFlag.setPointer(R); - RefAndFlag.setInt(HaveGVs); + RefAndFlags.setPointer(R); + RefAndFlags.setInt(HaveGVs); } operator bool() const { return getRef(); } @@ -189,10 +189,12 @@ struct ValueInfo { : getRef()->second.U.Name; } - bool haveGVs() const { return RefAndFlag.getInt(); } + bool haveGVs() const { return RefAndFlags.getInt() & 0x1; } + bool isReadOnly() const { return RefAndFlags.getInt() & 0x2; } + void setReadOnly() { RefAndFlags.setInt(RefAndFlags.getInt() | 0x2); } const GlobalValueSummaryMapTy::value_type *getRef() const { - return RefAndFlag.getPointer(); + return RefAndFlags.getPointer(); } bool isDSOLocal() const; @@ -499,8 +501,9 @@ public: FunctionSummary::GVFlags( GlobalValue::LinkageTypes::AvailableExternallyLinkage, /*NotEligibleToImport=*/true, /*Live=*/true, /*IsLocal=*/false), - 0, FunctionSummary::FFlags{}, std::vector(), - std::move(Edges), std::vector(), + /*InsCount=*/0, FunctionSummary::FFlags{}, /*EntryCount=*/0, + std::vector(), std::move(Edges), + std::vector(), std::vector(), std::vector(), std::vector(), @@ -518,6 +521,11 @@ private: /// Function summary specific flags. FFlags FunFlags; + /// The synthesized entry count of the function. + /// This is only populated during ThinLink phase and remains unused while + /// generating per-module summaries. + uint64_t EntryCount = 0; + /// List of call edge pairs from this function. std::vector CallGraphEdgeList; @@ -525,14 +533,15 @@ private: public: FunctionSummary(GVFlags Flags, unsigned NumInsts, FFlags FunFlags, - std::vector Refs, std::vector CGEdges, + uint64_t EntryCount, std::vector Refs, + std::vector CGEdges, std::vector TypeTests, std::vector TypeTestAssumeVCalls, std::vector TypeCheckedLoadVCalls, std::vector TypeTestAssumeConstVCalls, std::vector TypeCheckedLoadConstVCalls) : GlobalValueSummary(FunctionKind, Flags, std::move(Refs)), - InstCount(NumInsts), FunFlags(FunFlags), + InstCount(NumInsts), FunFlags(FunFlags), EntryCount(EntryCount), CallGraphEdgeList(std::move(CGEdges)) { if (!TypeTests.empty() || !TypeTestAssumeVCalls.empty() || !TypeCheckedLoadVCalls.empty() || !TypeTestAssumeConstVCalls.empty() || @@ -543,6 +552,8 @@ public: std::move(TypeTestAssumeConstVCalls), std::move(TypeCheckedLoadConstVCalls)}); } + // Gets the number of immutable refs in RefEdgeList + unsigned immutableRefCount() const; /// Check if this is a function summary. static bool classof(const GlobalValueSummary *GVS) { @@ -555,6 +566,12 @@ public: /// Get the instruction count recorded for this function. unsigned instCount() const { return InstCount; } + /// Get the synthetic entry count for this function. + uint64_t entryCount() const { return EntryCount; } + + /// Set the synthetic entry count for this function. + void setEntryCount(uint64_t EC) { EntryCount = EC; } + /// Return the list of pairs. ArrayRef calls() const { return CallGraphEdgeList; } @@ -652,19 +669,30 @@ template <> struct DenseMapInfo { /// Global variable summary information to aid decisions and /// implementation of importing. /// -/// Currently this doesn't add anything to the base \p GlobalValueSummary, -/// but is a placeholder as additional info may be added to the summary -/// for variables. +/// Global variable summary has extra flag, telling if it is +/// modified during the program run or not. This affects ThinLTO +/// internalization class GlobalVarSummary : public GlobalValueSummary { - public: - GlobalVarSummary(GVFlags Flags, std::vector Refs) - : GlobalValueSummary(GlobalVarKind, Flags, std::move(Refs)) {} + struct GVarFlags { + GVarFlags(bool ReadOnly = false) : ReadOnly(ReadOnly) {} + + unsigned ReadOnly : 1; + } VarFlags; + + GlobalVarSummary(GVFlags Flags, GVarFlags VarFlags, + std::vector Refs) + : GlobalValueSummary(GlobalVarKind, Flags, std::move(Refs)), + VarFlags(VarFlags) {} /// Check if this is a global variable summary. static bool classof(const GlobalValueSummary *GVS) { return GVS->getSummaryKind() == GlobalVarKind; } + + GVarFlags varflags() const { return VarFlags; } + void setReadOnly(bool RO) { VarFlags.ReadOnly = RO; } + bool isReadOnly() const { return VarFlags.ReadOnly; } }; struct TypeTestResolution { @@ -787,6 +815,9 @@ private: /// considered live. bool WithGlobalValueDeadStripping = false; + /// Indicates that summary-based synthetic entry count propagation has run + bool HasSyntheticEntryCounts = false; + /// Indicates that distributed backend should skip compilation of the /// module. Flag is suppose to be set by distributed ThinLTO indexing /// when it detected that the module is not needed during the final @@ -899,6 +930,9 @@ public: WithGlobalValueDeadStripping = true; } + bool hasSyntheticEntryCounts() const { return HasSyntheticEntryCounts; } + void setHasSyntheticEntryCounts() { HasSyntheticEntryCounts = true; } + bool skipModuleByDistributedBackend() const { return SkipModuleByDistributedBackend; } @@ -931,7 +965,7 @@ public: // Save a string in the Index. Use before passing Name to // getOrInsertValueInfo when the string isn't owned elsewhere (e.g. on the // module's Strtab). - StringRef saveString(std::string String) { return Saver.save(String); } + StringRef saveString(StringRef String) { return Saver.save(String); } /// Return a ValueInfo for \p GUID setting value \p Name. ValueInfo getOrInsertValueInfo(GlobalValue::GUID GUID, StringRef Name) { @@ -1135,11 +1169,15 @@ public: /// Print out strongly connected components for debugging. void dumpSCCs(raw_ostream &OS); + + /// Analyze index and detect unmodified globals + void propagateConstants(const DenseSet &PreservedSymbols); }; /// GraphTraits definition to build SCC for the index template <> struct GraphTraits { typedef ValueInfo NodeRef; + using EdgeRef = FunctionSummary::EdgeTy &; static NodeRef valueInfoFromEdge(FunctionSummary::EdgeTy &P) { return P.first; @@ -1148,6 +1186,8 @@ template <> struct GraphTraits { mapped_iterator::iterator, decltype(&valueInfoFromEdge)>; + using ChildEdgeIteratorType = std::vector::iterator; + static NodeRef getEntryNode(ValueInfo V) { return V; } static ChildIteratorType child_begin(NodeRef N) { @@ -1169,6 +1209,26 @@ template <> struct GraphTraits { cast(N.getSummaryList().front()->getBaseObject()); return ChildIteratorType(F->CallGraphEdgeList.end(), &valueInfoFromEdge); } + + static ChildEdgeIteratorType child_edge_begin(NodeRef N) { + if (!N.getSummaryList().size()) // handle external function + return FunctionSummary::ExternalNode.CallGraphEdgeList.begin(); + + FunctionSummary *F = + cast(N.getSummaryList().front()->getBaseObject()); + return F->CallGraphEdgeList.begin(); + } + + static ChildEdgeIteratorType child_edge_end(NodeRef N) { + if (!N.getSummaryList().size()) // handle external function + return FunctionSummary::ExternalNode.CallGraphEdgeList.end(); + + FunctionSummary *F = + cast(N.getSummaryList().front()->getBaseObject()); + return F->CallGraphEdgeList.end(); + } + + static NodeRef edge_dest(EdgeRef E) { return E.first; } }; template <> @@ -1184,6 +1244,14 @@ struct GraphTraits : public GraphTraits { } }; +static inline bool canImportGlobalVar(GlobalValueSummary *S) { + assert(isa(S->getBaseObject())); + + // We don't import GV with references, because it can result + // in promotion of local variables in the source module. + return !GlobalValue::isInterposableLinkage(S->linkage()) && + !S->notEligibleToImport() && S->refs().empty(); +} } // end namespace llvm #endif // LLVM_IR_MODULESUMMARYINDEX_H diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h index 56f56b4b8c2e613ccd9365adf280af7d8be12ba3..a88ee26b51c3534abc1b2b280cd9fcd89a911c09 100644 --- a/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -224,7 +224,7 @@ template <> struct CustomMappingTraits { GlobalValueSummary::GVFlags( static_cast(FSum.Linkage), FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal), - 0, FunctionSummary::FFlags{}, Refs, + /*NumInsts=*/0, FunctionSummary::FFlags{}, /*EntryCount=*/0, Refs, ArrayRef{}, std::move(FSum.TypeTests), std::move(FSum.TypeTestAssumeVCalls), std::move(FSum.TypeCheckedLoadVCalls), diff --git a/include/llvm/IR/PassInstrumentation.h b/include/llvm/IR/PassInstrumentation.h index 3718b53c45348f3fbcb00ab23e22ca8f5b384c1a..08dac1c4a2746599f86f46368e5eaa08585aa87f 100644 --- a/include/llvm/IR/PassInstrumentation.h +++ b/include/llvm/IR/PassInstrumentation.h @@ -68,10 +68,17 @@ class PreservedAnalyses; /// PassInstrumentation to pass control to the registered callbacks. class PassInstrumentationCallbacks { public: - // Before/After callbacks accept IRUnits, so they need to take them - // as pointers, wrapped with llvm::Any + // Before/After callbacks accept IRUnits whenever appropriate, so they need + // to take them as constant pointers, wrapped with llvm::Any. + // For the case when IRUnit has been invalidated there is a different + // callback to use - AfterPassInvalidated. + // TODO: currently AfterPassInvalidated does not accept IRUnit, since passing + // already invalidated IRUnit is unsafe. There are ways to handle invalidated IRUnits + // in a safe way, and we might pursue that as soon as there is a useful instrumentation + // that needs it. using BeforePassFunc = bool(StringRef, Any); using AfterPassFunc = void(StringRef, Any); + using AfterPassInvalidatedFunc = void(StringRef); using BeforeAnalysisFunc = void(StringRef, Any); using AfterAnalysisFunc = void(StringRef, Any); @@ -90,6 +97,11 @@ public: AfterPassCallbacks.emplace_back(std::move(C)); } + template + void registerAfterPassInvalidatedCallback(CallableT C) { + AfterPassInvalidatedCallbacks.emplace_back(std::move(C)); + } + template void registerBeforeAnalysisCallback(CallableT C) { BeforeAnalysisCallbacks.emplace_back(std::move(C)); @@ -105,6 +117,8 @@ private: SmallVector, 4> BeforePassCallbacks; SmallVector, 4> AfterPassCallbacks; + SmallVector, 4> + AfterPassInvalidatedCallbacks; SmallVector, 4> BeforeAnalysisCallbacks; SmallVector, 4> @@ -139,7 +153,8 @@ public: } /// AfterPass instrumentation point - takes \p Pass instance that has - /// just been executed and constant reference to IR it operates on. + /// just been executed and constant reference to \p IR it operates on. + /// \p IR is guaranteed to be valid at this point. template void runAfterPass(const PassT &Pass, const IRUnitT &IR) const { if (Callbacks) @@ -147,6 +162,16 @@ public: C(Pass.name(), llvm::Any(&IR)); } + /// AfterPassInvalidated instrumentation point - takes \p Pass instance + /// that has just been executed. For use when IR has been invalidated + /// by \p Pass execution. + template + void runAfterPassInvalidated(const PassT &Pass) const { + if (Callbacks) + for (auto &C : Callbacks->AfterPassInvalidatedCallbacks) + C(Pass.name()); + } + /// BeforeAnalysis instrumentation point - takes \p Analysis instance /// to be executed and constant reference to IR it operates on. template diff --git a/include/llvm/IR/PassTimingInfo.h b/include/llvm/IR/PassTimingInfo.h index 2c6a0e08a368f2405dcbfd81e97b80d00300d75b..e9945f997f4395b65b4a82b9935e8432b8144b0b 100644 --- a/include/llvm/IR/PassTimingInfo.h +++ b/include/llvm/IR/PassTimingInfo.h @@ -99,8 +99,8 @@ private: void stopTimer(StringRef PassID); // Implementation of pass instrumentation callbacks. - bool runBeforePass(StringRef PassID, Any IR); - void runAfterPass(StringRef PassID, Any IR); + bool runBeforePass(StringRef PassID); + void runAfterPass(StringRef PassID); }; } // namespace llvm diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h index dd30072ce57110a849e10dc7149cde2c7daa423f..e1e7c7268b5d690a7e08b77caf5b97310a15e21b 100644 --- a/include/llvm/IR/PatternMatch.h +++ b/include/llvm/IR/PatternMatch.h @@ -215,6 +215,7 @@ template struct cst_pred_ty : public Predicate { // Non-splat vector constant: check each element for a match. unsigned NumElts = V->getType()->getVectorNumElements(); assert(NumElts != 0 && "Constant vector with no elements?"); + bool HasNonUndefElements = false; for (unsigned i = 0; i != NumElts; ++i) { Constant *Elt = C->getAggregateElement(i); if (!Elt) @@ -224,8 +225,9 @@ template struct cst_pred_ty : public Predicate { auto *CI = dyn_cast(Elt); if (!CI || !this->isValue(CI->getValue())) return false; + HasNonUndefElements = true; } - return true; + return HasNonUndefElements; } } return false; @@ -272,6 +274,7 @@ template struct cstfp_pred_ty : public Predicate { // Non-splat vector constant: check each element for a match. unsigned NumElts = V->getType()->getVectorNumElements(); assert(NumElts != 0 && "Constant vector with no elements?"); + bool HasNonUndefElements = false; for (unsigned i = 0; i != NumElts; ++i) { Constant *Elt = C->getAggregateElement(i); if (!Elt) @@ -281,8 +284,9 @@ template struct cstfp_pred_ty : public Predicate { auto *CF = dyn_cast(Elt); if (!CF || !this->isValue(CF->getValueAPF())) return false; + HasNonUndefElements = true; } - return true; + return HasNonUndefElements; } } return false; diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 1a9c6f82bfdfdecaff405f4003a40f345fb83a5b..206089cf3a8f4a95896573f8c4b51dcec9d2c23b 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -140,6 +140,7 @@ void initializeExpandISelPseudosPass(PassRegistry&); void initializeExpandMemCmpPassPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); void initializeExpandReductionsPass(PassRegistry&); +void initializeMakeGuardsExplicitLegacyPassPass(PassRegistry&); void initializeExternalAAWrapperPassPass(PassRegistry&); void initializeFEntryInserterPass(PassRegistry&); void initializeFinalizeMachineBundlesPass(PassRegistry&); @@ -182,6 +183,7 @@ void initializeInstrProfilingLegacyPassPass(PassRegistry&); void initializeInstructionCombiningPassPass(PassRegistry&); void initializeInstructionSelectPass(PassRegistry&); void initializeInterleavedAccessPass(PassRegistry&); +void initializeInterleavedLoadCombinePass(PassRegistry &); void initializeInternalizeLegacyPassPass(PassRegistry&); void initializeIntervalPartitionPass(PassRegistry&); void initializeJumpThreadingPass(PassRegistry&); @@ -205,7 +207,7 @@ void initializeLiveRangeShrinkPass(PassRegistry&); void initializeLiveRegMatrixPass(PassRegistry&); void initializeLiveStacksPass(PassRegistry&); void initializeLiveVariablesPass(PassRegistry&); -void initializeLoadStoreVectorizerPass(PassRegistry&); +void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry&); void initializeLoaderPassPass(PassRegistry&); void initializeLocalStackSlotPassPass(PassRegistry&); void initializeLocalizerPass(PassRegistry&); @@ -354,7 +356,7 @@ void initializeSampleProfileLoaderLegacyPassPass(PassRegistry&); void initializeSanitizerCoverageModulePass(PassRegistry&); void initializeScalarEvolutionWrapperPassPass(PassRegistry&); void initializeScalarizeMaskedMemIntrinPass(PassRegistry&); -void initializeScalarizerPass(PassRegistry&); +void initializeScalarizerLegacyPassPass(PassRegistry&); void initializeScavengerTestPass(PassRegistry&); void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&); void initializeSeparateConstOffsetFromGEPPass(PassRegistry&); @@ -371,6 +373,8 @@ void initializeSpillPlacementPass(PassRegistry&); void initializeStackColoringPass(PassRegistry&); void initializeStackMapLivenessPass(PassRegistry&); void initializeStackProtectorPass(PassRegistry&); +void initializeStackSafetyGlobalInfoWrapperPassPass(PassRegistry &); +void initializeStackSafetyInfoWrapperPassPass(PassRegistry &); void initializeStackSlotColoringPass(PassRegistry&); void initializeStraightLineStrengthReducePass(PassRegistry&); void initializeStripDeadDebugInfoPass(PassRegistry&); @@ -396,6 +400,7 @@ void initializeUnreachableMachineBlockElimPass(PassRegistry&); void initializeVerifierLegacyPassPass(PassRegistry&); void initializeVirtRegMapPass(PassRegistry&); void initializeVirtRegRewriterPass(PassRegistry&); +void initializeWarnMissedTransformationsLegacyPass(PassRegistry &); void initializeWasmEHPreparePass(PassRegistry&); void initializeWholeProgramDevirtPass(PassRegistry&); void initializeWinEHPreparePass(PassRegistry&); diff --git a/include/llvm/LTO/Config.h b/include/llvm/LTO/Config.h index c0ad32f485c3f0a126ec10204bfc1ebbb02225d5..7058602c3ee2ef01f1456c61ee7d410583c17bb7 100644 --- a/include/llvm/LTO/Config.h +++ b/include/llvm/LTO/Config.h @@ -49,6 +49,10 @@ struct Config { /// Use the new pass manager bool UseNewPM = false; + /// Flag to indicate that the optimizer should not assume builtins are present + /// on the target. + bool Freestanding = false; + /// Disable entirely the optimizer, including importing for ThinLTO bool CodeGenOnly = false; diff --git a/include/llvm/LTO/LTO.h b/include/llvm/LTO/LTO.h index 7d6beab6b441a0dfe4aeeb3ec1546cab99162ee6..15390873056af5b56f921a3acff143ce7db6c886 100644 --- a/include/llvm/LTO/LTO.h +++ b/include/llvm/LTO/LTO.h @@ -40,13 +40,13 @@ class Module; class Target; class raw_pwrite_stream; -/// Resolve Weak and LinkOnce values in the \p Index. Linkage changes recorded -/// in the index and the ThinLTO backends must apply the changes to the Module -/// via thinLTOResolveWeakForLinkerModule. +/// Resolve linkage for prevailing symbols in the \p Index. Linkage changes +/// recorded in the index and the ThinLTO backends must apply the changes to +/// the module via thinLTOResolvePrevailingInModule. /// /// This is done for correctness (if value exported, ensure we always /// emit a copy), and compile-time optimization (allow drop of duplicates). -void thinLTOResolveWeakForLinkerInIndex( +void thinLTOResolvePrevailingInIndex( ModuleSummaryIndex &Index, function_ref isPrevailing, @@ -60,6 +60,19 @@ void thinLTOInternalizeAndPromoteInIndex( ModuleSummaryIndex &Index, function_ref isExported); +/// Computes a unique hash for the Module considering the current list of +/// export/import and other global analysis results. +/// The hash is produced in \p Key. +void computeLTOCacheKey( + SmallString<40> &Key, const lto::Config &Conf, + const ModuleSummaryIndex &Index, StringRef ModuleID, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + const std::set &CfiFunctionDefs = {}, + const std::set &CfiFunctionDecls = {}); + namespace lto { /// Given the original \p Path to an output file, replace any path diff --git a/include/llvm/LTO/SummaryBasedOptimizations.h b/include/llvm/LTO/SummaryBasedOptimizations.h new file mode 100644 index 0000000000000000000000000000000000000000..ad3a8e7dc77b4a1b003a6feeb82fbad6afeae932 --- /dev/null +++ b/include/llvm/LTO/SummaryBasedOptimizations.h @@ -0,0 +1,17 @@ +//=- llvm/LTO/SummaryBasedOptimizations.h -Link time optimizations-*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LTO_SUMMARYBASEDOPTIMIZATIONS_H +#define LLVM_LTO_SUMMARYBASEDOPTIMIZATIONS_H +namespace llvm { +class ModuleSummaryIndex; +void computeSyntheticCounts(ModuleSummaryIndex &Index); + +} // namespace llvm +#endif diff --git a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h index 2ff317a322737119789eb9ca223dc9fa8d0222a4..d4c69a1ce260b7337d1ab49dad9bacfa864db806 100644 --- a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h +++ b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h @@ -273,8 +273,8 @@ public: /** * Compute and emit the imported files for module at \p ModulePath. */ - static void emitImports(StringRef ModulePath, StringRef OutputName, - ModuleSummaryIndex &Index); + void emitImports(Module &Module, StringRef OutputName, + ModuleSummaryIndex &Index); /** * Perform cross-module importing for the module identified by @@ -285,8 +285,8 @@ public: /** * Compute the list of summaries needed for importing into module. */ - static void gatherImportedSummariesForModule( - StringRef ModulePath, ModuleSummaryIndex &Index, + void gatherImportedSummariesForModule( + Module &Module, ModuleSummaryIndex &Index, std::map &ModuleToSummariesForIndex); /** diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index e2378cea90ce187fc00c4bc44d8e1ac68eda3305..a31caeea2eedcf6dbbffb87521fe38ccf1dae0d6 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -50,6 +50,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/InstSimplifyPass.h" +#include "llvm/Transforms/Scalar/Scalarizer.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" @@ -219,6 +220,7 @@ namespace { (void) llvm::createFloat2IntPass(); (void) llvm::createEliminateAvailableExternallyPass(); (void) llvm::createScalarizeMaskedMemIntrinPass(); + (void) llvm::createWarnMissedTransformationsPass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); diff --git a/include/llvm/MC/MCAsmMacro.h b/include/llvm/MC/MCAsmMacro.h index 09b32c7ea3337f98673f35687a7494138a8cc620..135fa4f2e33d64a800b95ecf0d018eceaaa67f21 100644 --- a/include/llvm/MC/MCAsmMacro.h +++ b/include/llvm/MC/MCAsmMacro.h @@ -52,7 +52,7 @@ public: Pipe, PipePipe, Caret, Amp, AmpAmp, Exclaim, ExclaimEqual, Percent, Hash, Less, LessEqual, LessLess, LessGreater, - Greater, GreaterEqual, GreaterGreater, At, + Greater, GreaterEqual, GreaterGreater, At, MinusGreater, // MIPS unary expression operators such as %neg. PercentCall16, PercentCall_Hi, PercentCall_Lo, PercentDtprel_Hi, diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h index 0f9499d705e4666a33b650be21c932e5a2e71fb7..986c6e17548fc3501c329a314c66d5a86975fb5c 100644 --- a/include/llvm/MC/MCAssembler.h +++ b/include/llvm/MC/MCAssembler.h @@ -23,6 +23,7 @@ #include "llvm/MC/MCFragment.h" #include "llvm/MC/MCLinkerOptimizationHint.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/VersionTuple.h" #include #include #include @@ -94,6 +95,8 @@ public: unsigned Major; unsigned Minor; unsigned Update; + /// An optional version of the SDK that was used to build the source. + VersionTuple SDKVersion; }; private: @@ -255,20 +258,24 @@ public: /// MachO deployment target version information. const VersionInfoType &getVersionInfo() const { return VersionInfo; } void setVersionMin(MCVersionMinType Type, unsigned Major, unsigned Minor, - unsigned Update) { + unsigned Update, + VersionTuple SDKVersion = VersionTuple()) { VersionInfo.EmitBuildVersion = false; VersionInfo.TypeOrPlatform.Type = Type; VersionInfo.Major = Major; VersionInfo.Minor = Minor; VersionInfo.Update = Update; + VersionInfo.SDKVersion = SDKVersion; } void setBuildVersion(MachO::PlatformType Platform, unsigned Major, - unsigned Minor, unsigned Update) { + unsigned Minor, unsigned Update, + VersionTuple SDKVersion = VersionTuple()) { VersionInfo.EmitBuildVersion = true; VersionInfo.TypeOrPlatform.Platform = Platform; VersionInfo.Major = Major; VersionInfo.Minor = Minor; VersionInfo.Update = Update; + VersionInfo.SDKVersion = SDKVersion; } /// Reuse an assembler instance diff --git a/include/llvm/MC/MCCodeView.h b/include/llvm/MC/MCCodeView.h index 2678cf4388f8d3b56c89320398cd1306e713de7a..cef03a409f9506c903be3739ae138511a9c06228 100644 --- a/include/llvm/MC/MCCodeView.h +++ b/include/llvm/MC/MCCodeView.h @@ -194,7 +194,7 @@ public: void encodeInlineLineTable(MCAsmLayout &Layout, MCCVInlineLineTableFragment &F); - void + MCFragment * emitDefRange(MCObjectStreamer &OS, ArrayRef> Ranges, StringRef FixedSizePortion); diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h index 2bfaf19cf2c64ace28f67de449673d51b2ea05ff..aeaa4e55879b08983bef054277ac874379657fa0 100644 --- a/include/llvm/MC/MCDwarf.h +++ b/include/llvm/MC/MCDwarf.h @@ -430,6 +430,7 @@ public: OpUndefined, OpRegister, OpWindowSave, + OpNegateRAState, OpGnuArgsSize }; @@ -509,6 +510,11 @@ public: return MCCFIInstruction(OpWindowSave, L, 0, 0, ""); } + /// .cfi_negate_ra_state AArch64 negate RA state. + static MCCFIInstruction createNegateRAState(MCSymbol *L) { + return MCCFIInstruction(OpNegateRAState, L, 0, 0, ""); + } + /// .cfi_restore says that the rule for Register is now the same as it /// was at the beginning of the function, after all initial instructions added /// by .cfi_startproc were executed. diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h index 23dde10e785bb69e9a96ecd1f3726a558140b6f2..8cb6b86fd672eedd9d8e57a117fb6fb8146f80bc 100644 --- a/include/llvm/MC/MCExpr.h +++ b/include/llvm/MC/MCExpr.h @@ -288,6 +288,7 @@ public: VK_WebAssembly_FUNCTION, // Function table index, rather than virtual addr VK_WebAssembly_GLOBAL, // Global object index VK_WebAssembly_TYPEINDEX,// Type table index + VK_WebAssembly_EVENT, // Event index VK_AMDGPU_GOTPCREL32_LO, // symbol@gotpcrel32@lo VK_AMDGPU_GOTPCREL32_HI, // symbol@gotpcrel32@hi diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h index 67bb11a703872731d4eeb9693bb321ab0f9558ed..d501b686bb2e79d83fc397db4531d46b43850483 100644 --- a/include/llvm/MC/MCInst.h +++ b/include/llvm/MC/MCInst.h @@ -208,6 +208,8 @@ public: /// string. void dump_pretty(raw_ostream &OS, const MCInstPrinter *Printer = nullptr, StringRef Separator = " ") const; + void dump_pretty(raw_ostream &OS, StringRef Name, + StringRef Separator = " ") const; }; inline raw_ostream& operator<<(raw_ostream &OS, const MCOperand &MO) { diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h index ba97aeb837bedefd7f6bb6d58237a875e95c975b..61e7d09afbcbc4f6507c89c5b936612396be049c 100644 --- a/include/llvm/MC/MCInstrDesc.h +++ b/include/llvm/MC/MCInstrDesc.h @@ -151,7 +151,8 @@ enum Flag { InsertSubreg, Convergent, Add, - Trap + Trap, + VariadicOpsAreDefs, }; } @@ -383,6 +384,11 @@ public: /// additional values. bool isConvergent() const { return Flags & (1ULL << MCID::Convergent); } + /// Return true if variadic operands of this instruction are definitions. + bool variadicOpsAreDefs() const { + return Flags & (1ULL << MCID::VariadicOpsAreDefs); + } + //===--------------------------------------------------------------------===// // Side Effect Analysis //===--------------------------------------------------------------------===// diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h index 729aa23ef333a3117602f8f72d7de99d1066bbd6..f8142ccd8ac5b68ba95fc92731e4c87a2e0ede6c 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -18,6 +18,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/VersionTuple.h" namespace llvm { class MCContext; @@ -241,6 +242,9 @@ public: MCSection *getCompactUnwindSection() const { return CompactUnwindSection; } MCSection *getDwarfAbbrevSection() const { return DwarfAbbrevSection; } MCSection *getDwarfInfoSection() const { return DwarfInfoSection; } + MCSection *getDwarfInfoSection(uint64_t Hash) const { + return getDwarfComdatSection(".debug_info", Hash); + } MCSection *getDwarfLineSection() const { return DwarfLineSection; } MCSection *getDwarfLineStrSection() const { return DwarfLineStrSection; } MCSection *getDwarfFrameSection() const { return DwarfFrameSection; } @@ -277,7 +281,9 @@ public: return DwarfAccelTypesSection; } MCSection *getDwarfInfoDWOSection() const { return DwarfInfoDWOSection; } - MCSection *getDwarfTypesSection(uint64_t Hash) const; + MCSection *getDwarfTypesSection(uint64_t Hash) const { + return getDwarfComdatSection(".debug_types", Hash); + } MCSection *getDwarfTypesDWOSection() const { return DwarfTypesDWOSection; } MCSection *getDwarfAbbrevDWOSection() const { return DwarfAbbrevDWOSection; } MCSection *getDwarfStrDWOSection() const { return DwarfStrDWOSection; } @@ -385,14 +391,22 @@ private: bool PositionIndependent; MCContext *Ctx; Triple TT; + VersionTuple SDKVersion; void initMachOMCObjectFileInfo(const Triple &T); void initELFMCObjectFileInfo(const Triple &T, bool Large); void initCOFFMCObjectFileInfo(const Triple &T); void initWasmMCObjectFileInfo(const Triple &T); + MCSection *getDwarfComdatSection(const char *Name, uint64_t Hash) const; public: const Triple &getTargetTriple() const { return TT; } + + void setSDKVersion(const VersionTuple &TheSDKVersion) { + SDKVersion = TheSDKVersion; + } + + const VersionTuple &getSDKVersion() const { return SDKVersion; } }; } // end namespace llvm diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h index c9e577b7e295e4a06d77d716708230c157254399..892909656c150a070023cf22e0552ccec02c670f 100644 --- a/include/llvm/MC/MCObjectStreamer.h +++ b/include/llvm/MC/MCObjectStreamer.h @@ -39,12 +39,21 @@ class MCObjectStreamer : public MCStreamer { bool EmitEHFrame; bool EmitDebugFrame; SmallVector PendingLabels; + struct PendingMCFixup { + const MCSymbol *Sym; + MCFixup Fixup; + MCDataFragment *DF; + PendingMCFixup(const MCSymbol *McSym, MCDataFragment *F, MCFixup McFixup) + : Sym(McSym), Fixup(McFixup), DF(F) {} + }; + SmallVector PendingFixups; virtual void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo&) = 0; void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override; void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override; MCSymbol *EmitCFILabel() override; void EmitInstructionImpl(const MCInst &Inst, const MCSubtargetInfo &STI); + void resolvePendingFixups(); protected: MCObjectStreamer(MCContext &Context, std::unique_ptr TAB, diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h index 41305296b004f67b97c6b07c637f65d3473d828a..689ac73cbdd1c37f6aeb882b7ba9b1dfeb6eb001 100644 --- a/include/llvm/MC/MCSchedule.h +++ b/include/llvm/MC/MCSchedule.h @@ -183,6 +183,8 @@ struct MCExtraProcessorInfo { unsigned NumRegisterFiles; const MCRegisterCostEntry *RegisterCostTable; unsigned NumRegisterCostEntries; + unsigned LoadQueueID; + unsigned StoreQueueID; }; /// Machine model for scheduling, bundling, and heuristics. diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h index edf0a72d9c12d3848529a541cf46ff0f14729c28..849ea9a624e899455a1fbc11f95ed44d7b479b62 100644 --- a/include/llvm/MC/MCStreamer.h +++ b/include/llvm/MC/MCStreamer.h @@ -28,6 +28,7 @@ #include "llvm/Support/MD5.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/TargetParser.h" +#include "llvm/Support/VersionTuple.h" #include #include #include @@ -452,14 +453,17 @@ public: /// Specify the Mach-O minimum deployment target version. virtual void EmitVersionMin(MCVersionMinType Type, unsigned Major, - unsigned Minor, unsigned Update) {} + unsigned Minor, unsigned Update, + VersionTuple SDKVersion) {} /// Emit/Specify Mach-O build version command. /// \p Platform should be one of MachO::PlatformType. virtual void EmitBuildVersion(unsigned Platform, unsigned Major, - unsigned Minor, unsigned Update) {} + unsigned Minor, unsigned Update, + VersionTuple SDKVersion) {} - void EmitVersionForTarget(const Triple &Target); + void EmitVersionForTarget(const Triple &Target, + const VersionTuple &SDKVersion); /// Note in the output that the specified \p Func is a Thumb mode /// function (ARM target only). @@ -896,6 +900,7 @@ public: virtual void EmitCFIUndefined(int64_t Register); virtual void EmitCFIRegister(int64_t Register1, int64_t Register2); virtual void EmitCFIWindowSave(); + virtual void EmitCFINegateRAState(); virtual void EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc = SMLoc()); virtual void EmitWinCFIEndProc(SMLoc Loc = SMLoc()); diff --git a/include/llvm/MC/MCSymbolWasm.h b/include/llvm/MC/MCSymbolWasm.h index 281178aea61229158c8a45c629b460b1f7e0786f..8e66dc881d0fba45ca0cac9278fd8202b0fdb853 100644 --- a/include/llvm/MC/MCSymbolWasm.h +++ b/include/llvm/MC/MCSymbolWasm.h @@ -21,8 +21,8 @@ class MCSymbolWasm : public MCSymbol { bool IsComdat = false; std::string ModuleName; wasm::WasmSignature *Signature = nullptr; - wasm::WasmGlobalType GlobalType; - bool GlobalTypeSet = false; + Optional GlobalType; + Optional EventType; /// An expression describing how to calculate the size of a symbol. If a /// symbol has no size this field will be NULL. @@ -42,6 +42,7 @@ public: bool isData() const { return Type == wasm::WASM_SYMBOL_TYPE_DATA; } bool isGlobal() const { return Type == wasm::WASM_SYMBOL_TYPE_GLOBAL; } bool isSection() const { return Type == wasm::WASM_SYMBOL_TYPE_SECTION; } + bool isEvent() const { return Type == wasm::WASM_SYMBOL_TYPE_EVENT; } wasm::WasmSymbolType getType() const { return Type; } void setType(wasm::WasmSymbolType type) { Type = type; } @@ -61,14 +62,16 @@ public: void setSignature(wasm::WasmSignature *Sig) { Signature = Sig; } const wasm::WasmGlobalType &getGlobalType() const { - assert(GlobalTypeSet); - return GlobalType; + assert(GlobalType.hasValue()); + return GlobalType.getValue(); } + void setGlobalType(wasm::WasmGlobalType GT) { GlobalType = GT; } - void setGlobalType(wasm::WasmGlobalType GT) { - GlobalTypeSet = true; - GlobalType = GT; + const wasm::WasmEventType &getEventType() const { + assert(EventType.hasValue()); + return EventType.getValue(); } + void setEventType(wasm::WasmEventType ET) { EventType = ET; } }; } // end namespace llvm diff --git a/tools/llvm-mca/include/Context.h b/include/llvm/MCA/Context.h similarity index 90% rename from tools/llvm-mca/include/Context.h rename to include/llvm/MCA/Context.h index ebd1528e371f722e970785dc689f383ad1e813a5..6b2bee0fdc4210779189c4fe8e7b43dfe9426ed5 100644 --- a/tools/llvm-mca/include/Context.h +++ b/include/llvm/MCA/Context.h @@ -15,14 +15,15 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_CONTEXT_H -#define LLVM_TOOLS_LLVM_MCA_CONTEXT_H -#include "HardwareUnits/HardwareUnit.h" -#include "InstrBuilder.h" -#include "Pipeline.h" -#include "SourceMgr.h" +#ifndef LLVM_MCA_CONTEXT_H +#define LLVM_MCA_CONTEXT_H + #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MCA/HardwareUnits/HardwareUnit.h" +#include "llvm/MCA/InstrBuilder.h" +#include "llvm/MCA/Pipeline.h" +#include "llvm/MCA/SourceMgr.h" #include namespace llvm { @@ -65,4 +66,4 @@ public: } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_CONTEXT_H +#endif // LLVM_MCA_CONTEXT_H diff --git a/tools/llvm-mca/include/HWEventListener.h b/include/llvm/MCA/HWEventListener.h similarity index 96% rename from tools/llvm-mca/include/HWEventListener.h rename to include/llvm/MCA/HWEventListener.h index 0216fae7866b71ead66e3aa5635997664e1915c9..3b32b2cd657721d7cb2694e5f7d0c2c684dedfcb 100644 --- a/tools/llvm-mca/include/HWEventListener.h +++ b/include/llvm/MCA/HWEventListener.h @@ -12,12 +12,12 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H -#define LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H +#ifndef LLVM_MCA_HWEVENTLISTENER_H +#define LLVM_MCA_HWEVENTLISTENER_H -#include "Instruction.h" -#include "Support.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/MCA/Instruction.h" +#include "llvm/MCA/Support.h" namespace llvm { namespace mca { @@ -153,4 +153,4 @@ private: } // namespace mca } // namespace llvm -#endif +#endif // LLVM_MCA_HWEVENTLISTENER_H diff --git a/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h b/include/llvm/MCA/HardwareUnits/HardwareUnit.h similarity index 86% rename from tools/llvm-mca/include/HardwareUnits/HardwareUnit.h rename to include/llvm/MCA/HardwareUnits/HardwareUnit.h index 5070418c11bffc3f97a2dfd8d21d36adb9eb22cd..104a2009f219d234986557de065cafd3ebe70aca 100644 --- a/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h +++ b/include/llvm/MCA/HardwareUnits/HardwareUnit.h @@ -13,8 +13,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H -#define LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H +#ifndef LLVM_MCA_HARDWAREUNIT_H +#define LLVM_MCA_HARDWAREUNIT_H namespace llvm { namespace mca { @@ -30,4 +30,4 @@ public: } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H +#endif // LLVM_MCA_HARDWAREUNIT_H diff --git a/tools/llvm-mca/include/HardwareUnits/LSUnit.h b/include/llvm/MCA/HardwareUnits/LSUnit.h similarity index 65% rename from tools/llvm-mca/include/HardwareUnits/LSUnit.h rename to include/llvm/MCA/HardwareUnits/LSUnit.h index 6b36282ca7253aa7499d9c32f91ab42558bdf49d..e217fc50f780c5d78cc4a4051dd322c204660e52 100644 --- a/tools/llvm-mca/include/HardwareUnits/LSUnit.h +++ b/include/llvm/MCA/HardwareUnits/LSUnit.h @@ -13,16 +13,18 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_LSUNIT_H -#define LLVM_TOOLS_LLVM_MCA_LSUNIT_H +#ifndef LLVM_MCA_LSUNIT_H +#define LLVM_MCA_LSUNIT_H -#include "HardwareUnits/HardwareUnit.h" -#include +#include "llvm/ADT/SmallSet.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/MCA/HardwareUnits/HardwareUnit.h" namespace llvm { namespace mca { class InstRef; +class Scheduler; /// A Load/Store Unit implementing a load and store queues. /// @@ -99,8 +101,46 @@ class LSUnit : public HardwareUnit { // If true, loads will never alias with stores. This is the default. bool NoAlias; - std::set LoadQueue; - std::set StoreQueue; + // When a `MayLoad` instruction is dispatched to the schedulers for execution, + // the LSUnit reserves an entry in the `LoadQueue` for it. + // + // LoadQueue keeps track of all the loads that are in-flight. A load + // instruction is eventually removed from the LoadQueue when it reaches + // completion stage. That means, a load leaves the queue whe it is 'executed', + // and its value can be forwarded on the data path to outside units. + // + // This class doesn't know about the latency of a load instruction. So, it + // conservatively/pessimistically assumes that the latency of a load opcode + // matches the instruction latency. + // + // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses), + // and load/store conflicts, the latency of a load is determined by the depth + // of the load pipeline. So, we could use field `LoadLatency` in the + // MCSchedModel to model that latency. + // Field `LoadLatency` often matches the so-called 'load-to-use' latency from + // L1D, and it usually already accounts for any extra latency due to data + // forwarding. + // When doing throughput analysis, `LoadLatency` is likely to + // be a better predictor of load latency than instruction latency. This is + // particularly true when simulating code with temporal/spatial locality of + // memory accesses. + // Using `LoadLatency` (instead of the instruction latency) is also expected + // to improve the load queue allocation for long latency instructions with + // folded memory operands (See PR39829). + // + // FIXME: On some processors, load/store operations are split into multiple + // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but + // not 256-bit data types. So, a 256-bit load is effectively split into two + // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For + // simplicity, this class optimistically assumes that a load instruction only + // consumes one entry in the LoadQueue. Similarly, store instructions only + // consume a single entry in the StoreQueue. + // In future, we should reassess the quality of this design, and consider + // alternative approaches that let instructions specify the number of + // load/store queue entries which they consume at dispatch stage (See + // PR39830). + SmallSet LoadQueue; + SmallSet StoreQueue; void assignLQSlot(unsigned Index); void assignSQSlot(unsigned Index); @@ -109,12 +149,12 @@ class LSUnit : public HardwareUnit { // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is // conservatively treated as a store barrier. It forces older store to be // executed before newer stores are issued. - std::set StoreBarriers; + SmallSet StoreBarriers; // An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is // conservatively treated as a load barrier. It forces older loads to execute // before newer loads are issued. - std::set LoadBarriers; + SmallSet LoadBarriers; bool isSQEmpty() const { return StoreQueue.empty(); } bool isLQEmpty() const { return LoadQueue.empty(); } @@ -122,8 +162,8 @@ class LSUnit : public HardwareUnit { bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; } public: - LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false) - : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {} + LSUnit(const MCSchedModel &SM, unsigned LQ = 0, unsigned SQ = 0, + bool AssumeNoAlias = false); #ifndef NDEBUG void dump() const; @@ -149,10 +189,19 @@ public: // 5. A load has to wait until an older load barrier is fully executed. // 6. A store has to wait until an older store barrier is fully executed. virtual bool isReady(const InstRef &IR) const; + + // Load and store instructions are tracked by their corresponding queues from + // dispatch until the "instruction executed" event. + // Only when a load instruction reaches the 'Executed' stage, its value + // becomes available to the users. At that point, the load no longer needs to + // be tracked by the load queue. + // FIXME: For simplicity, we optimistically assume a similar behavior for + // store instructions. In practice, store operations don't tend to leave the + // store queue until they reach the 'Retired' stage (See PR39830). void onInstructionExecuted(const InstRef &IR); }; } // namespace mca } // namespace llvm -#endif +#endif // LLVM_MCA_LSUNIT_H diff --git a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h b/include/llvm/MCA/HardwareUnits/RegisterFile.h similarity index 98% rename from tools/llvm-mca/include/HardwareUnits/RegisterFile.h rename to include/llvm/MCA/HardwareUnits/RegisterFile.h index d9949bf4f6a13be5208240ba299cb4d7927895c1..c23ab038923481363f0d5718af4ca56f56dd983f 100644 --- a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h +++ b/include/llvm/MCA/HardwareUnits/RegisterFile.h @@ -14,14 +14,14 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H -#define LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H +#ifndef LLVM_MCA_REGISTER_FILE_H +#define LLVM_MCA_REGISTER_FILE_H -#include "HardwareUnits/HardwareUnit.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSchedule.h" +#include "llvm/MCA/HardwareUnits/HardwareUnit.h" #include "llvm/Support/Error.h" namespace llvm { @@ -236,4 +236,4 @@ public: } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H +#endif // LLVM_MCA_REGISTER_FILE_H diff --git a/tools/llvm-mca/include/HardwareUnits/ResourceManager.h b/include/llvm/MCA/HardwareUnits/ResourceManager.h similarity index 97% rename from tools/llvm-mca/include/HardwareUnits/ResourceManager.h rename to include/llvm/MCA/HardwareUnits/ResourceManager.h index 065ead8f1a8abd300ad06f4e0fae775b8cb061b8..5429f2b8cf0a5c928f40a32ab1d26c3ae137e921 100644 --- a/tools/llvm-mca/include/HardwareUnits/ResourceManager.h +++ b/include/llvm/MCA/HardwareUnits/ResourceManager.h @@ -13,15 +13,15 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_RESOURCE_MANAGER_H -#define LLVM_TOOLS_LLVM_MCA_RESOURCE_MANAGER_H +#ifndef LLVM_MCA_RESOURCE_MANAGER_H +#define LLVM_MCA_RESOURCE_MANAGER_H -#include "Instruction.h" -#include "Support.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCSchedule.h" +#include "llvm/MCA/Instruction.h" +#include "llvm/MCA/Support.h" namespace llvm { namespace mca { @@ -119,8 +119,6 @@ class DefaultResourceStrategy final : public ResourceStrategy { /// on the overall performance of the tool. uint64_t RemovedFromNextInSequence; - void skipMask(uint64_t Mask); - public: DefaultResourceStrategy(uint64_t UnitMask) : ResourceStrategy(), ResourceUnitMask(UnitMask), @@ -183,6 +181,8 @@ class ResourceState { /// underlying units (i.e. pipelines) until the resource is released. bool Unavailable; + const bool IsAGroup; + /// Checks for the availability of unit 'SubResMask' in the group. bool isSubResourceReady(uint64_t SubResMask) const { return ReadyMask & SubResMask; @@ -210,7 +210,7 @@ public: /// `NumUnits` available units. bool isReady(unsigned NumUnits = 1) const; - bool isAResourceGroup() const { return countPopulation(ResourceMask) > 1; } + bool isAResourceGroup() const { return IsAGroup; } bool containsResource(uint64_t ID) const { return ResourceMask & ID; } @@ -357,4 +357,4 @@ public: } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_RESOURCE_MANAGER_H +#endif // LLVM_MCA_RESOURCE_MANAGER_H diff --git a/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h b/include/llvm/MCA/HardwareUnits/RetireControlUnit.h similarity index 94% rename from tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h rename to include/llvm/MCA/HardwareUnits/RetireControlUnit.h index 12e0a1fba136ea926c1ce3f48d8e9b7fdcf7fa60..71360e984ade5d48914b375df9e628abbf58274c 100644 --- a/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h +++ b/include/llvm/MCA/HardwareUnits/RetireControlUnit.h @@ -12,12 +12,12 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_RETIRE_CONTROL_UNIT_H -#define LLVM_TOOLS_LLVM_MCA_RETIRE_CONTROL_UNIT_H +#ifndef LLVM_MCA_RETIRE_CONTROL_UNIT_H +#define LLVM_MCA_RETIRE_CONTROL_UNIT_H -#include "HardwareUnits/HardwareUnit.h" -#include "Instruction.h" #include "llvm/MC/MCSchedule.h" +#include "llvm/MCA/HardwareUnits/HardwareUnit.h" +#include "llvm/MCA/Instruction.h" #include namespace llvm { @@ -101,4 +101,4 @@ public: } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_RETIRE_CONTROL_UNIT_H +#endif // LLVM_MCA_RETIRE_CONTROL_UNIT_H diff --git a/tools/llvm-mca/include/HardwareUnits/Scheduler.h b/include/llvm/MCA/HardwareUnits/Scheduler.h similarity index 92% rename from tools/llvm-mca/include/HardwareUnits/Scheduler.h rename to include/llvm/MCA/HardwareUnits/Scheduler.h index 17332b430d2edff7f8c5d00ff4e31f75f5757242..351ea4827df9a112c3cb3e14607c97eda19146e2 100644 --- a/tools/llvm-mca/include/HardwareUnits/Scheduler.h +++ b/include/llvm/MCA/HardwareUnits/Scheduler.h @@ -12,15 +12,15 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULER_H -#define LLVM_TOOLS_LLVM_MCA_SCHEDULER_H +#ifndef LLVM_MCA_SCHEDULER_H +#define LLVM_MCA_SCHEDULER_H -#include "HardwareUnits/HardwareUnit.h" -#include "HardwareUnits/LSUnit.h" -#include "ResourceManager.h" -#include "Support.h" #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCSchedule.h" +#include "llvm/MCA/HardwareUnits/HardwareUnit.h" +#include "llvm/MCA/HardwareUnits/LSUnit.h" +#include "llvm/MCA/HardwareUnits/ResourceManager.h" +#include "llvm/MCA/Support.h" namespace llvm { namespace mca { @@ -85,7 +85,7 @@ public: /// transition (i.e. from state IS_READY, to state IS_EXECUTING). An Instruction /// leaves the IssuedSet when it reaches the write-back stage. class Scheduler : public HardwareUnit { - LSUnit *LSU; + LSUnit &LSU; // Instruction selection strategy for this Scheduler. std::unique_ptr Strategy; @@ -117,16 +117,15 @@ class Scheduler : public HardwareUnit { void promoteToReadySet(SmallVectorImpl &Ready); public: - Scheduler(const MCSchedModel &Model, LSUnit *Lsu) - : LSU(Lsu), Resources(make_unique(Model)) { - initializeStrategy(nullptr); - } - Scheduler(const MCSchedModel &Model, LSUnit *Lsu, + Scheduler(const MCSchedModel &Model, LSUnit &Lsu) + : Scheduler(Model, Lsu, nullptr) {} + + Scheduler(const MCSchedModel &Model, LSUnit &Lsu, std::unique_ptr SelectStrategy) - : LSU(Lsu), Resources(make_unique(Model)) { - initializeStrategy(std::move(SelectStrategy)); - } - Scheduler(std::unique_ptr RM, LSUnit *Lsu, + : Scheduler(make_unique(Model), Lsu, + std::move(SelectStrategy)) {} + + Scheduler(std::unique_ptr RM, LSUnit &Lsu, std::unique_ptr SelectStrategy) : LSU(Lsu), Resources(std::move(RM)) { initializeStrategy(std::move(SelectStrategy)); @@ -212,4 +211,4 @@ public: } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_SCHEDULER_H +#endif // LLVM_MCA_SCHEDULER_H diff --git a/tools/llvm-mca/include/InstrBuilder.h b/include/llvm/MCA/InstrBuilder.h similarity index 79% rename from tools/llvm-mca/include/InstrBuilder.h rename to include/llvm/MCA/InstrBuilder.h index 67aa889cf7bf5ebc0751ffc523c3703f42516760..5f998db5e4ce0c232e7b110113608cc1125193e1 100644 --- a/tools/llvm-mca/include/InstrBuilder.h +++ b/include/llvm/MCA/InstrBuilder.h @@ -12,15 +12,15 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H -#define LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H +#ifndef LLVM_MCA_INSTRBUILDER_H +#define LLVM_MCA_INSTRBUILDER_H -#include "Instruction.h" -#include "Support.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MCA/Instruction.h" +#include "llvm/MCA/Support.h" #include "llvm/Support/Error.h" namespace llvm { @@ -40,31 +40,38 @@ class InstrBuilder { const MCSubtargetInfo &STI; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; - const MCInstrAnalysis &MCIA; + const MCInstrAnalysis *MCIA; SmallVector ProcResourceMasks; DenseMap> Descriptors; DenseMap> VariantDescriptors; + bool FirstCallInst; + bool FirstReturnInst; + Expected createInstrDescImpl(const MCInst &MCI); Expected getOrCreateInstrDesc(const MCInst &MCI); InstrBuilder(const InstrBuilder &) = delete; InstrBuilder &operator=(const InstrBuilder &) = delete; - Error populateWrites(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID); - Error populateReads(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID); + void populateWrites(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID); + void populateReads(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID); Error verifyInstrDesc(const InstrDesc &ID, const MCInst &MCI) const; public: InstrBuilder(const MCSubtargetInfo &STI, const MCInstrInfo &MCII, - const MCRegisterInfo &RI, const MCInstrAnalysis &IA); + const MCRegisterInfo &RI, const MCInstrAnalysis *IA); - void clear() { VariantDescriptors.shrink_and_clear(); } + void clear() { + VariantDescriptors.shrink_and_clear(); + FirstCallInst = true; + FirstReturnInst = true; + } Expected> createInstruction(const MCInst &MCI); }; } // namespace mca } // namespace llvm -#endif +#endif // LLVM_MCA_INSTRBUILDER_H diff --git a/tools/llvm-mca/include/Instruction.h b/include/llvm/MCA/Instruction.h similarity index 94% rename from tools/llvm-mca/include/Instruction.h rename to include/llvm/MCA/Instruction.h index 7407283bca2c3933aacd83ed3db269349975b774..148651b22dac04657c598516970db241180182df 100644 --- a/tools/llvm-mca/include/Instruction.h +++ b/include/llvm/MCA/Instruction.h @@ -13,8 +13,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H -#define LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H +#ifndef LLVM_MCA_INSTRUCTION_H +#define LLVM_MCA_INSTRUCTION_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -26,8 +26,6 @@ #endif #include -#include -#include namespace llvm { namespace mca { @@ -123,8 +121,10 @@ class WriteState { // that we don't break the WAW, and the two writes can be merged together. const WriteState *DependentWrite; - // Number of writes that are in a WAW dependency with this write. - unsigned NumWriteUsers; + // A partial write that is in a false dependency with this write. + WriteState *PartialWrite; + + unsigned DependentWriteCyclesLeft; // A list of dependent reads. Users is a set of dependent // reads. A dependent read is added to the set only if CyclesLeft @@ -132,14 +132,15 @@ class WriteState { // gets notified with the actual CyclesLeft. // The 'second' element of a pair is a "ReadAdvance" number of cycles. - std::set> Users; + SmallVector, 4> Users; public: WriteState(const WriteDescriptor &Desc, unsigned RegID, bool clearsSuperRegs = false, bool writesZero = false) : WD(&Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID), PRFID(0), ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero), - IsEliminated(false), DependentWrite(nullptr), NumWriteUsers(0U) {} + IsEliminated(false), DependentWrite(nullptr), PartialWrite(nullptr), + DependentWriteCyclesLeft(0) {} WriteState(const WriteState &Other) = default; WriteState &operator=(const WriteState &Other) = default; @@ -151,8 +152,17 @@ public: unsigned getLatency() const { return WD->Latency; } void addUser(ReadState *Use, int ReadAdvance); + void addUser(WriteState *Use); + + unsigned getDependentWriteCyclesLeft() const { return DependentWriteCyclesLeft; } + + unsigned getNumUsers() const { + unsigned NumUsers = Users.size(); + if (PartialWrite) + ++NumUsers; + return NumUsers; + } - unsigned getNumUsers() const { return Users.size() + NumWriteUsers; } bool clearsSuperRegisters() const { return ClearsSuperRegs; } bool isWriteZero() const { return WritesZero; } bool isEliminated() const { return IsEliminated; } @@ -161,10 +171,12 @@ public: } const WriteState *getDependentWrite() const { return DependentWrite; } - void setDependentWrite(WriteState *Other) { - DependentWrite = Other; - ++Other->NumWriteUsers; + void setDependentWrite(WriteState *Other) { DependentWrite = Other; } + void writeStartEvent(unsigned Cycles) { + DependentWriteCyclesLeft = Cycles; + DependentWrite = nullptr; } + void setWriteZero() { WritesZero = true; } void setEliminated() { assert(Users.empty() && "Write is in an inconsistent state."); @@ -305,15 +317,16 @@ struct ResourceUsage { /// An instruction descriptor struct InstrDesc { - std::vector Writes; // Implicit writes are at the end. - std::vector Reads; // Implicit reads are at the end. + SmallVector Writes; // Implicit writes are at the end. + SmallVector Reads; // Implicit reads are at the end. // For every resource used by an instruction of this kind, this vector // reports the number of "consumed cycles". - std::vector> Resources; + SmallVector, 4> Resources; // A list of buffered resources consumed by this instruction. - std::vector Buffers; + SmallVector Buffers; + unsigned MaxLatency; // Number of MicroOps for this instruction. unsigned NumMicroOps; @@ -321,6 +334,8 @@ struct InstrDesc { bool MayLoad; bool MayStore; bool HasSideEffects; + bool BeginGroup; + bool EndGroup; // A zero latency instruction doesn't consume any scheduler resources. bool isZeroLatency() const { return !MaxLatency && Resources.empty(); } @@ -526,4 +541,4 @@ public: } // namespace mca } // namespace llvm -#endif +#endif // LLVM_MCA_INSTRUCTION_H diff --git a/tools/llvm-mca/include/Pipeline.h b/include/llvm/MCA/Pipeline.h similarity index 90% rename from tools/llvm-mca/include/Pipeline.h rename to include/llvm/MCA/Pipeline.h index 47ff07b288288bb2fc667f158e6e88b51de919df..acd256060bdd77a6f73c142eb3e38e72595de490 100644 --- a/tools/llvm-mca/include/Pipeline.h +++ b/include/llvm/MCA/Pipeline.h @@ -13,12 +13,12 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_PIPELINE_H -#define LLVM_TOOLS_LLVM_MCA_PIPELINE_H +#ifndef LLVM_MCA_PIPELINE_H +#define LLVM_MCA_PIPELINE_H -#include "HardwareUnits/Scheduler.h" -#include "Stages/Stage.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/MCA/HardwareUnits/Scheduler.h" +#include "llvm/MCA/Stages/Stage.h" #include "llvm/Support/Error.h" namespace llvm { @@ -67,10 +67,13 @@ class Pipeline { public: Pipeline() : Cycles(0) {} void appendStage(std::unique_ptr S); - Error run(); + + /// Returns the total number of simulated cycles. + Expected run(); + void addEventListener(HWEventListener *Listener); }; } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_PIPELINE_H +#endif // LLVM_MCA_PIPELINE_H diff --git a/tools/llvm-mca/include/SourceMgr.h b/include/llvm/MCA/SourceMgr.h similarity index 95% rename from tools/llvm-mca/include/SourceMgr.h rename to include/llvm/MCA/SourceMgr.h index e5180107011b19a3feab4faa9a92ca4c00f278cb..5e0ca6419f5d4057185d8fb123edd53eed6bdb06 100644 --- a/tools/llvm-mca/include/SourceMgr.h +++ b/include/llvm/MCA/SourceMgr.h @@ -13,8 +13,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H -#define LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H +#ifndef LLVM_MCA_SOURCEMGR_H +#define LLVM_MCA_SOURCEMGR_H #include "llvm/ADT/ArrayRef.h" @@ -54,4 +54,4 @@ public: } // namespace mca } // namespace llvm -#endif +#endif // LLVM_MCA_SOURCEMGR_H diff --git a/tools/llvm-mca/include/Stages/DispatchStage.h b/include/llvm/MCA/Stages/DispatchStage.h similarity index 91% rename from tools/llvm-mca/include/Stages/DispatchStage.h rename to include/llvm/MCA/Stages/DispatchStage.h index 29cace1022e08ddbb746e6b4fe16d7686ba511ce..f015cd7522eb54edf0e22e0cf412c4f352861ab2 100644 --- a/tools/llvm-mca/include/Stages/DispatchStage.h +++ b/include/llvm/MCA/Stages/DispatchStage.h @@ -16,16 +16,16 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCH_STAGE_H -#define LLVM_TOOLS_LLVM_MCA_DISPATCH_STAGE_H +#ifndef LLVM_MCA_DISPATCH_STAGE_H +#define LLVM_MCA_DISPATCH_STAGE_H -#include "HWEventListener.h" -#include "HardwareUnits/RegisterFile.h" -#include "HardwareUnits/RetireControlUnit.h" -#include "Instruction.h" -#include "Stages/Stage.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MCA/HWEventListener.h" +#include "llvm/MCA/HardwareUnits/RegisterFile.h" +#include "llvm/MCA/HardwareUnits/RetireControlUnit.h" +#include "llvm/MCA/Instruction.h" +#include "llvm/MCA/Stages/Stage.h" namespace llvm { namespace mca { @@ -90,4 +90,4 @@ public: } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_DISPATCH_STAGE_H +#endif // LLVM_MCA_DISPATCH_STAGE_H diff --git a/tools/llvm-mca/include/Stages/FetchStage.h b/include/llvm/MCA/Stages/EntryStage.h similarity index 52% rename from tools/llvm-mca/include/Stages/FetchStage.h rename to include/llvm/MCA/Stages/EntryStage.h index 55bf2011b32ba5f312d52c650ed941f69170e71a..cd9a65b8cc2bcfaa7a0161bee4dc9d71b449f5ed 100644 --- a/tools/llvm-mca/include/Stages/FetchStage.h +++ b/include/llvm/MCA/Stages/EntryStage.h @@ -1,4 +1,4 @@ -//===---------------------- FetchStage.h ------------------------*- C++ -*-===// +//===---------------------- EntryStage.h ------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -8,35 +8,36 @@ //===----------------------------------------------------------------------===// /// \file /// -/// This file defines the Fetch stage of an instruction pipeline. Its sole -/// purpose in life is to produce instructions for the rest of the pipeline. +/// This file defines the Entry stage of an instruction pipeline. Its sole +/// purpose in life is to pick instructions in sequence and move them to the +/// next pipeline stage. /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H -#define LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H +#ifndef LLVM_MCA_ENTRY_STAGE_H +#define LLVM_MCA_ENTRY_STAGE_H -#include "SourceMgr.h" -#include "Stages/Stage.h" -#include +#include "llvm/ADT/SmallVector.h" +#include "llvm/MCA/SourceMgr.h" +#include "llvm/MCA/Stages/Stage.h" namespace llvm { namespace mca { -class FetchStage final : public Stage { +class EntryStage final : public Stage { InstRef CurrentInstruction; - using InstMap = std::map>; - InstMap Instructions; + SmallVector, 16> Instructions; SourceMgr &SM; + unsigned NumRetired; // Updates the program counter, and sets 'CurrentInstruction'. void getNextInstruction(); - FetchStage(const FetchStage &Other) = delete; - FetchStage &operator=(const FetchStage &Other) = delete; + EntryStage(const EntryStage &Other) = delete; + EntryStage &operator=(const EntryStage &Other) = delete; public: - FetchStage(SourceMgr &SM) : CurrentInstruction(), SM(SM) {} + EntryStage(SourceMgr &SM) : CurrentInstruction(), SM(SM), NumRetired(0) { } bool isAvailable(const InstRef &IR) const override; bool hasWorkToComplete() const override; @@ -48,4 +49,4 @@ public: } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H +#endif // LLVM_MCA_FETCH_STAGE_H diff --git a/tools/llvm-mca/include/Stages/ExecuteStage.h b/include/llvm/MCA/Stages/ExecuteStage.h similarity index 92% rename from tools/llvm-mca/include/Stages/ExecuteStage.h rename to include/llvm/MCA/Stages/ExecuteStage.h index 91b24059c950df6dfdc762c063ce79a2b474922c..4f40c5c65d6bc951acf7b47edebeacb2d301dfa4 100644 --- a/tools/llvm-mca/include/Stages/ExecuteStage.h +++ b/include/llvm/MCA/Stages/ExecuteStage.h @@ -15,13 +15,13 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_EXECUTE_STAGE_H -#define LLVM_TOOLS_LLVM_MCA_EXECUTE_STAGE_H +#ifndef LLVM_MCA_EXECUTE_STAGE_H +#define LLVM_MCA_EXECUTE_STAGE_H -#include "HardwareUnits/Scheduler.h" -#include "Instruction.h" -#include "Stages/Stage.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/MCA/HardwareUnits/Scheduler.h" +#include "llvm/MCA/Instruction.h" +#include "llvm/MCA/Stages/Stage.h" namespace llvm { namespace mca { @@ -77,4 +77,4 @@ public: } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_EXECUTE_STAGE_H +#endif // LLVM_MCA_EXECUTE_STAGE_H diff --git a/tools/llvm-mca/include/Stages/InstructionTables.h b/include/llvm/MCA/Stages/InstructionTables.h similarity index 83% rename from tools/llvm-mca/include/Stages/InstructionTables.h rename to include/llvm/MCA/Stages/InstructionTables.h index e618d06b1b74740a23bb7eb48650390c03d6bc0c..50f84fc6fff94c9c455e52a26c6dbe7716e3ab70 100644 --- a/tools/llvm-mca/include/Stages/InstructionTables.h +++ b/include/llvm/MCA/Stages/InstructionTables.h @@ -14,14 +14,14 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTIONTABLES_H -#define LLVM_TOOLS_LLVM_MCA_INSTRUCTIONTABLES_H +#ifndef LLVM_MCA_INSTRUCTIONTABLES_H +#define LLVM_MCA_INSTRUCTIONTABLES_H -#include "HardwareUnits/Scheduler.h" -#include "Stages/Stage.h" -#include "Support.h" #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCSchedule.h" +#include "llvm/MCA/HardwareUnits/Scheduler.h" +#include "llvm/MCA/Stages/Stage.h" +#include "llvm/MCA/Support.h" namespace llvm { namespace mca { @@ -42,4 +42,4 @@ public: } // namespace mca } // namespace llvm -#endif +#endif // LLVM_MCA_INSTRUCTIONTABLES_H diff --git a/tools/llvm-mca/include/Stages/RetireStage.h b/include/llvm/MCA/Stages/RetireStage.h similarity index 84% rename from tools/llvm-mca/include/Stages/RetireStage.h rename to include/llvm/MCA/Stages/RetireStage.h index 28eda40984f36caf3bb19415a7c3545b565af8bf..2051ce5c86adf990ee87f4f3b1acffbf807c9a46 100644 --- a/tools/llvm-mca/include/Stages/RetireStage.h +++ b/include/llvm/MCA/Stages/RetireStage.h @@ -14,12 +14,12 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_RETIRE_STAGE_H -#define LLVM_TOOLS_LLVM_MCA_RETIRE_STAGE_H +#ifndef LLVM_MCA_RETIRE_STAGE_H +#define LLVM_MCA_RETIRE_STAGE_H -#include "HardwareUnits/RegisterFile.h" -#include "HardwareUnits/RetireControlUnit.h" -#include "Stages/Stage.h" +#include "llvm/MCA/HardwareUnits/RegisterFile.h" +#include "llvm/MCA/HardwareUnits/RetireControlUnit.h" +#include "llvm/MCA/Stages/Stage.h" namespace llvm { namespace mca { @@ -45,4 +45,4 @@ public: } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_RETIRE_STAGE_H +#endif // LLVM_MCA_RETIRE_STAGE_H diff --git a/tools/llvm-mca/include/Stages/Stage.h b/include/llvm/MCA/Stages/Stage.h similarity index 94% rename from tools/llvm-mca/include/Stages/Stage.h rename to include/llvm/MCA/Stages/Stage.h index 5665fc453bfcae42279b63ad8401a3cc88633614..fc7ab569bb0f73a30566c75309e8263e126df4c8 100644 --- a/tools/llvm-mca/include/Stages/Stage.h +++ b/include/llvm/MCA/Stages/Stage.h @@ -13,10 +13,10 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_STAGE_H -#define LLVM_TOOLS_LLVM_MCA_STAGE_H +#ifndef LLVM_MCA_STAGE_H +#define LLVM_MCA_STAGE_H -#include "HWEventListener.h" +#include "llvm/MCA/HWEventListener.h" #include "llvm/Support/Error.h" #include @@ -85,4 +85,4 @@ public: } // namespace mca } // namespace llvm -#endif // LLVM_TOOLS_LLVM_MCA_STAGE_H +#endif // LLVM_MCA_STAGE_H diff --git a/tools/llvm-mca/include/Support.h b/include/llvm/MCA/Support.h similarity index 98% rename from tools/llvm-mca/include/Support.h rename to include/llvm/MCA/Support.h index e7a4e33ed74ecdfc8ae4e5381e693b90d473c25c..5c5a35454330b70ee32fbf194d3956f9f127e4a1 100644 --- a/tools/llvm-mca/include/Support.h +++ b/include/llvm/MCA/Support.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_MCA_SUPPORT_H -#define LLVM_TOOLS_LLVM_MCA_SUPPORT_H +#ifndef LLVM_MCA_SUPPORT_H +#define LLVM_MCA_SUPPORT_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" @@ -116,4 +116,4 @@ double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth, } // namespace mca } // namespace llvm -#endif +#endif // LLVM_MCA_SUPPORT_H diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h index e39a9b2e31c686c3f59c06030a020c7cd32ad355..5fe475ee8f46efbd84c65ff527dc3f111bb5793d 100644 --- a/include/llvm/Object/COFF.h +++ b/include/llvm/Object/COFF.h @@ -971,6 +971,9 @@ public: return nullptr; return reinterpret_cast(base()); } + std::error_code getCOFFHeader(const coff_file_header *&Res) const; + std::error_code + getCOFFBigObjHeader(const coff_bigobj_file_header *&Res) const; std::error_code getPE32Header(const pe32_header *&Res) const; std::error_code getPE32PlusHeader(const pe32plus_header *&Res) const; std::error_code getDataDirectory(uint32_t index, @@ -1062,6 +1065,8 @@ public: bool isRelocatableObject() const override; bool is64() const { return PE32PlusHeader; } + StringRef mapDebugSectionName(StringRef Name) const override; + static bool classof(const Binary *v) { return v->isCOFF(); } }; diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h index 752d468fd25b8e1b5ffe8692253eb015a9bbc061..bcdc190cc7dccceab916ec1e5f81745638b92ca3 100644 --- a/include/llvm/Object/ELF.h +++ b/include/llvm/Object/ELF.h @@ -32,7 +32,7 @@ namespace llvm { namespace object { StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type); -uint32_t getELFRelrRelocationType(uint32_t Machine); +uint32_t getELFRelativeRelocationType(uint32_t Machine); StringRef getELFSectionTypeName(uint32_t Machine, uint32_t Type); // Subclasses of ELFFile may need this for template instantiation @@ -113,7 +113,7 @@ public: StringRef getRelocationTypeName(uint32_t Type) const; void getRelocationTypeName(uint32_t Type, SmallVectorImpl &Result) const; - uint32_t getRelrRelocationType() const; + uint32_t getRelativeRelocationType() const; const char *getDynamicTagAsString(unsigned Arch, uint64_t Type) const; const char *getDynamicTagAsString(uint64_t Type) const; @@ -415,8 +415,8 @@ void ELFFile::getRelocationTypeName(uint32_t Type, } template -uint32_t ELFFile::getRelrRelocationType() const { - return getELFRelrRelocationType(getHeader()->e_machine); +uint32_t ELFFile::getRelativeRelocationType() const { + return getELFRelativeRelocationType(getHeader()->e_machine); } template diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h index dff086078391a2cada4166de03648a58c667c78e..0f620681cd99a421d34b6f496aa0fd84d07c7a98 100644 --- a/include/llvm/Object/ELFObjectFile.h +++ b/include/llvm/Object/ELFObjectFile.h @@ -260,6 +260,8 @@ protected: bool isSectionData(DataRefImpl Sec) const override; bool isSectionBSS(DataRefImpl Sec) const override; bool isSectionVirtual(DataRefImpl Sec) const override; + bool isBerkeleyText(DataRefImpl Sec) const override; + bool isBerkeleyData(DataRefImpl Sec) const override; relocation_iterator section_rel_begin(DataRefImpl Sec) const override; relocation_iterator section_rel_end(DataRefImpl Sec) const override; std::vector dynamic_relocation_sections() const override; @@ -333,9 +335,10 @@ protected: // A symbol is exported if its binding is either GLOBAL or WEAK, and its // visibility is either DEFAULT or PROTECTED. All other symbols are not // exported. - return ((Binding == ELF::STB_GLOBAL || Binding == ELF::STB_WEAK) && - (Visibility == ELF::STV_DEFAULT || - Visibility == ELF::STV_PROTECTED)); + return ( + (Binding == ELF::STB_GLOBAL || Binding == ELF::STB_WEAK || + Binding == ELF::STB_GNU_UNIQUE) && + (Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_PROTECTED)); } // This flag is used for classof, to distinguish ELFObjectFile from @@ -758,6 +761,20 @@ bool ELFObjectFile::isSectionVirtual(DataRefImpl Sec) const { return getSection(Sec)->sh_type == ELF::SHT_NOBITS; } +template +bool ELFObjectFile::isBerkeleyText(DataRefImpl Sec) const { + return getSection(Sec)->sh_flags & ELF::SHF_ALLOC && + (getSection(Sec)->sh_flags & ELF::SHF_EXECINSTR || + !(getSection(Sec)->sh_flags & ELF::SHF_WRITE)); +} + +template +bool ELFObjectFile::isBerkeleyData(DataRefImpl Sec) const { + const Elf_Shdr *EShdr = getSection(Sec); + return !isBerkeleyText(Sec) && EShdr->sh_type != ELF::SHT_NOBITS && + EShdr->sh_flags & ELF::SHF_ALLOC; +} + template relocation_iterator ELFObjectFile::section_rel_begin(DataRefImpl Sec) const { diff --git a/include/llvm/Object/ELFTypes.h b/include/llvm/Object/ELFTypes.h index fb386120e34d5968c688dd9a63b5ddca0d8c4c43..ec3c8e7bae465ba4b788ad2b350ac60679ca5aa1 100644 --- a/include/llvm/Object/ELFTypes.h +++ b/include/llvm/Object/ELFTypes.h @@ -605,13 +605,12 @@ public: } /// Get the note's descriptor. - ArrayRef getDesc() const { + ArrayRef getDesc() const { if (!Nhdr.n_descsz) - return ArrayRef(); - return ArrayRef( - reinterpret_cast( - reinterpret_cast(&Nhdr) + sizeof(Nhdr) + - alignTo::Align>(Nhdr.n_namesz)), + return ArrayRef(); + return ArrayRef( + reinterpret_cast(&Nhdr) + sizeof(Nhdr) + + alignTo::Align>(Nhdr.n_namesz), Nhdr.n_descsz); } @@ -643,14 +642,19 @@ class Elf_Note_Iterator_Impl // container, either cleanly or with an overflow error. void advanceNhdr(const uint8_t *NhdrPos, size_t NoteSize) { RemainingSize -= NoteSize; - if (RemainingSize == 0u) + if (RemainingSize == 0u) { + // Ensure that if the iterator walks to the end, the error is checked + // afterwards. + *Err = Error::success(); Nhdr = nullptr; - else if (sizeof(*Nhdr) > RemainingSize) + } else if (sizeof(*Nhdr) > RemainingSize) stopWithOverflowError(); else { Nhdr = reinterpret_cast *>(NhdrPos + NoteSize); if (Nhdr->getSize() > RemainingSize) stopWithOverflowError(); + else + *Err = Error::success(); } } @@ -658,6 +662,7 @@ class Elf_Note_Iterator_Impl explicit Elf_Note_Iterator_Impl(Error &Err) : Err(&Err) {} Elf_Note_Iterator_Impl(const uint8_t *Start, size_t Size, Error &Err) : RemainingSize(Size), Err(&Err) { + consumeError(std::move(Err)); assert(Start && "ELF note iterator starting at NULL"); advanceNhdr(Start, 0u); } @@ -671,6 +676,10 @@ public: return *this; } bool operator==(Elf_Note_Iterator_Impl Other) const { + if (!Nhdr && Other.Err) + (void)(bool)(*Other.Err); + if (!Other.Nhdr && Err) + (void)(bool)(*Err); return Nhdr == Other.Nhdr; } bool operator!=(Elf_Note_Iterator_Impl Other) const { diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h index 02d62e8e48798085668d34f9144b64f8ba8fccb2..036c99cb6baff0d5f967302b40f965dff391c38c 100644 --- a/include/llvm/Object/ObjectFile.h +++ b/include/llvm/Object/ObjectFile.h @@ -104,13 +104,25 @@ public: uint64_t getAlignment() const; bool isCompressed() const; + /// Whether this section contains instructions. bool isText() const; + /// Whether this section contains data, not instructions. bool isData() const; + /// Whether this section contains BSS uninitialized data. bool isBSS() const; bool isVirtual() const; bool isBitcode() const; bool isStripped() const; + /// Whether this section will be placed in the text segment, according to the + /// Berkeley size format. This is true if the section is allocatable, and + /// contains either code or readonly data. + bool isBerkeleyText() const; + /// Whether this section will be placed in the data segment, according to the + /// Berkeley size format. This is true if the section is allocatable and + /// contains data (e.g. PROGBITS), but is not text. + bool isBerkeleyData() const; + bool containsSymbol(SymbolRef S) const; relocation_iterator relocation_begin() const; @@ -238,6 +250,8 @@ protected: virtual bool isSectionVirtual(DataRefImpl Sec) const = 0; virtual bool isSectionBitcode(DataRefImpl Sec) const; virtual bool isSectionStripped(DataRefImpl Sec) const; + virtual bool isBerkeleyText(DataRefImpl Sec) const; + virtual bool isBerkeleyData(DataRefImpl Sec) const; virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const = 0; virtual relocation_iterator section_rel_end(DataRefImpl Sec) const = 0; virtual section_iterator getRelocatedSection(DataRefImpl Sec) const; @@ -449,6 +463,14 @@ inline bool SectionRef::isStripped() const { return OwningObject->isSectionStripped(SectionPimpl); } +inline bool SectionRef::isBerkeleyText() const { + return OwningObject->isBerkeleyText(SectionPimpl); +} + +inline bool SectionRef::isBerkeleyData() const { + return OwningObject->isBerkeleyData(SectionPimpl); +} + inline relocation_iterator SectionRef::relocation_begin() const { return OwningObject->section_rel_begin(SectionPimpl); } diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h index 008e109f6679e58c4e49c491b9f17652183f86e0..9a978de2e59968e354ce01e3b8f6828beb0afcc3 100644 --- a/include/llvm/Object/RelocVisitor.h +++ b/include/llvm/Object/RelocVisitor.h @@ -129,6 +129,8 @@ private: case ELF::R_X86_64_NONE: return 0; case ELF::R_X86_64_64: + case ELF::R_X86_64_DTPOFF32: + case ELF::R_X86_64_DTPOFF64: return Value + getELFAddend(R); case ELF::R_X86_64_PC32: return Value + getELFAddend(R) - R.getOffset(); @@ -333,6 +335,7 @@ private: case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB: case wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32: case wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32: + case wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB: // For wasm section, its offset at 0 -- ignoring Value return 0; } diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h index 25903fc994543803b86b3e697784b285891db1f3..ed857652a0486149658e157987586f7693e4e2b0 100644 --- a/include/llvm/Object/Wasm.h +++ b/include/llvm/Object/Wasm.h @@ -37,13 +37,16 @@ namespace object { class WasmSymbol { public: WasmSymbol(const wasm::WasmSymbolInfo &Info, - const wasm::WasmSignature *FunctionType, - const wasm::WasmGlobalType *GlobalType) - : Info(Info), FunctionType(FunctionType), GlobalType(GlobalType) {} + const wasm::WasmGlobalType *GlobalType, + const wasm::WasmEventType *EventType, + const wasm::WasmSignature *Signature) + : Info(Info), GlobalType(GlobalType), EventType(EventType), + Signature(Signature) {} const wasm::WasmSymbolInfo &Info; - const wasm::WasmSignature *FunctionType; const wasm::WasmGlobalType *GlobalType; + const wasm::WasmEventType *EventType; + const wasm::WasmSignature *Signature; bool isTypeFunction() const { return Info.Kind == wasm::WASM_SYMBOL_TYPE_FUNCTION; @@ -59,6 +62,8 @@ public: return Info.Kind == wasm::WASM_SYMBOL_TYPE_SECTION; } + bool isTypeEvent() const { return Info.Kind == wasm::WASM_SYMBOL_TYPE_EVENT; } + bool isDefined() const { return !isUndefined(); } bool isUndefined() const { @@ -124,12 +129,14 @@ public: static bool classof(const Binary *v) { return v->isWasm(); } + const wasm::WasmDylinkInfo &dylinkInfo() const { return DylinkInfo; } ArrayRef types() const { return Signatures; } ArrayRef functionTypes() const { return FunctionTypes; } ArrayRef imports() const { return Imports; } ArrayRef tables() const { return Tables; } ArrayRef memories() const { return Memories; } ArrayRef globals() const { return Globals; } + ArrayRef events() const { return Events; } ArrayRef exports() const { return Exports; } ArrayRef syms() const { return Symbols; } const wasm::WasmLinkingData &linkingData() const { return LinkingData; } @@ -141,6 +148,7 @@ public: uint32_t startFunction() const { return StartFunction; } uint32_t getNumImportedGlobals() const { return NumImportedGlobals; } uint32_t getNumImportedFunctions() const { return NumImportedFunctions; } + uint32_t getNumImportedEvents() const { return NumImportedEvents; } void moveSymbolNext(DataRefImpl &Symb) const override; @@ -193,6 +201,7 @@ public: Triple::ArchType getArch() const override; SubtargetFeatures getFeatures() const override; bool isRelocatableObject() const override; + bool isSharedObject() const; struct ReadContext { const uint8_t *Start; @@ -205,12 +214,16 @@ private: bool isDefinedFunctionIndex(uint32_t Index) const; bool isValidGlobalIndex(uint32_t Index) const; bool isDefinedGlobalIndex(uint32_t Index) const; + bool isValidEventIndex(uint32_t Index) const; + bool isDefinedEventIndex(uint32_t Index) const; bool isValidFunctionSymbol(uint32_t Index) const; bool isValidGlobalSymbol(uint32_t Index) const; + bool isValidEventSymbol(uint32_t Index) const; bool isValidDataSymbol(uint32_t Index) const; bool isValidSectionSymbol(uint32_t Index) const; wasm::WasmFunction &getDefinedFunction(uint32_t Index); wasm::WasmGlobal &getDefinedGlobal(uint32_t Index); + wasm::WasmEvent &getDefinedEvent(uint32_t Index); const WasmSection &getWasmSection(DataRefImpl Ref) const; const wasm::WasmRelocation &getWasmRelocation(DataRefImpl Ref) const; @@ -226,6 +239,7 @@ private: Error parseTableSection(ReadContext &Ctx); Error parseMemorySection(ReadContext &Ctx); Error parseGlobalSection(ReadContext &Ctx); + Error parseEventSection(ReadContext &Ctx); Error parseExportSection(ReadContext &Ctx); Error parseStartSection(ReadContext &Ctx); Error parseElemSection(ReadContext &Ctx); @@ -233,6 +247,7 @@ private: Error parseDataSection(ReadContext &Ctx); // Custom section types + Error parseDylinkSection(ReadContext &Ctx); Error parseNameSection(ReadContext &Ctx); Error parseLinkingSection(ReadContext &Ctx); Error parseLinkingSectionSymtab(ReadContext &Ctx); @@ -241,11 +256,13 @@ private: wasm::WasmObjectHeader Header; std::vector Sections; + wasm::WasmDylinkInfo DylinkInfo; std::vector Signatures; std::vector FunctionTypes; std::vector Tables; std::vector Memories; std::vector Globals; + std::vector Events; std::vector Imports; std::vector Exports; std::vector ElemSegments; @@ -255,12 +272,58 @@ private: std::vector DebugNames; uint32_t StartFunction = -1; bool HasLinkingSection = false; + bool HasDylinkSection = false; wasm::WasmLinkingData LinkingData; uint32_t NumImportedGlobals = 0; uint32_t NumImportedFunctions = 0; + uint32_t NumImportedEvents = 0; uint32_t CodeSection = 0; uint32_t DataSection = 0; uint32_t GlobalSection = 0; + uint32_t EventSection = 0; +}; + +class WasmSectionOrderChecker { +public: + // We define orders for all core wasm sections and known custom sections. + enum : int { + // Core sections + // The order of standard sections is precisely given by the spec. + WASM_SEC_ORDER_TYPE = 1, + WASM_SEC_ORDER_IMPORT = 2, + WASM_SEC_ORDER_FUNCTION = 3, + WASM_SEC_ORDER_TABLE = 4, + WASM_SEC_ORDER_MEMORY = 5, + WASM_SEC_ORDER_GLOBAL = 6, + WASM_SEC_ORDER_EVENT = 7, + WASM_SEC_ORDER_EXPORT = 8, + WASM_SEC_ORDER_START = 9, + WASM_SEC_ORDER_ELEM = 10, + WASM_SEC_ORDER_DATACOUNT = 11, + WASM_SEC_ORDER_CODE = 12, + WASM_SEC_ORDER_DATA = 13, + + // Custom sections + // "dylink" should be the very first section in the module + WASM_SEC_ORDER_DYLINK = 0, + // "linking" section requires DATA section in order to validate data symbols + WASM_SEC_ORDER_LINKING = 100, + // Must come after "linking" section in order to validate reloc indexes. + WASM_SEC_ORDER_RELOC = 101, + // "name" section must appear after DATA. Comes after "linking" to allow + // symbol table to set default function name. + WASM_SEC_ORDER_NAME = 102, + // "producers" section must appear after "name" section. + WASM_SEC_ORDER_PRODUCERS = 103 + }; + + bool isValidSectionOrder(unsigned ID, StringRef CustomSectionName = ""); + +private: + int LastOrder = -1; // Lastly seen known section's order + + // Returns -1 for unknown sections. + int getSectionOrder(unsigned ID, StringRef CustomSectionName = ""); }; } // end namespace object diff --git a/include/llvm/ObjectYAML/WasmYAML.h b/include/llvm/ObjectYAML/WasmYAML.h index 8cd08e520560b50a6714e3a86b436331f84b5861..406dd7cb515f745fff27a26ef9d16d777c55ec86 100644 --- a/include/llvm/ObjectYAML/WasmYAML.h +++ b/include/llvm/ObjectYAML/WasmYAML.h @@ -74,6 +74,12 @@ struct Global { wasm::WasmInitExpr InitExpr; }; +struct Event { + uint32_t Index; + uint32_t Attribute; + uint32_t SigIndex; +}; + struct Import { StringRef Module; StringRef Field; @@ -83,6 +89,7 @@ struct Import { Global GlobalImport; Table TableImport; Limits Memory; + Event EventImport; }; }; @@ -176,6 +183,21 @@ struct CustomSection : Section { yaml::BinaryRef Payload; }; +struct DylinkSection : CustomSection { + DylinkSection() : CustomSection("dylink") {} + + static bool classof(const Section *S) { + auto C = dyn_cast(S); + return C && C->Name == "dylink"; + } + + uint32_t MemorySize; + uint32_t MemoryAlignment; + uint32_t TableSize; + uint32_t TableAlignment; + std::vector Needed; +}; + struct NameSection : CustomSection { NameSection() : CustomSection("name") {} @@ -262,6 +284,16 @@ struct GlobalSection : Section { std::vector Globals; }; +struct EventSection : Section { + EventSection() : Section(wasm::WASM_SEC_EVENT) {} + + static bool classof(const Section *S) { + return S->Type == wasm::WASM_SEC_EVENT; + } + + std::vector Events; +}; + struct ExportSection : Section { ExportSection() : Section(wasm::WASM_SEC_EXPORT) {} @@ -339,6 +371,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::SymbolInfo) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::InitFunction) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::ComdatEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Comdat) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Event) namespace llvm { namespace yaml { @@ -471,6 +504,10 @@ template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, WasmYAML::RelocType &Kind); }; +template <> struct MappingTraits { + static void mapping(IO &IO, WasmYAML::Event &Event); +}; + } // end namespace yaml } // end namespace llvm diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h index 22e5eb0caa0fa9ff5608a6b1439b17bbac0fd374..fa59345a02cf72126f7251f17baa51ddf0834389 100644 --- a/include/llvm/Passes/PassBuilder.h +++ b/include/llvm/Passes/PassBuilder.h @@ -501,6 +501,18 @@ public: PipelineStartEPCallbacks.push_back(C); } + /// Register a callback for a default optimizer pipeline extension point + /// + /// This extension point allows adding optimizations at the very end of the + /// function optimization pipeline. A key difference between this and the + /// legacy PassManager's OptimizerLast callback is that this extension point + /// is not triggered at O0. Extensions to the O0 pipeline should append their + /// passes to the end of the overall pipeline. + void registerOptimizerLastEPCallback( + const std::function &C) { + OptimizerLastEPCallbacks.push_back(C); + } + /// Register a callback for parsing an AliasAnalysis Name to populate /// the given AAManager \p AA void registerParseAACallback( @@ -614,6 +626,8 @@ private: CGSCCOptimizerLateEPCallbacks; SmallVector, 2> VectorizerStartEPCallbacks; + SmallVector, 2> + OptimizerLastEPCallbacks; // Module callbacks SmallVector, 2> PipelineStartEPCallbacks; diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h index 3c477cc347147ec6a62090604c2a9e37098e769f..5cc729e42cc88f218dd37496ea5f9f4cda1e7401 100644 --- a/include/llvm/ProfileData/SampleProfReader.h +++ b/include/llvm/ProfileData/SampleProfReader.h @@ -548,6 +548,9 @@ public: : SampleProfileReader(std::move(B), C, Underlying->getFormat()) { Profiles = std::move(Underlying->getProfiles()); Summary = takeSummary(*Underlying); + // Keep the underlying reader alive; the profile data may contain + // StringRefs referencing names in its name table. + UnderlyingReader = std::move(Underlying); } /// Create a remapped sample profile from the given remapping file and @@ -569,6 +572,7 @@ public: private: SymbolRemappingReader Remappings; DenseMap SampleMap; + std::unique_ptr UnderlyingReader; }; } // end namespace sampleprof diff --git a/include/llvm/Support/AArch64TargetParser.def b/include/llvm/Support/AArch64TargetParser.def index 65bfee81bf45526989f6a47cd4d3dab68e3f08c4..6a7d16adefc6ad81a77228b831ccd7db9023abf4 100644 --- a/include/llvm/Support/AArch64TargetParser.def +++ b/include/llvm/Support/AArch64TargetParser.def @@ -72,6 +72,7 @@ AARCH64_ARCH_EXT_NAME("sve", AArch64::AEK_SVE, "+sve", "-sve") AARCH64_ARCH_EXT_NAME("rcpc", AArch64::AEK_RCPC, "+rcpc", "-rcpc") AARCH64_ARCH_EXT_NAME("rng", AArch64::AEK_RAND, "+rand", "-rand") AARCH64_ARCH_EXT_NAME("memtag", AArch64::AEK_MTE, "+mte", "-mte") +AARCH64_ARCH_EXT_NAME("ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs") #undef AARCH64_ARCH_EXT_NAME #ifndef AARCH64_CPU_NAME @@ -117,6 +118,9 @@ AARCH64_CPU_NAME("thunderxt81", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_CRC | AArch64::AEK_PROFILE)) AARCH64_CPU_NAME("thunderxt83", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_CRC | AArch64::AEK_PROFILE)) +AARCH64_CPU_NAME("tsv110", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, + (AArch64::AEK_PROFILE | AArch64::AEK_FP16 | AArch64::AEK_FP16FML | + AArch64::AEK_DOTPROD)) // Invalid CPU AARCH64_CPU_NAME("invalid", INVALID, FK_INVALID, true, AArch64::AEK_INVALID) #undef AARCH64_CPU_NAME diff --git a/include/llvm/Support/AArch64TargetParser.h b/include/llvm/Support/AArch64TargetParser.h new file mode 100644 index 0000000000000000000000000000000000000000..aea406217c46a09aa15f5bd58af5f25b62ef393e --- /dev/null +++ b/include/llvm/Support/AArch64TargetParser.h @@ -0,0 +1,122 @@ +//===-- AArch64TargetParser - Parser for AArch64 features -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a target parser to recognise AArch64 hardware features +// such as FPU/CPU/ARCH and extension names. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_AARCH64TARGETPARSERCOMMON_H +#define LLVM_SUPPORT_AARCH64TARGETPARSERCOMMON_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Support/ARMTargetParser.h" +#include + +// FIXME:This should be made into class design,to avoid dupplication. +namespace llvm { +namespace AArch64 { + +// Arch extension modifiers for CPUs. +enum ArchExtKind : unsigned { + AEK_INVALID = 0, + AEK_NONE = 1, + AEK_CRC = 1 << 1, + AEK_CRYPTO = 1 << 2, + AEK_FP = 1 << 3, + AEK_SIMD = 1 << 4, + AEK_FP16 = 1 << 5, + AEK_PROFILE = 1 << 6, + AEK_RAS = 1 << 7, + AEK_LSE = 1 << 8, + AEK_SVE = 1 << 9, + AEK_DOTPROD = 1 << 10, + AEK_RCPC = 1 << 11, + AEK_RDM = 1 << 12, + AEK_SM4 = 1 << 13, + AEK_SHA3 = 1 << 14, + AEK_SHA2 = 1 << 15, + AEK_AES = 1 << 16, + AEK_FP16FML = 1 << 17, + AEK_RAND = 1 << 18, + AEK_MTE = 1 << 19, + AEK_SSBS = 1 << 20, +}; + +enum class ArchKind { +#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID, +#include "AArch64TargetParser.def" +}; + +const ARM::ArchNames AArch64ARCHNames[] = { +#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, \ + ARCH_BASE_EXT) \ + {NAME, \ + sizeof(NAME) - 1, \ + CPU_ATTR, \ + sizeof(CPU_ATTR) - 1, \ + SUB_ARCH, \ + sizeof(SUB_ARCH) - 1, \ + ARM::FPUKind::ARCH_FPU, \ + ARCH_BASE_EXT, \ + AArch64::ArchKind::ID, \ + ARCH_ATTR}, +#include "AArch64TargetParser.def" +}; + +const ARM::ExtName AArch64ARCHExtNames[] = { +#define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \ + {NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE}, +#include "AArch64TargetParser.def" +}; + +const ARM::CpuNames AArch64CPUNames[] = { +#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + {NAME, sizeof(NAME) - 1, AArch64::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT}, +#include "AArch64TargetParser.def" +}; + +const ArchKind ArchKinds[] = { +#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) \ + ArchKind::ID, +#include "AArch64TargetParser.def" +}; + +// FIXME: These should be moved to TargetTuple once it exists +bool getExtensionFeatures(unsigned Extensions, + std::vector &Features); +bool getArchFeatures(ArchKind AK, std::vector &Features); + +StringRef getArchName(ArchKind AK); +unsigned getArchAttr(ArchKind AK); +StringRef getCPUAttr(ArchKind AK); +StringRef getSubArch(ArchKind AK); +StringRef getArchExtName(unsigned ArchExtKind); +StringRef getArchExtFeature(StringRef ArchExt); + +// Information by Name +unsigned getDefaultFPU(StringRef CPU, ArchKind AK); +unsigned getDefaultExtensions(StringRef CPU, ArchKind AK); +StringRef getDefaultCPU(StringRef Arch); +ArchKind getCPUArchKind(StringRef CPU); + +// Parser +ArchKind parseArch(StringRef Arch); +ArchExtKind parseArchExt(StringRef ArchExt); +ArchKind parseCPUArch(StringRef CPU); +// Used by target parser tests +void fillValidCPUArchList(SmallVectorImpl &Values); + +bool isX18ReservedByDefault(const Triple &TT); + +} // namespace AArch64 +} // namespace llvm + +#endif diff --git a/include/llvm/Support/AMDGPUMetadata.h b/include/llvm/Support/AMDGPUMetadata.h index 667fb3f3da436c146eac3453293cc601060581c2..84851c07499d43f4078f0fa6e403980371f7ce0f 100644 --- a/include/llvm/Support/AMDGPUMetadata.h +++ b/include/llvm/Support/AMDGPUMetadata.h @@ -431,6 +431,21 @@ std::error_code fromString(std::string String, Metadata &HSAMetadata); /// Converts \p HSAMetadata to \p String. std::error_code toString(Metadata HSAMetadata, std::string &String); +//===----------------------------------------------------------------------===// +// HSA metadata for v3 code object. +//===----------------------------------------------------------------------===// +namespace V3 { +/// HSA metadata major version. +constexpr uint32_t VersionMajor = 1; +/// HSA metadata minor version. +constexpr uint32_t VersionMinor = 0; + +/// HSA metadata beginning assembler directive. +constexpr char AssemblerDirectiveBegin[] = ".amdgpu_metadata"; +/// HSA metadata ending assembler directive. +constexpr char AssemblerDirectiveEnd[] = ".end_amdgpu_metadata"; +} // end namespace V3 + } // end namespace HSAMD //===----------------------------------------------------------------------===// diff --git a/include/llvm/Support/ARMTargetParser.h b/include/llvm/Support/ARMTargetParser.h new file mode 100644 index 0000000000000000000000000000000000000000..e41675a1f6087194d3e071bf0a44ab0d9d1b11f5 --- /dev/null +++ b/include/llvm/Support/ARMTargetParser.h @@ -0,0 +1,263 @@ +//===-- ARMTargetParser - Parser for ARM target features --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a target parser to recognise ARM hardware features +// such as FPU/CPU/ARCH/extensions and specific support such as HWDIV. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_ARMTARGETPARSER_H +#define LLVM_SUPPORT_ARMTARGETPARSER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Support/ARMBuildAttributes.h" +#include + +namespace llvm { +namespace ARM { + +// Arch extension modifiers for CPUs. +// Note that this is not the same as the AArch64 list +enum ArchExtKind : unsigned { + AEK_INVALID = 0, + AEK_NONE = 1, + AEK_CRC = 1 << 1, + AEK_CRYPTO = 1 << 2, + AEK_FP = 1 << 3, + AEK_HWDIVTHUMB = 1 << 4, + AEK_HWDIVARM = 1 << 5, + AEK_MP = 1 << 6, + AEK_SIMD = 1 << 7, + AEK_SEC = 1 << 8, + AEK_VIRT = 1 << 9, + AEK_DSP = 1 << 10, + AEK_FP16 = 1 << 11, + AEK_RAS = 1 << 12, + AEK_SVE = 1 << 13, + AEK_DOTPROD = 1 << 14, + AEK_SHA2 = 1 << 15, + AEK_AES = 1 << 16, + AEK_FP16FML = 1 << 17, + // Unsupported extensions. + AEK_OS = 0x8000000, + AEK_IWMMXT = 0x10000000, + AEK_IWMMXT2 = 0x20000000, + AEK_MAVERICK = 0x40000000, + AEK_XSCALE = 0x80000000, +}; + +// List of Arch Extension names. +// FIXME: TableGen this. +struct ExtName { + const char *NameCStr; + size_t NameLength; + unsigned ID; + const char *Feature; + const char *NegFeature; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } +}; + +const ExtName ARCHExtNames[] = { +#define ARM_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \ + {NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE}, +#include "ARMTargetParser.def" +}; + +// List of HWDiv names (use getHWDivSynonym) and which architectural +// features they correspond to (use getHWDivFeatures). +// FIXME: TableGen this. +const struct { + const char *NameCStr; + size_t NameLength; + unsigned ID; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } +} HWDivNames[] = { +#define ARM_HW_DIV_NAME(NAME, ID) {NAME, sizeof(NAME) - 1, ID}, +#include "ARMTargetParser.def" +}; + +// Arch names. +enum class ArchKind { +#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID, +#include "ARMTargetParser.def" +}; + +// List of CPU names and their arches. +// The same CPU can have multiple arches and can be default on multiple arches. +// When finding the Arch for a CPU, first-found prevails. Sort them accordingly. +// When this becomes table-generated, we'd probably need two tables. +// FIXME: TableGen this. +template struct CpuNames { + const char *NameCStr; + size_t NameLength; + T ArchID; + bool Default; // is $Name the default CPU for $ArchID ? + unsigned DefaultExtensions; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } +}; + +const CpuNames CPUNames[] = { +#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + {NAME, sizeof(NAME) - 1, ARM::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT}, +#include "ARMTargetParser.def" +}; + +// FPU names. +enum FPUKind { +#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) KIND, +#include "ARMTargetParser.def" + FK_LAST +}; + +// FPU Version +enum class FPUVersion { + NONE, + VFPV2, + VFPV3, + VFPV3_FP16, + VFPV4, + VFPV5 +}; + +// An FPU name restricts the FPU in one of three ways: +enum class FPURestriction { + None = 0, ///< No restriction + D16, ///< Only 16 D registers + SP_D16 ///< Only single-precision instructions, with 16 D registers +}; + +// An FPU name implies one of three levels of Neon support: +enum class NeonSupportLevel { + None = 0, ///< No Neon + Neon, ///< Neon + Crypto ///< Neon with Crypto +}; + +// ISA kinds. +enum class ISAKind { INVALID = 0, ARM, THUMB, AARCH64 }; + +// Endianness +// FIXME: BE8 vs. BE32? +enum class EndianKind { INVALID = 0, LITTLE, BIG }; + +// v6/v7/v8 Profile +enum class ProfileKind { INVALID = 0, A, R, M }; + +// List of canonical FPU names (use getFPUSynonym) and which architectural +// features they correspond to (use getFPUFeatures). +// FIXME: TableGen this. +// The entries must appear in the order listed in ARM::FPUKind for correct +// indexing +struct FPUName { + const char *NameCStr; + size_t NameLength; + FPUKind ID; + FPUVersion FPUVer; + NeonSupportLevel NeonSupport; + FPURestriction Restriction; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } +}; + +static const FPUName FPUNames[] = { +#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) \ + {NAME, sizeof(NAME) - 1, KIND, VERSION, NEON_SUPPORT, RESTRICTION}, +#include "llvm/Support/ARMTargetParser.def" +}; + +// List of canonical arch names (use getArchSynonym). +// This table also provides the build attribute fields for CPU arch +// and Arch ID, according to the Addenda to the ARM ABI, chapters +// 2.4 and 2.3.5.2 respectively. +// FIXME: SubArch values were simplified to fit into the expectations +// of the triples and are not conforming with their official names. +// Check to see if the expectation should be changed. +// FIXME: TableGen this. +template struct ArchNames { + const char *NameCStr; + size_t NameLength; + const char *CPUAttrCStr; + size_t CPUAttrLength; + const char *SubArchCStr; + size_t SubArchLength; + unsigned DefaultFPU; + unsigned ArchBaseExtensions; + T ID; + ARMBuildAttrs::CPUArch ArchAttr; // Arch ID in build attributes. + + StringRef getName() const { return StringRef(NameCStr, NameLength); } + + // CPU class in build attributes. + StringRef getCPUAttr() const { return StringRef(CPUAttrCStr, CPUAttrLength); } + + // Sub-Arch name. + StringRef getSubArch() const { return StringRef(SubArchCStr, SubArchLength); } +}; + +static const ArchNames ARCHNames[] = { +#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, \ + ARCH_BASE_EXT) \ + {NAME, sizeof(NAME) - 1, \ + CPU_ATTR, sizeof(CPU_ATTR) - 1, \ + SUB_ARCH, sizeof(SUB_ARCH) - 1, \ + ARCH_FPU, ARCH_BASE_EXT, \ + ArchKind::ID, ARCH_ATTR}, +#include "llvm/Support/ARMTargetParser.def" +}; + +// Information by ID +StringRef getFPUName(unsigned FPUKind); +FPUVersion getFPUVersion(unsigned FPUKind); +NeonSupportLevel getFPUNeonSupportLevel(unsigned FPUKind); +FPURestriction getFPURestriction(unsigned FPUKind); + +// FIXME: These should be moved to TargetTuple once it exists +bool getFPUFeatures(unsigned FPUKind, std::vector &Features); +bool getHWDivFeatures(unsigned HWDivKind, std::vector &Features); +bool getExtensionFeatures(unsigned Extensions, + std::vector &Features); + +StringRef getArchName(ArchKind AK); +unsigned getArchAttr(ArchKind AK); +StringRef getCPUAttr(ArchKind AK); +StringRef getSubArch(ArchKind AK); +StringRef getArchExtName(unsigned ArchExtKind); +StringRef getArchExtFeature(StringRef ArchExt); +StringRef getHWDivName(unsigned HWDivKind); + +// Information by Name +unsigned getDefaultFPU(StringRef CPU, ArchKind AK); +unsigned getDefaultExtensions(StringRef CPU, ArchKind AK); +StringRef getDefaultCPU(StringRef Arch); +StringRef getCanonicalArchName(StringRef Arch); +StringRef getFPUSynonym(StringRef FPU); +StringRef getArchSynonym(StringRef Arch); + +// Parser +unsigned parseHWDiv(StringRef HWDiv); +unsigned parseFPU(StringRef FPU); +ArchKind parseArch(StringRef Arch); +unsigned parseArchExt(StringRef ArchExt); +ArchKind parseCPUArch(StringRef CPU); +ISAKind parseArchISA(StringRef Arch); +EndianKind parseArchEndian(StringRef Arch); +ProfileKind parseArchProfile(StringRef Arch); +unsigned parseArchVersion(StringRef Arch); + +void fillValidCPUArchList(SmallVectorImpl &Values); +StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU); + +} // namespace ARM +} // namespace llvm + +#endif diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h index b1b351863b99a79bc0170b5ab90771f420d6ee67..42d08378a6774a1da39ca28782eb5dc17fb622c5 100644 --- a/include/llvm/Support/Allocator.h +++ b/include/llvm/Support/Allocator.h @@ -311,6 +311,33 @@ public: return None; } + /// A wrapper around identifyObject that additionally asserts that + /// the object is indeed within the allocator. + /// \return An index uniquely and reproducibly identifying + /// an input pointer \p Ptr in the given allocator. + int64_t identifyKnownObject(const void *Ptr) { + Optional Out = identifyObject(Ptr); + assert(Out && "Wrong allocator used"); + return *Out; + } + + /// A wrapper around identifyKnownObject. Accepts type information + /// about the object and produces a smaller identifier by relying on + /// the alignment information. Note that sub-classes may have different + /// alignment, so the most base class should be passed as template parameter + /// in order to obtain correct results. For that reason automatic template + /// parameter deduction is disabled. + /// \return An index uniquely and reproducibly identifying + /// an input pointer \p Ptr in the given allocator. This identifier is + /// different from the ones produced by identifyObject and + /// identifyAlignedObject. + template + int64_t identifyKnownAlignedObject(const void *Ptr) { + int64_t Out = identifyKnownObject(Ptr); + assert(Out % alignof(T) == 0 && "Wrong alignment information"); + return Out / alignof(T); + } + size_t getTotalMemory() const { size_t TotalMemory = 0; for (auto I = Slabs.begin(), E = Slabs.end(); I != E; ++I) diff --git a/include/llvm/Support/BinaryStreamArray.h b/include/llvm/Support/BinaryStreamArray.h index 7b8a95b45735b3226724f64b83c2eeda5063bd16..7c110fcb6a4b1be8bdeadfeb05725f21ded8aacb 100644 --- a/include/llvm/Support/BinaryStreamArray.h +++ b/include/llvm/Support/BinaryStreamArray.h @@ -96,21 +96,32 @@ public: explicit VarStreamArray(const Extractor &E) : E(E) {} - explicit VarStreamArray(BinaryStreamRef Stream) : Stream(Stream) {} + explicit VarStreamArray(BinaryStreamRef Stream, uint32_t Skew = 0) + : Stream(Stream), Skew(Skew) {} - VarStreamArray(BinaryStreamRef Stream, const Extractor &E) - : Stream(Stream), E(E) {} + VarStreamArray(BinaryStreamRef Stream, const Extractor &E, uint32_t Skew = 0) + : Stream(Stream), E(E), Skew(Skew) {} Iterator begin(bool *HadError = nullptr) const { - return Iterator(*this, E, HadError); + return Iterator(*this, E, Skew, nullptr); } bool valid() const { return Stream.valid(); } + uint32_t skew() const { return Skew; } Iterator end() const { return Iterator(E); } bool empty() const { return Stream.getLength() == 0; } + VarStreamArray substream(uint32_t Begin, + uint32_t End) const { + assert(Begin >= Skew); + // We should never cut off the beginning of the stream since it might be + // skewed, meaning the initial bytes are important. + BinaryStreamRef NewStream = Stream.slice(0, End); + return {NewStream, E, Begin}; + } + /// given an offset into the array's underlying stream, return an /// iterator to the record at that offset. This is considered unsafe /// since the behavior is undefined if \p Offset does not refer to the @@ -123,13 +134,17 @@ public: Extractor &getExtractor() { return E; } BinaryStreamRef getUnderlyingStream() const { return Stream; } - void setUnderlyingStream(BinaryStreamRef S) { Stream = S; } + void setUnderlyingStream(BinaryStreamRef S, uint32_t Skew = 0) { + Stream = S; + this->Skew = Skew; + } - void drop_front() { Stream = Stream.drop_front(begin()->length()); } + void drop_front() { Skew += begin()->length(); } private: BinaryStreamRef Stream; Extractor E; + uint32_t Skew; }; template @@ -140,10 +155,6 @@ class VarStreamArrayIterator typedef VarStreamArray ArrayType; public: - VarStreamArrayIterator(const ArrayType &Array, const Extractor &E, - bool *HadError) - : VarStreamArrayIterator(Array, E, 0, HadError) {} - VarStreamArrayIterator(const ArrayType &Array, const Extractor &E, uint32_t Offset, bool *HadError) : IterRef(Array.Stream.drop_front(Offset)), Extract(E), diff --git a/include/llvm/Support/BinaryStreamReader.h b/include/llvm/Support/BinaryStreamReader.h index fe77b550c4537184ce60325d29dce7514fe2a39d..392958de30d5656aff5c0aa037a3f8a455930755 100644 --- a/include/llvm/Support/BinaryStreamReader.h +++ b/include/llvm/Support/BinaryStreamReader.h @@ -203,11 +203,12 @@ public: /// \returns a success error code if the data was successfully read, otherwise /// returns an appropriate error code. template - Error readArray(VarStreamArray &Array, uint32_t Size) { + Error readArray(VarStreamArray &Array, uint32_t Size, + uint32_t Skew = 0) { BinaryStreamRef S; if (auto EC = readStreamRef(S, Size)) return EC; - Array.setUnderlyingStream(S); + Array.setUnderlyingStream(S, Skew); return Error::success(); } diff --git a/include/llvm/Support/BuryPointer.h b/include/llvm/Support/BuryPointer.h new file mode 100644 index 0000000000000000000000000000000000000000..53f1f395b922947713e4ab5f8a98a8d291f5a343 --- /dev/null +++ b/include/llvm/Support/BuryPointer.h @@ -0,0 +1,30 @@ +//===- llvm/Support/BuryPointer.h - Memory Manipulation/Leak ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_BURYPOINTER_H +#define LLVM_SUPPORT_BURYPOINTER_H + +#include + +namespace llvm { + +// In tools that will exit soon anyway, going through the process of explicitly +// deallocating resources can be unnecessary - better to leak the resources and +// let the OS clean them up when the process ends. Use this function to ensure +// the memory is not misdiagnosed as an unintentional leak by leak detection +// tools (this is achieved by preserving pointers to the object in a globally +// visible array). +void BuryPointer(const void *Ptr); +template void BuryPointer(std::unique_ptr Ptr) { + BuryPointer(Ptr.release()); +} + +} // namespace llvm + +#endif diff --git a/include/llvm/Support/Chrono.h b/include/llvm/Support/Chrono.h index 994068af3771b972e3f336dcceddc7f000d3fb0a..57677e8d5cf149326f6fe32f8a16cd4cb1a9917c 100644 --- a/include/llvm/Support/Chrono.h +++ b/include/llvm/Support/Chrono.h @@ -47,6 +47,14 @@ toTimePoint(std::time_t T) { return time_point_cast(system_clock::from_time_t(T)); } +/// Convert a std::time_t + nanoseconds to a TimePoint +LLVM_ATTRIBUTE_ALWAYS_INLINE inline TimePoint<> +toTimePoint(std::time_t T, uint32_t nsec) { + using namespace std::chrono; + return time_point_cast(system_clock::from_time_t(T)) + + nanoseconds(nsec); +} + } // namespace sys raw_ostream &operator<<(raw_ostream &OS, sys::TimePoint<> TP); diff --git a/include/llvm/Support/Error.h b/include/llvm/Support/Error.h index 4e1619019bb4aba8ba4d80478f9b985434304f7f..4962812e750e6007ce0024c1217c82506c7fff64 100644 --- a/include/llvm/Support/Error.h +++ b/include/llvm/Support/Error.h @@ -953,10 +953,14 @@ Expected handleExpected(Expected ValOrErr, RecoveryFtor &&RecoveryPath, /// will be printed before the first one is logged. A newline will be printed /// after each error. /// +/// This function is compatible with the helpers from Support/WithColor.h. You +/// can pass any of them as the OS. Please consider using them instead of +/// including 'error: ' in the ErrorBanner. +/// /// This is useful in the base level of your program to allow clean termination /// (allowing clean deallocation of resources, etc.), while reporting error /// information to the user. -void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner); +void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner = {}); /// Write all error messages (if any) in E to a string. The newline character /// is used to separate error messages. diff --git a/include/llvm/Support/FileCheck.h b/include/llvm/Support/FileCheck.h index de8d46bcb8f27b8d180ac895485367d40ae74f40..4061a26e22c50f04abdc801f600fb5d55ccc52bb 100644 --- a/include/llvm/Support/FileCheck.h +++ b/include/llvm/Support/FileCheck.h @@ -43,7 +43,8 @@ struct FileCheckRequest { //===----------------------------------------------------------------------===// namespace Check { -enum FileCheckType { + +enum FileCheckKind { CheckNone = 0, CheckPlain, CheckNext, @@ -58,10 +59,31 @@ enum FileCheckType { CheckEOF, /// Marks when parsing found a -NOT check combined with another CHECK suffix. - CheckBadNot + CheckBadNot, + + /// Marks when parsing found a -COUNT directive with invalid count value. + CheckBadCount +}; + +class FileCheckType { + FileCheckKind Kind; + int Count; ///< optional Count for some checks + +public: + FileCheckType(FileCheckKind Kind = CheckNone) : Kind(Kind), Count(1) {} + FileCheckType(const FileCheckType &) = default; + + operator FileCheckKind() const { return Kind; } + + int getCount() const { return Count; } + FileCheckType &setCount(int C); + + std::string getDescription(StringRef Prefix) const; }; } +struct FileCheckDiag; + class FileCheckPattern { SMLoc PatternLoc; @@ -105,7 +127,8 @@ public: const StringMap &VariableTable, SMRange MatchRange = None) const; void PrintFuzzyMatch(const SourceMgr &SM, StringRef Buffer, - const StringMap &VariableTable) const; + const StringMap &VariableTable, + std::vector *Diags) const; bool hasVariable() const { return !(VariableUses.empty() && VariableDefs.empty()); @@ -113,6 +136,8 @@ public: Check::FileCheckType getCheckTy() const { return CheckTy; } + int getCount() const { return CheckTy.getCount(); } + private: bool AddRegExToRegEx(StringRef RS, unsigned &CurParen, SourceMgr &SM); void AddBackrefToRegEx(unsigned BackrefNum); @@ -123,6 +148,59 @@ private: size_t FindRegexVarEnd(StringRef Str, SourceMgr &SM); }; +//===----------------------------------------------------------------------===// +/// Summary of a FileCheck diagnostic. +//===----------------------------------------------------------------------===// + +struct FileCheckDiag { + /// What is the FileCheck directive for this diagnostic? + Check::FileCheckType CheckTy; + /// Where is the FileCheck directive for this diagnostic? + unsigned CheckLine, CheckCol; + /// What type of match result does this diagnostic describe? + /// + /// A directive's supplied pattern is said to be either expected or excluded + /// depending on whether the pattern must have or must not have a match in + /// order for the directive to succeed. For example, a CHECK directive's + /// pattern is expected, and a CHECK-NOT directive's pattern is excluded. + /// All match result types whose names end with "Excluded" are for excluded + /// patterns, and all others are for expected patterns. + /// + /// There might be more than one match result for a single pattern. For + /// example, there might be several discarded matches + /// (MatchFoundButDiscarded) before either a good match + /// (MatchFoundAndExpected) or a failure to match (MatchNoneButExpected), + /// and there might be a fuzzy match (MatchFuzzy) after the latter. + enum MatchType { + /// Indicates a good match for an expected pattern. + MatchFoundAndExpected, + /// Indicates a match for an excluded pattern. + MatchFoundButExcluded, + /// Indicates a match for an expected pattern, but the match is on the + /// wrong line. + MatchFoundButWrongLine, + /// Indicates a discarded match for an expected pattern. + MatchFoundButDiscarded, + /// Indicates no match for an excluded pattern. + MatchNoneAndExcluded, + /// Indicates no match for an expected pattern, but this might follow good + /// matches when multiple matches are expected for the pattern, or it might + /// follow discarded matches for the pattern. + MatchNoneButExpected, + /// Indicates a fuzzy match that serves as a suggestion for the next + /// intended match for an expected pattern with too few or no good matches. + MatchFuzzy, + } MatchTy; + /// The search range if MatchTy is MatchNoneAndExcluded or + /// MatchNoneButExpected, or the match range otherwise. + unsigned InputStartLine; + unsigned InputStartCol; + unsigned InputEndLine; + unsigned InputEndCol; + FileCheckDiag(const SourceMgr &SM, const Check::FileCheckType &CheckTy, + SMLoc CheckLoc, MatchType MatchTy, SMRange InputRange); +}; + //===----------------------------------------------------------------------===// // Check Strings. //===----------------------------------------------------------------------===// @@ -147,18 +225,20 @@ struct FileCheckString { size_t Check(const SourceMgr &SM, StringRef Buffer, bool IsLabelScanMode, size_t &MatchLen, StringMap &VariableTable, - FileCheckRequest &Req) const; + FileCheckRequest &Req, std::vector *Diags) const; bool CheckNext(const SourceMgr &SM, StringRef Buffer) const; bool CheckSame(const SourceMgr &SM, StringRef Buffer) const; bool CheckNot(const SourceMgr &SM, StringRef Buffer, const std::vector &NotStrings, StringMap &VariableTable, - const FileCheckRequest &Req) const; + const FileCheckRequest &Req, + std::vector *Diags) const; size_t CheckDag(const SourceMgr &SM, StringRef Buffer, std::vector &NotStrings, StringMap &VariableTable, - const FileCheckRequest &Req) const; + const FileCheckRequest &Req, + std::vector *Diags) const; }; /// FileCheck class takes the request and exposes various methods that @@ -195,7 +275,8 @@ public: /// /// Returns false if the input fails to satisfy the checks. bool CheckInput(SourceMgr &SM, StringRef Buffer, - ArrayRef CheckStrings); + ArrayRef CheckStrings, + std::vector *Diags = nullptr); }; } // namespace llvm #endif diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h index a76eab605a86029b28853ec83986def1a9236816..827e2e91eea218b2e684170e65b2978b71e974ee 100644 --- a/include/llvm/Support/FileSystem.h +++ b/include/llvm/Support/FileSystem.h @@ -160,6 +160,8 @@ protected: #if defined(LLVM_ON_UNIX) time_t fs_st_atime = 0; time_t fs_st_mtime = 0; + uint32_t fs_st_atime_nsec = 0; + uint32_t fs_st_mtime_nsec = 0; uid_t fs_st_uid = 0; gid_t fs_st_gid = 0; off_t fs_st_size = 0; @@ -180,9 +182,12 @@ public: explicit basic_file_status(file_type Type) : Type(Type) {} #if defined(LLVM_ON_UNIX) - basic_file_status(file_type Type, perms Perms, time_t ATime, time_t MTime, + basic_file_status(file_type Type, perms Perms, time_t ATime, + uint32_t ATimeNSec, time_t MTime, uint32_t MTimeNSec, uid_t UID, gid_t GID, off_t Size) - : fs_st_atime(ATime), fs_st_mtime(MTime), fs_st_uid(UID), fs_st_gid(GID), + : fs_st_atime(ATime), fs_st_mtime(MTime), + fs_st_atime_nsec(ATimeNSec), fs_st_mtime_nsec(MTimeNSec), + fs_st_uid(UID), fs_st_gid(GID), fs_st_size(Size), Type(Type), Perms(Perms) {} #elif defined(_WIN32) basic_file_status(file_type Type, perms Perms, uint32_t LastAccessTimeHigh, @@ -199,7 +204,20 @@ public: // getters file_type type() const { return Type; } perms permissions() const { return Perms; } + + /// The file access time as reported from the underlying file system. + /// + /// Also see comments on \c getLastModificationTime() related to the precision + /// of the returned value. TimePoint<> getLastAccessedTime() const; + + /// The file modification time as reported from the underlying file system. + /// + /// The returned value allows for nanosecond precision but the actual + /// resolution is an implementation detail of the underlying file system. + /// There is no guarantee for what kind of resolution you can expect, the + /// resolution can differ across platforms and even across mountpoints on the + /// same machine. TimePoint<> getLastModificationTime() const; #if defined(LLVM_ON_UNIX) @@ -247,8 +265,11 @@ public: #if defined(LLVM_ON_UNIX) file_status(file_type Type, perms Perms, dev_t Dev, nlink_t Links, ino_t Ino, - time_t ATime, time_t MTime, uid_t UID, gid_t GID, off_t Size) - : basic_file_status(Type, Perms, ATime, MTime, UID, GID, Size), + time_t ATime, uint32_t ATimeNSec, + time_t MTime, uint32_t MTimeNSec, + uid_t UID, gid_t GID, off_t Size) + : basic_file_status(Type, Perms, ATime, ATimeNSec, MTime, MTimeNSec, + UID, GID, Size), fs_st_dev(Dev), fs_st_nlinks(Links), fs_st_ino(Ino) {} #elif defined(_WIN32) file_status(file_type Type, perms Perms, uint32_t LinkCount, @@ -349,6 +370,12 @@ std::error_code create_hard_link(const Twine &to, const Twine &from); std::error_code real_path(const Twine &path, SmallVectorImpl &output, bool expand_tilde = false); +/// Expands ~ expressions to the user's home directory. On Unix ~user +/// directories are resolved as well. +/// +/// @param path The path to resolve. +void expand_tilde(const Twine &path, SmallVectorImpl &output); + /// Get the current path. /// /// @param result Holds the current path on return. diff --git a/include/llvm/Support/TargetOpcodes.def b/include/llvm/Support/TargetOpcodes.def index a683f053e034519284907c498dddb41adabb6545..3245276bbf362bb1b345aa2a4dd7cf93aa04b830 100644 --- a/include/llvm/Support/TargetOpcodes.def +++ b/include/llvm/Support/TargetOpcodes.def @@ -258,6 +258,17 @@ HANDLE_TARGET_OPCODE(G_INSERT) /// larger register. HANDLE_TARGET_OPCODE(G_MERGE_VALUES) +/// Generic instruction to create a vector value from a number of scalar +/// components. +HANDLE_TARGET_OPCODE(G_BUILD_VECTOR) + +/// Generic instruction to create a vector value from a number of scalar +/// components, which have types larger than the result vector elt type. +HANDLE_TARGET_OPCODE(G_BUILD_VECTOR_TRUNC) + +/// Generic instruction to create a vector by concatenating multiple vectors. +HANDLE_TARGET_OPCODE(G_CONCAT_VECTORS) + /// Generic pointer to int conversion. HANDLE_TARGET_OPCODE(G_PTRTOINT) @@ -443,6 +454,9 @@ HANDLE_TARGET_OPCODE(G_FLOG) /// Floating point base-2 logarithm of a value. HANDLE_TARGET_OPCODE(G_FLOG2) +/// Floating point base-10 logarithm of a value. +HANDLE_TARGET_OPCODE(G_FLOG10) + /// Generic FP negation. HANDLE_TARGET_OPCODE(G_FNEG) diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h index 63241b52e1fa6b19eba020e51b728b825fe116d2..ace11ed410a38fafe24b2ceab67e31028163eed7 100644 --- a/include/llvm/Support/TargetParser.h +++ b/include/llvm/Support/TargetParser.h @@ -18,215 +18,20 @@ // FIXME: vector is used because that's what clang uses for subtarget feature // lists, but SmallVector would probably be better #include "llvm/ADT/Triple.h" +#include "llvm/Support/ARMTargetParser.h" +#include "llvm/Support/AArch64TargetParser.h" #include namespace llvm { class StringRef; -// Target specific information into their own namespaces. These should be -// generated from TableGen because the information is already there, and there -// is where new information about targets will be added. +// Target specific information in their own namespaces. +// (ARM/AArch64 are declared in ARM/AArch64TargetParser.h) +// These should be generated from TableGen because the information is already +// there, and there is where new information about targets will be added. // FIXME: To TableGen this we need to make some table generated files available // even if the back-end is not compiled with LLVM, plus we need to create a new // back-end to TableGen to create these clean tables. -namespace ARM { - -// FPU Version -enum class FPUVersion { - NONE, - VFPV2, - VFPV3, - VFPV3_FP16, - VFPV4, - VFPV5 -}; - -// An FPU name restricts the FPU in one of three ways: -enum class FPURestriction { - None = 0, ///< No restriction - D16, ///< Only 16 D registers - SP_D16 ///< Only single-precision instructions, with 16 D registers -}; - -// An FPU name implies one of three levels of Neon support: -enum class NeonSupportLevel { - None = 0, ///< No Neon - Neon, ///< Neon - Crypto ///< Neon with Crypto -}; - -// FPU names. -enum FPUKind { -#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) KIND, -#include "ARMTargetParser.def" - FK_LAST -}; - -// Arch names. -enum class ArchKind { -#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID, -#include "ARMTargetParser.def" -}; - -// Arch extension modifiers for CPUs. -enum ArchExtKind : unsigned { - AEK_INVALID = 0, - AEK_NONE = 1, - AEK_CRC = 1 << 1, - AEK_CRYPTO = 1 << 2, - AEK_FP = 1 << 3, - AEK_HWDIVTHUMB = 1 << 4, - AEK_HWDIVARM = 1 << 5, - AEK_MP = 1 << 6, - AEK_SIMD = 1 << 7, - AEK_SEC = 1 << 8, - AEK_VIRT = 1 << 9, - AEK_DSP = 1 << 10, - AEK_FP16 = 1 << 11, - AEK_RAS = 1 << 12, - AEK_SVE = 1 << 13, - AEK_DOTPROD = 1 << 14, - AEK_SHA2 = 1 << 15, - AEK_AES = 1 << 16, - AEK_FP16FML = 1 << 17, - // Unsupported extensions. - AEK_OS = 0x8000000, - AEK_IWMMXT = 0x10000000, - AEK_IWMMXT2 = 0x20000000, - AEK_MAVERICK = 0x40000000, - AEK_XSCALE = 0x80000000, -}; - -// ISA kinds. -enum class ISAKind { INVALID = 0, ARM, THUMB, AARCH64 }; - -// Endianness -// FIXME: BE8 vs. BE32? -enum class EndianKind { INVALID = 0, LITTLE, BIG }; - -// v6/v7/v8 Profile -enum class ProfileKind { INVALID = 0, A, R, M }; - -StringRef getCanonicalArchName(StringRef Arch); - -// Information by ID -StringRef getFPUName(unsigned FPUKind); -FPUVersion getFPUVersion(unsigned FPUKind); -NeonSupportLevel getFPUNeonSupportLevel(unsigned FPUKind); -FPURestriction getFPURestriction(unsigned FPUKind); - -// FIXME: These should be moved to TargetTuple once it exists -bool getFPUFeatures(unsigned FPUKind, std::vector &Features); -bool getHWDivFeatures(unsigned HWDivKind, std::vector &Features); -bool getExtensionFeatures(unsigned Extensions, - std::vector &Features); - -StringRef getArchName(ArchKind AK); -unsigned getArchAttr(ArchKind AK); -StringRef getCPUAttr(ArchKind AK); -StringRef getSubArch(ArchKind AK); -StringRef getArchExtName(unsigned ArchExtKind); -StringRef getArchExtFeature(StringRef ArchExt); -StringRef getHWDivName(unsigned HWDivKind); - -// Information by Name -unsigned getDefaultFPU(StringRef CPU, ArchKind AK); -unsigned getDefaultExtensions(StringRef CPU, ArchKind AK); -StringRef getDefaultCPU(StringRef Arch); - -// Parser -unsigned parseHWDiv(StringRef HWDiv); -unsigned parseFPU(StringRef FPU); -ArchKind parseArch(StringRef Arch); -unsigned parseArchExt(StringRef ArchExt); -ArchKind parseCPUArch(StringRef CPU); -void fillValidCPUArchList(SmallVectorImpl &Values); -ISAKind parseArchISA(StringRef Arch); -EndianKind parseArchEndian(StringRef Arch); -ProfileKind parseArchProfile(StringRef Arch); -unsigned parseArchVersion(StringRef Arch); - -StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU); - -} // namespace ARM - -// FIXME:This should be made into class design,to avoid dupplication. -namespace AArch64 { - -// Arch names. -enum class ArchKind { -#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID, -#include "AArch64TargetParser.def" -}; - -// Arch extension modifiers for CPUs. -enum ArchExtKind : unsigned { - AEK_INVALID = 0, - AEK_NONE = 1, - AEK_CRC = 1 << 1, - AEK_CRYPTO = 1 << 2, - AEK_FP = 1 << 3, - AEK_SIMD = 1 << 4, - AEK_FP16 = 1 << 5, - AEK_PROFILE = 1 << 6, - AEK_RAS = 1 << 7, - AEK_LSE = 1 << 8, - AEK_SVE = 1 << 9, - AEK_DOTPROD = 1 << 10, - AEK_RCPC = 1 << 11, - AEK_RDM = 1 << 12, - AEK_SM4 = 1 << 13, - AEK_SHA3 = 1 << 14, - AEK_SHA2 = 1 << 15, - AEK_AES = 1 << 16, - AEK_FP16FML = 1 << 17, - AEK_RAND = 1 << 18, - AEK_MTE = 1 << 19, -}; - -StringRef getCanonicalArchName(StringRef Arch); - -// Information by ID -StringRef getFPUName(unsigned FPUKind); -ARM::FPUVersion getFPUVersion(unsigned FPUKind); -ARM::NeonSupportLevel getFPUNeonSupportLevel(unsigned FPUKind); -ARM::FPURestriction getFPURestriction(unsigned FPUKind); - -// FIXME: These should be moved to TargetTuple once it exists -bool getFPUFeatures(unsigned FPUKind, std::vector &Features); -bool getExtensionFeatures(unsigned Extensions, - std::vector &Features); -bool getArchFeatures(ArchKind AK, std::vector &Features); - -StringRef getArchName(ArchKind AK); -unsigned getArchAttr(ArchKind AK); -StringRef getCPUAttr(ArchKind AK); -StringRef getSubArch(ArchKind AK); -StringRef getArchExtName(unsigned ArchExtKind); -StringRef getArchExtFeature(StringRef ArchExt); -unsigned checkArchVersion(StringRef Arch); - -// Information by Name -unsigned getDefaultFPU(StringRef CPU, ArchKind AK); -unsigned getDefaultExtensions(StringRef CPU, ArchKind AK); -StringRef getDefaultCPU(StringRef Arch); -AArch64::ArchKind getCPUArchKind(StringRef CPU); - -// Parser -unsigned parseFPU(StringRef FPU); -AArch64::ArchKind parseArch(StringRef Arch); -ArchExtKind parseArchExt(StringRef ArchExt); -ArchKind parseCPUArch(StringRef CPU); -void fillValidCPUArchList(SmallVectorImpl &Values); -ARM::ISAKind parseArchISA(StringRef Arch); -ARM::EndianKind parseArchEndian(StringRef Arch); -ARM::ProfileKind parseArchProfile(StringRef Arch); -unsigned parseArchVersion(StringRef Arch); - -bool isX18ReservedByDefault(const Triple &TT); - -} // namespace AArch64 - namespace X86 { // This should be kept in sync with libcc/compiler-rt as its included by clang diff --git a/include/llvm/Support/Threading.h b/include/llvm/Support/Threading.h index e8021f648b0d5625f1014cc1496d60d43b55a032..ba7ece5e72ba78428eddb003a0e296e908fe2657 100644 --- a/include/llvm/Support/Threading.h +++ b/include/llvm/Support/Threading.h @@ -27,7 +27,8 @@ #define LLVM_THREADING_USE_STD_CALL_ONCE 1 #elif defined(LLVM_ON_UNIX) && \ (defined(_LIBCPP_VERSION) || \ - !(defined(__NetBSD__) || defined(__OpenBSD__) || defined(__ppc__))) + !(defined(__NetBSD__) || defined(__OpenBSD__) || \ + (defined(__ppc__) || defined(__PPC__)))) // std::call_once from libc++ is used on all Unix platforms. Other // implementations like libstdc++ are known to have problems on NetBSD, // OpenBSD and PowerPC. diff --git a/include/llvm/Support/VirtualFileSystem.h b/include/llvm/Support/VirtualFileSystem.h index b3326bbbe486b33322c04d68d48362cc780368eb..3d4094eeb11106c13b27dadbdd7611028b124fe6 100644 --- a/include/llvm/Support/VirtualFileSystem.h +++ b/include/llvm/Support/VirtualFileSystem.h @@ -374,6 +374,9 @@ public: SmallVectorImpl &Output) const override { return FS->getRealPath(Path, Output); } + std::error_code isLocal(const Twine &Path, bool &Result) override { + return FS->isLocal(Path, Result); + } protected: FileSystem &getUnderlyingFS() { return *FS; } diff --git a/include/llvm/Support/X86DisassemblerDecoderCommon.h b/include/llvm/Support/X86DisassemblerDecoderCommon.h index 185b357efef5f7a74909f64c5199ecf546650f24..466dd309909ac39ad3709e8caabf07483789d339 100644 --- a/include/llvm/Support/X86DisassemblerDecoderCommon.h +++ b/include/llvm/Support/X86DisassemblerDecoderCommon.h @@ -414,7 +414,7 @@ enum OperandEncoding { ENUM_ENTRY(TYPE_R16, "2-byte") \ ENUM_ENTRY(TYPE_R32, "4-byte") \ ENUM_ENTRY(TYPE_R64, "8-byte") \ - ENUM_ENTRY(TYPE_IMM, "immediate operand") \ + ENUM_ENTRY(TYPE_IMM, "immediate operand") \ ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \ ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \ ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \ diff --git a/include/llvm/Support/X86TargetParser.def b/include/llvm/Support/X86TargetParser.def index eb45ed6a76afb1365ac909ff10181d8345469fcd..e9bede545d3f71ced08b874d405a37f6132faf11 100644 --- a/include/llvm/Support/X86TargetParser.def +++ b/include/llvm/Support/X86TargetParser.def @@ -102,6 +102,7 @@ X86_CPU_SUBTYPE_COMPAT("icelake-server", INTEL_COREI7_ICELAKE_SERVER, "icelake-s // Entries below this are not in libgcc/compiler-rt. X86_CPU_SUBTYPE ("core2", INTEL_CORE2_65) X86_CPU_SUBTYPE ("penryn", INTEL_CORE2_45) +X86_CPU_SUBTYPE ("cascadelake", INTEL_COREI7_CASCADELAKE) X86_CPU_SUBTYPE ("k6", AMDPENTIUM_K6) X86_CPU_SUBTYPE ("k6-2", AMDPENTIUM_K62) X86_CPU_SUBTYPE ("k6-3", AMDPENTIUM_K63) diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 6219755e83a2cf959c99338b7c1b9638f143904c..3d790e96fff76d18f986b1e6aeb53258305c6ee7 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -39,6 +39,12 @@ namespace llvm { namespace yaml { +enum class NodeKind : uint8_t { + Scalar, + Map, + Sequence, +}; + struct EmptyContext {}; /// This class should be specialized by any type that needs to be converted @@ -145,14 +151,14 @@ struct ScalarTraits { // Must provide: // // Function to write the value as a string: - //static void output(const T &value, void *ctxt, llvm::raw_ostream &out); + // static void output(const T &value, void *ctxt, llvm::raw_ostream &out); // // Function to convert a string to a value. Returns the empty // StringRef on success or an error string if string is malformed: - //static StringRef input(StringRef scalar, void *ctxt, T &value); + // static StringRef input(StringRef scalar, void *ctxt, T &value); // // Function to determine if the value should be quoted. - //static QuotingType mustQuote(StringRef); + // static QuotingType mustQuote(StringRef); }; /// This class should be specialized by type that requires custom conversion @@ -163,7 +169,7 @@ struct ScalarTraits { /// static void output(const MyType &Value, void*, llvm::raw_ostream &Out) /// { /// // stream out custom formatting -/// Out << Val; +/// Out << Value; /// } /// static StringRef input(StringRef Scalar, void*, MyType &Value) { /// // parse scalar and set `value` @@ -181,6 +187,47 @@ struct BlockScalarTraits { // Function to convert a string to a value. Returns the empty // StringRef on success or an error string if string is malformed: // static StringRef input(StringRef Scalar, void *ctxt, T &Value); + // + // Optional: + // static StringRef inputTag(T &Val, std::string Tag) + // static void outputTag(const T &Val, raw_ostream &Out) +}; + +/// This class should be specialized by type that requires custom conversion +/// to/from a YAML scalar with optional tags. For example: +/// +/// template <> +/// struct TaggedScalarTraits { +/// static void output(const MyType &Value, void*, llvm::raw_ostream +/// &ScalarOut, llvm::raw_ostream &TagOut) +/// { +/// // stream out custom formatting including optional Tag +/// Out << Value; +/// } +/// static StringRef input(StringRef Scalar, StringRef Tag, void*, MyType +/// &Value) { +/// // parse scalar and set `value` +/// // return empty string on success, or error string +/// return StringRef(); +/// } +/// static QuotingType mustQuote(const MyType &Value, StringRef) { +/// return QuotingType::Single; +/// } +/// }; +template struct TaggedScalarTraits { + // Must provide: + // + // Function to write the value and tag as strings: + // static void output(const T &Value, void *ctx, llvm::raw_ostream &ScalarOut, + // llvm::raw_ostream &TagOut); + // + // Function to convert a string to a value. Returns the empty + // StringRef on success or an error string if string is malformed: + // static StringRef input(StringRef Scalar, StringRef Tag, void *ctxt, T + // &Value); + // + // Function to determine if the value should be quoted. + // static QuotingType mustQuote(const T &Value, StringRef Scalar); }; /// This class should be specialized by any type that needs to be converted @@ -234,6 +281,31 @@ struct CustomMappingTraits { // static void output(IO &io, T &elem); }; +/// This class should be specialized by any type that can be represented as +/// a scalar, map, or sequence, decided dynamically. For example: +/// +/// typedef std::unique_ptr MyPoly; +/// +/// template<> +/// struct PolymorphicTraits { +/// static NodeKind getKind(const MyPoly &poly) { +/// return poly->getKind(); +/// } +/// static MyScalar& getAsScalar(MyPoly &poly) { +/// if (!poly || !isa(poly)) +/// poly.reset(new MyScalar()); +/// return *cast(poly.get()); +/// } +/// // ... +/// }; +template struct PolymorphicTraits { + // Must provide: + // static NodeKind getKind(const T &poly); + // static scalar_type &getAsScalar(T &poly); + // static map_type &getAsMap(T &poly); + // static sequence_type &getAsSequence(T &poly); +}; + // Only used for better diagnostics of missing traits template struct MissingTrait; @@ -307,6 +379,24 @@ struct has_BlockScalarTraits (sizeof(test>(nullptr, nullptr)) == 1); }; +// Test if TaggedScalarTraits is defined on type T. +template struct has_TaggedScalarTraits { + using Signature_input = StringRef (*)(StringRef, StringRef, void *, T &); + using Signature_output = void (*)(const T &, void *, raw_ostream &, + raw_ostream &); + using Signature_mustQuote = QuotingType (*)(const T &, StringRef); + + template + static char test(SameType *, + SameType *, + SameType *); + + template static double test(...); + + static bool const value = + (sizeof(test>(nullptr, nullptr, nullptr)) == 1); +}; + // Test if MappingContextTraits is defined on type T. template struct has_MappingTraits { using Signature_mapping = void (*)(class IO &, T &, Context &); @@ -438,6 +528,17 @@ struct has_DocumentListTraits static bool const value = (sizeof(test>(nullptr))==1); }; +template struct has_PolymorphicTraits { + using Signature_getKind = NodeKind (*)(const T &); + + template + static char test(SameType *); + + template static double test(...); + + static bool const value = (sizeof(test>(nullptr)) == 1); +}; + inline bool isNumeric(StringRef S) { const static auto skipDigits = [](StringRef Input) { return Input.drop_front( @@ -626,10 +727,12 @@ struct missingTraits !has_ScalarBitSetTraits::value && !has_ScalarTraits::value && !has_BlockScalarTraits::value && + !has_TaggedScalarTraits::value && !has_MappingTraits::value && !has_SequenceTraits::value && !has_CustomMappingTraits::value && - !has_DocumentListTraits::value> {}; + !has_DocumentListTraits::value && + !has_PolymorphicTraits::value> {}; template struct validatedMappingTraits @@ -683,6 +786,9 @@ public: virtual void scalarString(StringRef &, QuotingType) = 0; virtual void blockScalarString(StringRef &) = 0; + virtual void scalarTag(std::string &) = 0; + + virtual NodeKind getNodeKind() = 0; virtual void setError(const Twine &) = 0; @@ -917,6 +1023,31 @@ yamlize(IO &YamlIO, T &Val, bool, EmptyContext &Ctx) { } } +template +typename std::enable_if::value, void>::type +yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) { + if (io.outputting()) { + std::string ScalarStorage, TagStorage; + raw_string_ostream ScalarBuffer(ScalarStorage), TagBuffer(TagStorage); + TaggedScalarTraits::output(Val, io.getContext(), ScalarBuffer, + TagBuffer); + io.scalarTag(TagBuffer.str()); + StringRef ScalarStr = ScalarBuffer.str(); + io.scalarString(ScalarStr, + TaggedScalarTraits::mustQuote(Val, ScalarStr)); + } else { + std::string Tag; + io.scalarTag(Tag); + StringRef Str; + io.scalarString(Str, QuotingType::None); + StringRef Result = + TaggedScalarTraits::input(Str, Tag, io.getContext(), Val); + if (!Result.empty()) { + io.setError(Twine(Result)); + } + } +} + template typename std::enable_if::value, void>::type yamlize(IO &io, T &Val, bool, Context &Ctx) { @@ -972,6 +1103,20 @@ yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) { } } +template +typename std::enable_if::value, void>::type +yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) { + switch (io.outputting() ? PolymorphicTraits::getKind(Val) + : io.getNodeKind()) { + case NodeKind::Scalar: + return yamlize(io, PolymorphicTraits::getAsScalar(Val), true, Ctx); + case NodeKind::Map: + return yamlize(io, PolymorphicTraits::getAsMap(Val), true, Ctx); + case NodeKind::Sequence: + return yamlize(io, PolymorphicTraits::getAsSequence(Val), true, Ctx); + } +} + template typename std::enable_if::value, void>::type yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) { @@ -1250,6 +1395,8 @@ private: void endBitSetScalar() override; void scalarString(StringRef &, QuotingType) override; void blockScalarString(StringRef &) override; + void scalarTag(std::string &) override; + NodeKind getNodeKind() override; void setError(const Twine &message) override; bool canElideEmptySequence() override; @@ -1395,6 +1542,8 @@ public: void endBitSetScalar() override; void scalarString(StringRef &, QuotingType) override; void blockScalarString(StringRef &) override; + void scalarTag(std::string &) override; + NodeKind getNodeKind() override; void setError(const Twine &message) override; bool canElideEmptySequence() override; @@ -1414,14 +1563,21 @@ private: void flowKey(StringRef Key); enum InState { - inSeq, - inFlowSeq, + inSeqFirstElement, + inSeqOtherElement, + inFlowSeqFirstElement, + inFlowSeqOtherElement, inMapFirstKey, inMapOtherKey, inFlowMapFirstKey, inFlowMapOtherKey }; + static bool inSeqAnyElement(InState State); + static bool inFlowSeqAnyElement(InState State); + static bool inMapAnyKey(InState State); + static bool inFlowMapAnyKey(InState State); + raw_ostream &Out; int WrapColumn; SmallVector StateStack; @@ -1557,6 +1713,16 @@ operator>>(Input &In, T &Val) { return In; } +// Define non-member operator>> so that Input can stream in a polymorphic type. +template +inline typename std::enable_if::value, Input &>::type +operator>>(Input &In, T &Val) { + EmptyContext Ctx; + if (In.setCurrentDocument()) + yamlize(In, Val, true, Ctx); + return In; +} + // Provide better error message about types missing a trait specialization template inline typename std::enable_if::value, @@ -1645,6 +1811,24 @@ operator<<(Output &Out, T &Val) { return Out; } +// Define non-member operator<< so that Output can stream out a polymorphic +// type. +template +inline typename std::enable_if::value, Output &>::type +operator<<(Output &Out, T &Val) { + EmptyContext Ctx; + Out.beginDocuments(); + if (Out.preflightDocument(0)) { + // FIXME: The parser does not support explicit documents terminated with a + // plain scalar; the end-marker is included as part of the scalar token. + assert(PolymorphicTraits::getKind(Val) != NodeKind::Scalar && "plain scalar documents are not supported"); + yamlize(Out, Val, true, Ctx); + Out.postflightDocument(); + } + Out.endDocuments(); + return Out; +} + // Provide better error message about types missing a trait specialization template inline typename std::enable_if::value, diff --git a/include/llvm/Support/type_traits.h b/include/llvm/Support/type_traits.h index 55d84f138f071ca35b47efded0a7c965880a233e..e7b8f2517b8aa975a26923eb07b534f48e77a737 100644 --- a/include/llvm/Support/type_traits.h +++ b/include/llvm/Support/type_traits.h @@ -30,9 +30,10 @@ namespace llvm { template struct isPodLike { // std::is_trivially_copyable is available in libc++ with clang, libstdc++ - // that comes with GCC 5. + // that comes with GCC 5. MSVC 2015 and newer also have + // std::is_trivially_copyable. #if (__has_feature(is_trivially_copyable) && defined(_LIBCPP_VERSION)) || \ - (defined(__GNUC__) && __GNUC__ >= 5) + (defined(__GNUC__) && __GNUC__ >= 5) || defined(_MSC_VER) // If the compiler supports the is_trivially_copyable trait use it, as it // matches the definition of isPodLike closely. static const bool value = std::is_trivially_copyable::value; diff --git a/include/llvm/Target/GenericOpcodes.td b/include/llvm/Target/GenericOpcodes.td index af4fa8a1f04a76ee2329f78e9ab74252ee4d2b90..775221a9f9772c7236dcab85bf49c4b96171ed86 100644 --- a/include/llvm/Target/GenericOpcodes.td +++ b/include/llvm/Target/GenericOpcodes.td @@ -540,6 +540,13 @@ def G_FLOG2 : GenericInstruction { let hasSideEffects = 0; } +// Floating point base-10 logarithm of a value. +def G_FLOG10 : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1); + let hasSideEffects = 0; +} + //------------------------------------------------------------------------------ // Opcodes for LLVM Intrinsics //------------------------------------------------------------------------------ @@ -675,6 +682,28 @@ def G_MERGE_VALUES : GenericInstruction { let hasSideEffects = 0; } +/// Create a vector from multiple scalar registers. +def G_BUILD_VECTOR : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src0, variable_ops); + let hasSideEffects = 0; +} + +/// Like G_BUILD_VECTOR, but truncates the larger operand types to fit the +/// destination vector elt type. +def G_BUILD_VECTOR_TRUNC : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src0, variable_ops); + let hasSideEffects = 0; +} + +/// Create a vector by concatenating vectors together. +def G_CONCAT_VECTORS : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src0, variable_ops); + let hasSideEffects = 0; +} + // Intrinsic without side effects. def G_INTRINSIC : GenericInstruction { let OutOperandList = (outs); diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td index abb1bb431f66865210ae60089a7d8a3fc2ea0a7d..e4b827babb92c71a3e8f1a52317d414f7588ec94 100644 --- a/include/llvm/Target/Target.td +++ b/include/llvm/Target/Target.td @@ -479,6 +479,7 @@ class Instruction { bit isInsertSubreg = 0; // Is this instruction a kind of insert subreg? // If so, make sure to override // TargetInstrInfo::getInsertSubregLikeInputs. + bit variadicOpsAreDefs = 0; // Are variadic operands definitions? // Does the instruction have side effects that are not captured by any // operands of the instruction or other flags? diff --git a/include/llvm/Target/TargetInstrPredicate.td b/include/llvm/Target/TargetInstrPredicate.td index e70da00979040ad76c03e2437130539db4cf8036..4b2c57b34c2e2035599cd09e41b3a726e72600e2 100644 --- a/include/llvm/Target/TargetInstrPredicate.td +++ b/include/llvm/Target/TargetInstrPredicate.td @@ -39,7 +39,7 @@ // processor scheduling model. // // The `MCInstPredicateExample` definition above is equivalent (and therefore -// could replace) the following definition from the ExynosM3 model (see +// could replace) the following definition from a previous ExynosM3 model (see // AArch64SchedExynosM3.td): // // def M3BranchLinkFastPred : SchedPredicate<[{ diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h index 40b77c3d94a01a65a4d65787de0b27675f1a20a9..e80f2bf82f26d6eca31fedd7361efcae509c1960 100644 --- a/include/llvm/Target/TargetLoweringObjectFile.h +++ b/include/llvm/Target/TargetLoweringObjectFile.h @@ -201,6 +201,12 @@ public: virtual void emitLinkerFlagsForUsed(raw_ostream &OS, const GlobalValue *GV) const {} + /// If supported, return the section to use for the llvm.commandline + /// metadata. Otherwise, return nullptr. + virtual MCSection *getSectionForCommandLines() const { + return nullptr; + } + protected: virtual MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index f968fa80d500f0c7c89d3706e4515396f5587bb3..3eafcc25583aceaacb8b242360646868c1e78dde 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -201,6 +201,9 @@ public: bool getO0WantsFastISel() { return O0WantsFastISel; } void setO0WantsFastISel(bool Enable) { O0WantsFastISel = Enable; } void setGlobalISel(bool Enable) { Options.EnableGlobalISel = Enable; } + void setGlobalISelAbort(GlobalISelAbortMode Mode) { + Options.GlobalISelAbort = Mode; + } void setMachineOutliner(bool Enable) { Options.EnableMachineOutliner = Enable; } @@ -351,6 +354,23 @@ public: } }; +/// Helper method for getting the code model, returning Default if +/// CM does not have a value. The tiny and kernel models will produce +/// an error, so targets that support them or require more complex codemodel +/// selection logic should implement and call their own getEffectiveCodeModel. +inline CodeModel::Model getEffectiveCodeModel(Optional CM, + CodeModel::Model Default) { + if (CM) { + // By default, targets do not support the tiny and kernel models. + if (*CM == CodeModel::Tiny) + report_fatal_error("Target does not support the tiny CodeModel"); + if (*CM == CodeModel::Kernel) + report_fatal_error("Target does not support the kernel CodeModel"); + return *CM; + } + return Default; +} + } // end namespace llvm #endif // LLVM_TARGET_TARGETMACHINE_H diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h index 07ed773de55e7874865a38149513a25b09904e9f..b18101d92833d590020cfa0ea232886137497a54 100644 --- a/include/llvm/Target/TargetOptions.h +++ b/include/llvm/Target/TargetOptions.h @@ -96,6 +96,14 @@ namespace llvm { SCE // Tune debug info for SCE targets (e.g. PS4). }; + /// Enable abort calls when global instruction selection fails to lower/select + /// an instruction. + enum class GlobalISelAbortMode { + Disable, // Disable the abort. + Enable, // Enable the abort. + DisableWithDiag // Disable the abort but emit a diagnostic on failure. + }; + class TargetOptions { public: TargetOptions() @@ -192,6 +200,10 @@ namespace llvm { /// EnableGlobalISel - This flag enables global instruction selection. unsigned EnableGlobalISel : 1; + /// EnableGlobalISelAbort - Control abort behaviour when global instruction + /// selection fails to lower/select an instruction. + GlobalISelAbortMode GlobalISelAbort = GlobalISelAbortMode::Enable; + /// UseInitArray - Use .init_array instead of .ctors for static /// constructors. unsigned UseInitArray : 1; diff --git a/include/llvm/Target/TargetPfmCounters.td b/include/llvm/Target/TargetPfmCounters.td index 0a55a558f30a52405607ddfc8f3adb5acf9519d4..dac150f03445a109461f6da55558ff9f59fb4087 100644 --- a/include/llvm/Target/TargetPfmCounters.td +++ b/include/llvm/Target/TargetPfmCounters.td @@ -44,3 +44,7 @@ class PfmCountersBinding { string CpuName = cpu_name; ProcPfmCounters Counters = counters; } + +// Declares the default binding for unbound CPUs for the target. +class PfmCountersDefaultBinding + : PfmCountersBinding<"", counters> {} diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td index 141e06693887cb9a9c869488b6a0ac9b6344e5ce..808e183f5a5f9b57aa79cf04dd045d903286e23c 100644 --- a/include/llvm/Target/TargetSchedule.td +++ b/include/llvm/Target/TargetSchedule.td @@ -373,6 +373,10 @@ class SchedPredicate : SchedPredicateBase { SchedMachineModel SchedModel = ?; code Predicate = pred; } + +// Define a predicate to be typically used as the default case in a +// SchedVariant. It the SchedVariant does not use any other predicate based on +// MCSchedPredicate, this is the default scheduling case used by llvm-mca. def NoSchedPred : MCSchedPredicate; // Associate a predicate with a list of SchedReadWrites. By default, @@ -557,3 +561,13 @@ class RetireControlUnit { int MaxRetirePerCycle = retirePerCycle; SchedMachineModel SchedModel = ?; } + +// Base class for Load/StoreQueue. It is used to identify processor resources +// which describe load/store queues in the LS unit. +class MemoryQueue { + ProcResource QueueDescriptor = PR; + SchedMachineModel SchedModel = ?; +} + +class LoadQueue : MemoryQueue; +class StoreQueue : MemoryQueue; diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td index 532e866be55426a23a2277bd72bf60205e1b92a5..eb5a14bd21b8a6c461d7c83e0f7bbdf7349e686b 100644 --- a/include/llvm/Target/TargetSelectionDAG.td +++ b/include/llvm/Target/TargetSelectionDAG.td @@ -116,12 +116,18 @@ def SDTIntBinOp : SDTypeProfile<1, 2, [ // add, and, or, xor, udiv, etc. def SDTIntShiftOp : SDTypeProfile<1, 2, [ // shl, sra, srl SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2> ]>; +def SDTIntShiftDOp: SDTypeProfile<1, 3, [ // fshl, fshr + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> +]>; def SDTIntSatNoShOp : SDTypeProfile<1, 2, [ // ssat with no shift SDTCisSameAs<0, 1>, SDTCisInt<2> ]>; def SDTIntBinHiLoOp : SDTypeProfile<2, 2, [ // mulhi, mullo, sdivrem, udivrem SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,SDTCisInt<0> ]>; +def SDTIntScaledBinOp : SDTypeProfile<1, 3, [ // smulfix + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> +]>; def SDTFPBinOp : SDTypeProfile<1, 2, [ // fadd, fmul, etc. SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0> @@ -162,7 +168,7 @@ def SDTExtInreg : SDTypeProfile<1, 2, [ // sext_inreg ]>; def SDTExtInvec : SDTypeProfile<1, 1, [ // sext_invec SDTCisInt<0>, SDTCisVec<0>, SDTCisInt<1>, SDTCisVec<1>, - SDTCisOpSmallerThanOp<1, 0>, SDTCisSameSizeAs<0,1> + SDTCisOpSmallerThanOp<1, 0> ]>; def SDTSetCC : SDTypeProfile<1, 3, [ // setcc @@ -350,6 +356,8 @@ def sra : SDNode<"ISD::SRA" , SDTIntShiftOp>; def shl : SDNode<"ISD::SHL" , SDTIntShiftOp>; def rotl : SDNode<"ISD::ROTL" , SDTIntShiftOp>; def rotr : SDNode<"ISD::ROTR" , SDTIntShiftOp>; +def fshl : SDNode<"ISD::FSHL" , SDTIntShiftDOp>; +def fshr : SDNode<"ISD::FSHR" , SDTIntShiftDOp>; def and : SDNode<"ISD::AND" , SDTIntBinOp, [SDNPCommutative, SDNPAssociative]>; def or : SDNode<"ISD::OR" , SDTIntBinOp, @@ -377,6 +385,7 @@ def saddsat : SDNode<"ISD::SADDSAT" , SDTIntBinOp, [SDNPCommutative]>; def uaddsat : SDNode<"ISD::UADDSAT" , SDTIntBinOp, [SDNPCommutative]>; def ssubsat : SDNode<"ISD::SSUBSAT" , SDTIntBinOp>; def usubsat : SDNode<"ISD::USUBSAT" , SDTIntBinOp>; +def smulfix : SDNode<"ISD::SMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>; def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>; def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>; @@ -627,6 +636,15 @@ class PatFrags frags, code pred = [{}], code ImmediateCode = [{}]; SDNodeXForm OperandTransform = xform; + // When this is set, the PredicateCode may refer to a constant Operands + // vector which contains the captured nodes of the DAG, in the order listed + // by the Operands field above. + // + // This is useful when Fragments involves associative / commutative + // operators: a single piece of code can easily refer to all operands even + // when re-associated / commuted variants of the fragment are matched. + bit PredicateCodeUsesOperands = 0; + // Define a few pre-packaged predicates. This helps GlobalISel import // existing rules from SelectionDAG for many common cases. // They will be tested prior to the code in pred and must not be used in diff --git a/include/llvm/TextAPI/ELF/ELFStub.h b/include/llvm/TextAPI/ELF/ELFStub.h new file mode 100644 index 0000000000000000000000000000000000000000..fa54e6f8b711d7018d245df9e1d75e9b74df5782 --- /dev/null +++ b/include/llvm/TextAPI/ELF/ELFStub.h @@ -0,0 +1,69 @@ +//===- ELFStub.h ------------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===-----------------------------------------------------------------------===/ +/// +/// \file +/// This file defines an internal representation of an ELF stub. +/// +//===-----------------------------------------------------------------------===/ + +#ifndef LLVM_TEXTAPI_ELF_ELFSTUB_H +#define LLVM_TEXTAPI_ELF_ELFSTUB_H + +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/Support/VersionTuple.h" +#include +#include + +namespace llvm { +namespace elfabi { + +typedef uint16_t ELFArch; + +enum class ELFSymbolType { + NoType = ELF::STT_NOTYPE, + Object = ELF::STT_OBJECT, + Func = ELF::STT_FUNC, + TLS = ELF::STT_TLS, + + // Type information is 4 bits, so 16 is safely out of range. + Unknown = 16, +}; + +struct ELFSymbol { + ELFSymbol(std::string SymbolName) : Name(SymbolName) {} + std::string Name; + uint64_t Size; + ELFSymbolType Type; + bool Undefined; + bool Weak; + Optional Warning; + bool operator<(const ELFSymbol &RHS) const { + return Name < RHS.Name; + } +}; + +// A cumulative representation of ELF stubs. +// Both textual and binary stubs will read into and write from this object. +class ELFStub { +// TODO: Add support for symbol versioning. +public: + VersionTuple TbeVersion; + Optional SoName; + ELFArch Arch; + std::vector NeededLibs; + std::set Symbols; + + ELFStub() {} + ELFStub(const ELFStub &Stub); + ELFStub(ELFStub &&Stub); +}; +} // end namespace elfabi +} // end namespace llvm + +#endif // LLVM_TEXTAPI_ELF_ELFSTUB_H diff --git a/include/llvm/TextAPI/ELF/TBEHandler.h b/include/llvm/TextAPI/ELF/TBEHandler.h new file mode 100644 index 0000000000000000000000000000000000000000..91521c656fa230c90c2af273d516bd02176ef079 --- /dev/null +++ b/include/llvm/TextAPI/ELF/TBEHandler.h @@ -0,0 +1,45 @@ +//===- TBEHandler.h ---------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===-----------------------------------------------------------------------===/ +/// +/// \file +/// This file declares an interface for reading and writing .tbe (text-based +/// ELF) files. +/// +//===-----------------------------------------------------------------------===/ + +#ifndef LLVM_TEXTAPI_ELF_TBEHANDLER_H +#define LLVM_TEXTAPI_ELF_TBEHANDLER_H + +#include "llvm/Support/VersionTuple.h" +#include "llvm/Support/Error.h" +#include + +namespace llvm { + +class raw_ostream; +class Error; +class StringRef; +class VersionTuple; + +namespace elfabi { + +class ELFStub; + +const VersionTuple TBEVersionCurrent(1, 0); + +/// Attempts to read an ELF interface file from a StringRef buffer. +Expected> readTBEFromBuffer(StringRef Buf); + +/// Attempts to write an ELF interface file to a raw_ostream. +Error writeTBEToOutputStream(raw_ostream &OS, const ELFStub &Stub); + +} // end namespace elfabi +} // end namespace llvm + +#endif // LLVM_TEXTAPI_ELF_TBEHANDLER_H diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h index 5ad880574ef7109637f2f7386b5b69d71ac1e648..c2103b637266d877264a4361b9d7e9b266601b84 100644 --- a/include/llvm/Transforms/IPO/FunctionImport.h +++ b/include/llvm/Transforms/IPO/FunctionImport.h @@ -176,6 +176,14 @@ void computeDeadSymbols( const DenseSet &GUIDPreservedSymbols, function_ref isPrevailing); +/// Compute dead symbols and run constant propagation in combined index +/// after that. +void computeDeadSymbolsWithConstProp( + ModuleSummaryIndex &Index, + const DenseSet &GUIDPreservedSymbols, + function_ref isPrevailing, + bool ImportEnabled); + /// Converts value \p GV to declaration, or replaces with a declaration if /// it is an alias. Returns true if converted, false if replaced. bool convertToDeclaration(GlobalValue &GV); @@ -201,10 +209,10 @@ std::error_code EmitImportsFiles( StringRef ModulePath, StringRef OutputFilename, const std::map &ModuleToSummariesForIndex); -/// Resolve WeakForLinker values in \p TheModule based on the information +/// Resolve prevailing symbol linkages in \p TheModule based on the information /// recorded in the summaries during global summary-based analysis. -void thinLTOResolveWeakForLinkerModule(Module &TheModule, - const GVSummaryMapTy &DefinedGlobals); +void thinLTOResolvePrevailingInModule(Module &TheModule, + const GVSummaryMapTy &DefinedGlobals); /// Internalize \p TheModule based on the information recorded in the summaries /// during global summary-based analysis. diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index 81ed0a2237e97be93e548bc85dbace334a2b3e89..3624cd266af5e780239a8b06de84845470a52a42 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -77,6 +77,12 @@ struct GCOVOptions { // Emit the exit block immediately after the start block, rather than after // all of the function body's blocks. bool ExitBlockBeforeBody; + + // Regexes separated by a semi-colon to filter the files to instrument. + std::string Filter; + + // Regexes separated by a semi-colon to filter the files to not instrument. + std::string Exclude; }; ModulePass *createGCOVProfilerPass(const GCOVOptions &Options = @@ -143,7 +149,8 @@ FunctionPass *createAddressSanitizerFunctionPass(bool CompileKernel = false, bool UseAfterScope = false); ModulePass *createAddressSanitizerModulePass(bool CompileKernel = false, bool Recover = false, - bool UseGlobalsGC = true); + bool UseGlobalsGC = true, + bool UseOdrIndicator = true); // Insert MemorySanitizer instrumentation (detection of uninitialized reads) FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0, diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index fe4ff621c6f0eb66858dbb61dba404e5488df55f..9019c3be4944d6a12c23f0ae5c24371dad8ef3b5 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -183,11 +183,12 @@ Pass *createLoopInstSimplifyPass(); // // LoopUnroll - This pass is a simple loop unrolling pass. // -Pass *createLoopUnrollPass(int OptLevel = 2, int Threshold = -1, int Count = -1, +Pass *createLoopUnrollPass(int OptLevel = 2, bool OnlyWhenForced = false, + int Threshold = -1, int Count = -1, int AllowPartial = -1, int Runtime = -1, int UpperBound = -1, int AllowPeeling = -1); // Create an unrolling pass for full unrolling that uses exact trip count only. -Pass *createSimpleLoopUnrollPass(int OptLevel = 2); +Pass *createSimpleLoopUnrollPass(int OptLevel = 2, bool OnlyWhenForced = false); //===----------------------------------------------------------------------===// // @@ -391,12 +392,6 @@ FunctionPass *createLowerExpectIntrinsicPass(); // FunctionPass *createPartiallyInlineLibCallsPass(); -//===----------------------------------------------------------------------===// -// -// ScalarizerPass - Converts vector operations into scalar operations -// -FunctionPass *createScalarizerPass(); - //===----------------------------------------------------------------------===// // // SeparateConstOffsetFromGEP - Split GEPs for better CSE @@ -490,6 +485,13 @@ FunctionPass *createLibCallsShrinkWrapPass(); // primarily to help other loop passes. // Pass *createLoopSimplifyCFGPass(); + +//===----------------------------------------------------------------------===// +// +// WarnMissedTransformations - This pass emits warnings for leftover forced +// transformations. +// +Pass *createWarnMissedTransformationsPass(); } // End llvm namespace #endif diff --git a/include/llvm/Transforms/Scalar/JumpThreading.h b/include/llvm/Transforms/Scalar/JumpThreading.h index c8376dd80df34b4161e407cad3e3656c92e8be75..78518941ceda94b9789e044cc7de4036b42cbced 100644 --- a/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/include/llvm/Transforms/Scalar/JumpThreading.h @@ -89,22 +89,9 @@ class JumpThreadingPass : public PassInfoMixin { #else SmallSet, 16> LoopHeaders; #endif - DenseSet> RecursionSet; unsigned BBDupThreshold; - // RAII helper for updating the recursion stack. - struct RecursionSetRemover { - DenseSet> &TheSet; - std::pair ThePair; - - RecursionSetRemover(DenseSet> &S, - std::pair P) - : TheSet(S), ThePair(P) {} - - ~RecursionSetRemover() { TheSet.erase(ThePair); } - }; - public: JumpThreadingPass(int T = -1); @@ -128,11 +115,21 @@ public: bool DuplicateCondBranchOnPHIIntoPred( BasicBlock *BB, const SmallVectorImpl &PredBBs); + bool ComputeValueKnownInPredecessorsImpl( + Value *V, BasicBlock *BB, jumpthreading::PredValueInfo &Result, + jumpthreading::ConstantPreference Preference, + DenseSet> &RecursionSet, + Instruction *CxtI = nullptr); bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, jumpthreading::PredValueInfo &Result, jumpthreading::ConstantPreference Preference, - Instruction *CxtI = nullptr); + Instruction *CxtI = nullptr) { + DenseSet> RecursionSet; + return ComputeValueKnownInPredecessorsImpl(V, BB, Result, Preference, + RecursionSet, CxtI); + } + bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB, jumpthreading::ConstantPreference Preference, Instruction *CxtI = nullptr); diff --git a/include/llvm/Transforms/Scalar/LoopPassManager.h b/include/llvm/Transforms/Scalar/LoopPassManager.h index e54960d199c270f72290a1bfa5eecf094630b07c..46ebb74c413c5991bd9d0c0a6cd6f3a7b59e1956 100644 --- a/include/llvm/Transforms/Scalar/LoopPassManager.h +++ b/include/llvm/Transforms/Scalar/LoopPassManager.h @@ -352,7 +352,11 @@ public: continue; PreservedAnalyses PassPA = Pass.run(*L, LAM, LAR, Updater); - PI.runAfterPass(Pass, *L); + // Do not pass deleted Loop into the instrumentation. + if (Updater.skipCurrentLoop()) + PI.runAfterPassInvalidated(Pass); + else + PI.runAfterPass(Pass, *L); // FIXME: We should verify the set of analyses relevant to Loop passes // are preserved. diff --git a/include/llvm/Transforms/Scalar/LoopUnrollPass.h b/include/llvm/Transforms/Scalar/LoopUnrollPass.h index 20c9a26b98cf090caea2e5b452776b3e66457f63..e38e983cc9eb69b3d3efc3f338f506ef13dc7a6a 100644 --- a/include/llvm/Transforms/Scalar/LoopUnrollPass.h +++ b/include/llvm/Transforms/Scalar/LoopUnrollPass.h @@ -24,8 +24,14 @@ class LPMUpdater; class LoopFullUnrollPass : public PassInfoMixin { const int OptLevel; + /// If false, use a cost model to determine whether unrolling of a loop is + /// profitable. If true, only loops that explicitly request unrolling via + /// metadata are considered. All other loops are skipped. + const bool OnlyWhenForced; + public: - explicit LoopFullUnrollPass(int OptLevel = 2) : OptLevel(OptLevel) {} + explicit LoopFullUnrollPass(int OptLevel = 2, bool OnlyWhenForced = false) + : OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced) {} PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); @@ -50,7 +56,13 @@ struct LoopUnrollOptions { Optional AllowUpperBound; int OptLevel; - LoopUnrollOptions(int OptLevel = 2) : OptLevel(OptLevel) {} + /// If false, use a cost model to determine whether unrolling of a loop is + /// profitable. If true, only loops that explicitly request unrolling via + /// metadata are considered. All other loops are skipped. + bool OnlyWhenForced; + + LoopUnrollOptions(int OptLevel = 2, bool OnlyWhenForced = false) + : OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced) {} /// Enables or disables partial unrolling. When disabled only full unrolling /// is allowed. diff --git a/include/llvm/Transforms/Scalar/MakeGuardsExplicit.h b/include/llvm/Transforms/Scalar/MakeGuardsExplicit.h new file mode 100644 index 0000000000000000000000000000000000000000..41b4aada2baac97310b301167139e46611ec1b9a --- /dev/null +++ b/include/llvm/Transforms/Scalar/MakeGuardsExplicit.h @@ -0,0 +1,47 @@ +//===-- MakeGuardsExplicit.h - Turn guard intrinsics into guard branches --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers the @llvm.experimental.guard intrinsic to the new form of +// guard represented as widenable explicit branch to the deopt block. The +// difference between this pass and LowerGuardIntrinsic is that after this pass +// the guard represented as intrinsic: +// +// call void(i1, ...) @llvm.experimental.guard(i1 %old_cond) [ "deopt"() ] +// +// transforms to a guard represented as widenable explicit branch: +// +// %widenable_cond = call i1 @llvm.experimental.widenable.condition() +// br i1 (%old_cond & %widenable_cond), label %guarded, label %deopt +// +// Here: +// - The semantics of @llvm.experimental.widenable.condition allows to replace +// %widenable_cond with the construction (%widenable_cond & %any_other_cond) +// without loss of correctness; +// - %guarded is the lower part of old guard intrinsic's parent block split by +// the intrinsic call; +// - %deopt is a block containing a sole call to @llvm.experimental.deoptimize +// intrinsic. +// +// Therefore, this branch preserves the property of widenability. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_SCALAR_MAKEGUARDSEXPLICIT_H +#define LLVM_TRANSFORMS_SCALAR_MAKEGUARDSEXPLICIT_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +struct MakeGuardsExplicitPass : public PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // namespace llvm + +#endif //LLVM_TRANSFORMS_SCALAR_MAKEGUARDSEXPLICIT_H diff --git a/include/llvm/Transforms/Scalar/SCCP.h b/include/llvm/Transforms/Scalar/SCCP.h index e434d41d034fd92f9eee1be5ebc9fcf867b9a3f5..0abbb32fde6a03cd78a5bc21e8826f21acb85bc6 100644 --- a/include/llvm/Transforms/Scalar/SCCP.h +++ b/include/llvm/Transforms/Scalar/SCCP.h @@ -31,7 +31,7 @@ namespace llvm { -class Function; +class PostDominatorTree; /// This pass performs function-level constant propagation and merging. class SCCPPass : public PassInfoMixin { @@ -39,9 +39,15 @@ public: PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; -bool runIPSCCP( - Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI, - function_ref(Function &)> getPredicateInfo); +/// Helper struct for bundling up the analysis results per function for IPSCCP. +struct AnalysisResultsForFn { + std::unique_ptr PredInfo; + DominatorTree *DT; + PostDominatorTree *PDT; +}; + +bool runIPSCCP(Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI, + function_ref getAnalysis); } // end namespace llvm #endif // LLVM_TRANSFORMS_SCALAR_SCCP_H diff --git a/include/llvm/Transforms/Scalar/Scalarizer.h b/include/llvm/Transforms/Scalar/Scalarizer.h new file mode 100644 index 0000000000000000000000000000000000000000..1a0b9a2b638c8a2b77ab3f3a300a468ddc3db1a9 --- /dev/null +++ b/include/llvm/Transforms/Scalar/Scalarizer.h @@ -0,0 +1,35 @@ +//===- Scalarizer.h --- Scalarize vector operations -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass converts vector operations into scalar operations, in order +/// to expose optimization opportunities on the individual scalar operations. +/// It is mainly intended for targets that do not have vector units, but it +/// may also be useful for revectorizing code to different vector widths. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_SCALARIZER_H +#define LLVM_TRANSFORMS_SCALAR_SCALARIZER_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class ScalarizerPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// Create a legacy pass manager instance of the Scalarizer pass +FunctionPass *createScalarizerPass(); + +} + +#endif /* LLVM_TRANSFORMS_SCALAR_SCALARIZER_H */ diff --git a/include/llvm/Transforms/Scalar/WarnMissedTransforms.h b/include/llvm/Transforms/Scalar/WarnMissedTransforms.h new file mode 100644 index 0000000000000000000000000000000000000000..018b22a932e6cf1bd95b190d76f7c14f04967080 --- /dev/null +++ b/include/llvm/Transforms/Scalar/WarnMissedTransforms.h @@ -0,0 +1,38 @@ +//===- WarnMissedTransforms.h -----------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Emit warnings if forced code transformations have not been performed. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_WARNMISSEDTRANSFORMS_H +#define LLVM_TRANSFORMS_SCALAR_WARNMISSEDTRANSFORMS_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { +class Function; +class Loop; +class LPMUpdater; + +// New pass manager boilerplate. +class WarnMissedTransformationsPass + : public PassInfoMixin { +public: + explicit WarnMissedTransformationsPass() {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +// Legacy pass manager boilerplate. +Pass *createWarnMissedTransformationsPass(); +void initializeWarnMissedTransformationsLegacyPass(PassRegistry &); +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_WARNMISSEDTRANSFORMS_H diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h index d7dce53fc76141caa29201a1fb0d056a55a40bdc..f5e997324fc8917d05321adb2dc0bab6165d9ddb 100644 --- a/include/llvm/Transforms/Utils/Cloning.h +++ b/include/llvm/Transforms/Utils/Cloning.h @@ -47,6 +47,7 @@ class LoopInfo; class Module; class ProfileSummaryInfo; class ReturnInst; +class DomTreeUpdater; /// Return an exact copy of the specified module std::unique_ptr CloneModule(const Module &M); @@ -262,11 +263,12 @@ void remapInstructionsInBlocks(const SmallVectorImpl &Blocks, /// we replace them with the uses of corresponding Phi inputs. ValueMapping /// is used to map the original instructions from BB to their newly-created /// copies. Returns the split block. -BasicBlock * -DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB, - Instruction *StopAt, - ValueToValueMapTy &ValueMapping, - DominatorTree *DT = nullptr); +BasicBlock *DuplicateInstructionsInSplitBetween(BasicBlock *BB, + BasicBlock *PredBB, + Instruction *StopAt, + ValueToValueMapTy &ValueMapping, + DomTreeUpdater &DTU); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_CLONING_H diff --git a/include/llvm/Transforms/Utils/CodeExtractor.h b/include/llvm/Transforms/Utils/CodeExtractor.h index 13bef8418057002230a375cdff52094265a0e223..498ff5a7883f3157b2760f7e8eb82cec8d74694b 100644 --- a/include/llvm/Transforms/Utils/CodeExtractor.h +++ b/include/llvm/Transforms/Utils/CodeExtractor.h @@ -18,6 +18,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include namespace llvm { @@ -146,7 +147,8 @@ class Value; BasicBlock *findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock); private: - void severSplitPHINodes(BasicBlock *&Header); + void severSplitPHINodesOfEntry(BasicBlock *&Header); + void severSplitPHINodesOfExits(const SmallPtrSetImpl &Exits); void splitReturnBlocks(); Function *constructFunction(const ValueSet &inputs, diff --git a/include/llvm/Transforms/Utils/FunctionImportUtils.h b/include/llvm/Transforms/Utils/FunctionImportUtils.h index b9fbef04cdc3d6a7c0741a12b0cd41cef8c28d28..e24398b90012b355d0f409df813d3f758b1fb0c7 100644 --- a/include/llvm/Transforms/Utils/FunctionImportUtils.h +++ b/include/llvm/Transforms/Utils/FunctionImportUtils.h @@ -114,6 +114,9 @@ bool renameModuleForThinLTO( Module &M, const ModuleSummaryIndex &Index, SetVector *GlobalsToImport = nullptr); +/// Compute synthetic function entry counts. +void computeSyntheticCounts(ModuleSummaryIndex &Index); + } // End llvm namespace #endif diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index 86a32bb63005984407cc3b0ffc2592565626901c..ec8b0eda36416a38c57207503c1c49d0fad3aefd 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -174,6 +174,12 @@ bool RecursivelyDeleteDeadPHINode(PHINode *PN, bool SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI = nullptr); +/// Replace all the uses of an SSA value in @llvm.dbg intrinsics with +/// undef. This is useful for signaling that a variable, e.g. has been +/// found dead and hence it's unavailable at a given program point. +/// Returns true if the dbg values have been changed. +bool replaceDbgUsesWithUndef(Instruction *I); + //===----------------------------------------------------------------------===// // Control Flow Graph Restructuring. // diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h index f642852275c04fe066ab814019288045c905e361..faf9131e6aca44d3a005481676632791c854a496 100644 --- a/include/llvm/Transforms/Utils/LoopUtils.h +++ b/include/llvm/Transforms/Utils/LoopUtils.h @@ -171,6 +171,77 @@ SmallVector findDefsUsedOutsideOfLoop(Loop *L); Optional findStringMetadataForLoop(Loop *TheLoop, StringRef Name); +/// Find named metadata for a loop with an integer value. +llvm::Optional getOptionalIntLoopAttribute(Loop *TheLoop, StringRef Name); + +/// Create a new loop identifier for a loop created from a loop transformation. +/// +/// @param OrigLoopID The loop ID of the loop before the transformation. +/// @param FollowupAttrs List of attribute names that contain attributes to be +/// added to the new loop ID. +/// @param InheritOptionsAttrsPrefix Selects which attributes should be inherited +/// from the original loop. The following values +/// are considered: +/// nullptr : Inherit all attributes from @p OrigLoopID. +/// "" : Do not inherit any attribute from @p OrigLoopID; only use +/// those specified by a followup attribute. +/// "": Inherit all attributes except those which start with +/// ; commonly used to remove metadata for the +/// applied transformation. +/// @param AlwaysNew If true, do not try to reuse OrigLoopID and never return +/// None. +/// +/// @return The loop ID for the after-transformation loop. The following values +/// can be returned: +/// None : No followup attribute was found; it is up to the +/// transformation to choose attributes that make sense. +/// @p OrigLoopID: The original identifier can be reused. +/// nullptr : The new loop has no attributes. +/// MDNode* : A new unique loop identifier. +Optional +makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef FollowupAttrs, + const char *InheritOptionsAttrsPrefix = "", + bool AlwaysNew = false); + +/// Look for the loop attribute that disables all transformation heuristic. +bool hasDisableAllTransformsHint(const Loop *L); + +/// The mode sets how eager a transformation should be applied. +enum TransformationMode { + /// The pass can use heuristics to determine whether a transformation should + /// be applied. + TM_Unspecified, + + /// The transformation should be applied without considering a cost model. + TM_Enable, + + /// The transformation should not be applied. + TM_Disable, + + /// Force is a flag and should not be used alone. + TM_Force = 0x04, + + /// The transformation was directed by the user, e.g. by a #pragma in + /// the source code. If the transformation could not be applied, a + /// warning should be emitted. + TM_ForcedByUser = TM_Enable | TM_Force, + + /// The transformation must not be applied. For instance, `#pragma clang loop + /// unroll(disable)` explicitly forbids any unrolling to take place. Unlike + /// general loop metadata, it must not be dropped. Most passes should not + /// behave differently under TM_Disable and TM_SuppressedByUser. + TM_SuppressedByUser = TM_Disable | TM_Force +}; + +/// @{ +/// Get the mode for LLVM's supported loop transformations. +TransformationMode hasUnrollTransformation(Loop *L); +TransformationMode hasUnrollAndJamTransformation(Loop *L); +TransformationMode hasVectorizeTransformation(Loop *L); +TransformationMode hasDistributeTransformation(Loop *L); +TransformationMode hasLICMVersioningTransformation(Loop *L); +/// @} + /// Set input string into loop metadata by keeping other values intact. void addStringMetadataToLoop(Loop *TheLoop, const char *MDString, unsigned V = 0); diff --git a/include/llvm/Transforms/Utils/UnrollLoop.h b/include/llvm/Transforms/Utils/UnrollLoop.h index a6b84af068a52c47fc8278f7de0e1c73c3aef5a6..70e936d75008a30ebcac45141013ba0f2eae8260 100644 --- a/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/include/llvm/Transforms/Utils/UnrollLoop.h @@ -35,6 +35,15 @@ class ScalarEvolution; using NewLoopsMap = SmallDenseMap; +/// @{ +/// Metadata attribute names +const char *const LLVMLoopUnrollFollowupAll = "llvm.loop.unroll.followup_all"; +const char *const LLVMLoopUnrollFollowupUnrolled = + "llvm.loop.unroll.followup_unrolled"; +const char *const LLVMLoopUnrollFollowupRemainder = + "llvm.loop.unroll.followup_remainder"; +/// @} + const Loop* addClonedBlockToLoopInfo(BasicBlock *OriginalBB, BasicBlock *ClonedBB, LoopInfo *LI, NewLoopsMap &NewLoops); @@ -61,15 +70,16 @@ LoopUnrollResult UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, unsigned PeelCount, bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, bool PreserveLCSSA); + OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, + Loop **RemainderLoop = nullptr); bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, bool AllowExpensiveTripCount, bool UseEpilogRemainder, bool UnrollRemainder, - LoopInfo *LI, - ScalarEvolution *SE, DominatorTree *DT, - AssumptionCache *AC, - bool PreserveLCSSA); + LoopInfo *LI, ScalarEvolution *SE, + DominatorTree *DT, AssumptionCache *AC, + bool PreserveLCSSA, + Loop **ResultLoop = nullptr); void computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, @@ -84,7 +94,8 @@ LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE); + OptimizationRemarkEmitter *ORE, + Loop **EpilogueLoop = nullptr); bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, DependenceInfo &DI); diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h index 950af7ffe05f5ca074209f278b1628b58e1a4c1d..70f9a2e0741b3a1aebcbe5c65a30c7fc872f9b52 100644 --- a/include/llvm/Transforms/Vectorize.h +++ b/include/llvm/Transforms/Vectorize.h @@ -110,8 +110,8 @@ struct VectorizeConfig { // // LoopVectorize - Create a loop vectorization pass. // -Pass *createLoopVectorizePass(bool NoUnrolling = false, - bool AlwaysVectorize = true); +Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced = false, + bool VectorizeOnlyWhenForced = false); //===----------------------------------------------------------------------===// // diff --git a/include/llvm/Transforms/Vectorize/LoadStoreVectorizer.h b/include/llvm/Transforms/Vectorize/LoadStoreVectorizer.h new file mode 100644 index 0000000000000000000000000000000000000000..6b37d7093c442c7c4c99c9e40ca631a660532638 --- /dev/null +++ b/include/llvm/Transforms/Vectorize/LoadStoreVectorizer.h @@ -0,0 +1,27 @@ +//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_LOADSTOREVECTORIZER_H +#define LLVM_TRANSFORMS_VECTORIZE_LOADSTOREVECTORIZER_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class LoadStoreVectorizerPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// Create a legacy pass manager instance of the LoadStoreVectorizer pass +Pass *createLoadStoreVectorizerPass(); + +} + +#endif /* LLVM_TRANSFORMS_VECTORIZE_LOADSTOREVECTORIZER_H */ diff --git a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index ceb660daa28cb5414e22a39bae2ac3b198be7510..5c7bba048607e5ef7e6e40c348956c8b0f527b24 100644 --- a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -95,7 +95,7 @@ public: FK_Enabled = 1, ///< Forcing enabled. }; - LoopVectorizeHints(const Loop *L, bool DisableInterleaving, + LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced, OptimizationRemarkEmitter &ORE); /// Mark the loop L as already vectorized by setting the width to 1. @@ -105,7 +105,8 @@ public: writeHintsToMetadata(Hints); } - bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const; + bool allowVectorization(Function *F, Loop *L, + bool VectorizeOnlyWhenForced) const; /// Dumps all the hint information. void emitRemarkWithHints() const; @@ -113,7 +114,12 @@ public: unsigned getWidth() const { return Width.Value; } unsigned getInterleave() const { return Interleave.Value; } unsigned getIsVectorized() const { return IsVectorized.Value; } - enum ForceKind getForce() const { return (ForceKind)Force.Value; } + enum ForceKind getForce() const { + if ((ForceKind)Force.Value == FK_Undefined && + hasDisableAllTransformsHint(TheLoop)) + return FK_Disabled; + return (ForceKind)Force.Value; + } /// If hints are provided that force vectorization, use the AlwaysPrint /// pass name to force the frontend to print the diagnostic. diff --git a/include/llvm/Transforms/Vectorize/LoopVectorize.h b/include/llvm/Transforms/Vectorize/LoopVectorize.h index d79d8469180347cac44dbf50e068cbbb8a7f4c8a..d9c4f7b023c14fa689463aebf7b70cbecf9a3289 100644 --- a/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ b/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -78,12 +78,13 @@ class TargetTransformInfo; /// The LoopVectorize Pass. struct LoopVectorizePass : public PassInfoMixin { - bool DisableUnrolling = false; + /// If false, consider all loops for interleaving. + /// If true, only loops that explicitly request interleaving are considered. + bool InterleaveOnlyWhenForced = false; - /// If true, consider all loops for vectorization. - /// If false, only loops that explicitly request vectorization are - /// considered. - bool AlwaysVectorize = true; + /// If false, consider all loops for vectorization. + /// If true, only loops that explicitly request vectorization are considered. + bool VectorizeOnlyWhenForced = false; ScalarEvolution *SE; LoopInfo *LI; diff --git a/include/llvm/XRay/FDRRecordProducer.h b/include/llvm/XRay/FDRRecordProducer.h index e67e22873f8d91cc1beeabb6935877e4f2ab8eb1..efdba2a67b7bec66cede54e29c97ca06ba912346 100644 --- a/include/llvm/XRay/FDRRecordProducer.h +++ b/include/llvm/XRay/FDRRecordProducer.h @@ -29,6 +29,11 @@ class FileBasedRecordProducer : public RecordProducer { const XRayFileHeader &Header; DataExtractor &E; uint32_t &OffsetPtr; + uint32_t CurrentBufferBytes = 0; + + // Helper function which gets the next record by speculatively reading through + // the log, finding a buffer extents record. + Expected> findNextBufferExtent(); public: FileBasedRecordProducer(const XRayFileHeader &FH, DataExtractor &DE, diff --git a/include/llvm/XRay/FDRRecords.h b/include/llvm/XRay/FDRRecords.h index 9d48332d5083d15e56b732937561c0e039486482..8a84f4d0c1fbc6e7427108005dab0a50de704fe6 100644 --- a/include/llvm/XRay/FDRRecords.h +++ b/include/llvm/XRay/FDRRecords.h @@ -14,10 +14,14 @@ #ifndef LLVM_LIB_XRAY_FDRRECORDS_H_ #define LLVM_LIB_XRAY_FDRRECORDS_H_ +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Error.h" #include "llvm/XRay/XRayRecord.h" -#include namespace llvm { namespace xray { @@ -26,21 +30,37 @@ class RecordVisitor; class RecordInitializer; class Record { -protected: - enum class Type { - Unknown, - Function, - Metadata, +public: + enum class RecordKind { + RK_Metadata, + RK_Metadata_BufferExtents, + RK_Metadata_WallClockTime, + RK_Metadata_NewCPUId, + RK_Metadata_TSCWrap, + RK_Metadata_CustomEvent, + RK_Metadata_CustomEventV5, + RK_Metadata_CallArg, + RK_Metadata_PIDEntry, + RK_Metadata_NewBuffer, + RK_Metadata_EndOfBuffer, + RK_Metadata_TypedEvent, + RK_Metadata_LastMetadata, + RK_Function, }; + static StringRef kindToString(RecordKind K); + +private: + const RecordKind T; + public: Record(const Record &) = delete; Record(Record &&) = delete; Record &operator=(const Record &) = delete; Record &operator=(Record &&) = delete; - Record() = default; + explicit Record(RecordKind T) : T(T) {} - virtual Type type() const = 0; + RecordKind getRecordType() const { return T; } // Each Record should be able to apply an abstract visitor, and choose the // appropriate function in the visitor to invoke, given its own type. @@ -50,10 +70,6 @@ public: }; class MetadataRecord : public Record { -protected: - static constexpr int kMetadataBodySize = 15; - friend class RecordInitializer; - public: enum class MetadataType : unsigned { Unknown, @@ -69,11 +85,22 @@ public: TypedEvent, }; - Type type() const override { return Type::Metadata; } +protected: + static constexpr int kMetadataBodySize = 15; + friend class RecordInitializer; + +private: + const MetadataType MT; - // All metadata records must know to provide the type of their open - // metadata record. - virtual MetadataType metadataType() const = 0; +public: + explicit MetadataRecord(RecordKind T, MetadataType M) : Record(T), MT(M) {} + + static bool classof(const Record *R) { + return R->getRecordType() >= RecordKind::RK_Metadata && + R->getRecordType() <= RecordKind::RK_Metadata_LastMetadata; + } + + MetadataType metadataType() const { return MT; } virtual ~MetadataRecord() = default; }; @@ -86,16 +113,22 @@ class BufferExtents : public MetadataRecord { friend class RecordInitializer; public: - BufferExtents() = default; - explicit BufferExtents(uint64_t S) : MetadataRecord(), Size(S) {} + BufferExtents() + : MetadataRecord(RecordKind::RK_Metadata_BufferExtents, + MetadataType::BufferExtents) {} - MetadataType metadataType() const override { - return MetadataType::BufferExtents; - } + explicit BufferExtents(uint64_t S) + : MetadataRecord(RecordKind::RK_Metadata_BufferExtents, + MetadataType::BufferExtents), + Size(S) {} uint64_t size() const { return Size; } Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Metadata_BufferExtents; + } }; class WallclockRecord : public MetadataRecord { @@ -104,18 +137,23 @@ class WallclockRecord : public MetadataRecord { friend class RecordInitializer; public: - WallclockRecord() = default; - explicit WallclockRecord(uint64_t S, uint32_t N) - : MetadataRecord(), Seconds(S), Nanos(N) {} + WallclockRecord() + : MetadataRecord(RecordKind::RK_Metadata_WallClockTime, + MetadataType::WallClockTime) {} - MetadataType metadataType() const override { - return MetadataType::WallClockTime; - } + explicit WallclockRecord(uint64_t S, uint32_t N) + : MetadataRecord(RecordKind::RK_Metadata_WallClockTime, + MetadataType::WallClockTime), + Seconds(S), Nanos(N) {} uint64_t seconds() const { return Seconds; } uint32_t nanos() const { return Nanos; } Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Metadata_WallClockTime; + } }; class NewCPUIDRecord : public MetadataRecord { @@ -124,16 +162,24 @@ class NewCPUIDRecord : public MetadataRecord { friend class RecordInitializer; public: - NewCPUIDRecord() = default; - NewCPUIDRecord(uint16_t C, uint64_t T) : MetadataRecord(), CPUId(C), TSC(T) {} + NewCPUIDRecord() + : MetadataRecord(RecordKind::RK_Metadata_NewCPUId, + MetadataType::NewCPUId) {} - MetadataType metadataType() const override { return MetadataType::NewCPUId; } + NewCPUIDRecord(uint16_t C, uint64_t T) + : MetadataRecord(RecordKind::RK_Metadata_NewCPUId, + MetadataType::NewCPUId), + CPUId(C), TSC(T) {} uint16_t cpuid() const { return CPUId; } uint64_t tsc() const { return TSC; } Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Metadata_NewCPUId; + } }; class TSCWrapRecord : public MetadataRecord { @@ -141,14 +187,21 @@ class TSCWrapRecord : public MetadataRecord { friend class RecordInitializer; public: - TSCWrapRecord() = default; - explicit TSCWrapRecord(uint64_t B) : MetadataRecord(), BaseTSC(B) {} + TSCWrapRecord() + : MetadataRecord(RecordKind::RK_Metadata_TSCWrap, MetadataType::TSCWrap) { + } - MetadataType metadataType() const override { return MetadataType::TSCWrap; } + explicit TSCWrapRecord(uint64_t B) + : MetadataRecord(RecordKind::RK_Metadata_TSCWrap, MetadataType::TSCWrap), + BaseTSC(B) {} uint64_t tsc() const { return BaseTSC; } Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Metadata_TSCWrap; + } }; class CustomEventRecord : public MetadataRecord { @@ -159,13 +212,14 @@ class CustomEventRecord : public MetadataRecord { friend class RecordInitializer; public: - CustomEventRecord() = default; - explicit CustomEventRecord(uint64_t S, uint64_t T, uint16_t C, std::string D) - : MetadataRecord(), Size(S), TSC(T), CPU(C), Data(std::move(D)) {} + CustomEventRecord() + : MetadataRecord(RecordKind::RK_Metadata_CustomEvent, + MetadataType::CustomEvent) {} - MetadataType metadataType() const override { - return MetadataType::CustomEvent; - } + explicit CustomEventRecord(uint64_t S, uint64_t T, uint16_t C, std::string D) + : MetadataRecord(RecordKind::RK_Metadata_CustomEvent, + MetadataType::CustomEvent), + Size(S), TSC(T), CPU(C), Data(std::move(D)) {} int32_t size() const { return Size; } uint64_t tsc() const { return TSC; } @@ -173,6 +227,10 @@ public: StringRef data() const { return Data; } Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Metadata_CustomEvent; + } }; class CustomEventRecordV5 : public MetadataRecord { @@ -182,19 +240,24 @@ class CustomEventRecordV5 : public MetadataRecord { friend class RecordInitializer; public: - CustomEventRecordV5() = default; - explicit CustomEventRecordV5(int32_t S, int32_t D, std::string P) - : MetadataRecord(), Size(S), Delta(D), Data(std::move(P)) {} + CustomEventRecordV5() + : MetadataRecord(RecordKind::RK_Metadata_CustomEventV5, + MetadataType::CustomEvent) {} - MetadataType metadataType() const override { - return MetadataType::CustomEvent; - } + explicit CustomEventRecordV5(int32_t S, int32_t D, std::string P) + : MetadataRecord(RecordKind::RK_Metadata_CustomEventV5, + MetadataType::CustomEvent), + Size(S), Delta(D), Data(std::move(P)) {} int32_t size() const { return Size; } int32_t delta() const { return Delta; } StringRef data() const { return Data; } Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Metadata_CustomEventV5; + } }; class TypedEventRecord : public MetadataRecord { @@ -205,13 +268,14 @@ class TypedEventRecord : public MetadataRecord { friend class RecordInitializer; public: - TypedEventRecord() = default; - explicit TypedEventRecord(int32_t S, int32_t D, uint16_t E, std::string P) - : MetadataRecord(), Size(S), Delta(D), Data(std::move(P)) {} + TypedEventRecord() + : MetadataRecord(RecordKind::RK_Metadata_TypedEvent, + MetadataType::TypedEvent) {} - MetadataType metadataType() const override { - return MetadataType::TypedEvent; - } + explicit TypedEventRecord(int32_t S, int32_t D, uint16_t E, std::string P) + : MetadataRecord(RecordKind::RK_Metadata_TypedEvent, + MetadataType::TypedEvent), + Size(S), Delta(D), Data(std::move(P)) {} int32_t size() const { return Size; } int32_t delta() const { return Delta; } @@ -219,6 +283,10 @@ public: StringRef data() const { return Data; } Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Metadata_TypedEvent; + } }; class CallArgRecord : public MetadataRecord { @@ -226,14 +294,21 @@ class CallArgRecord : public MetadataRecord { friend class RecordInitializer; public: - CallArgRecord() = default; - explicit CallArgRecord(uint64_t A) : MetadataRecord(), Arg(A) {} + CallArgRecord() + : MetadataRecord(RecordKind::RK_Metadata_CallArg, MetadataType::CallArg) { + } - MetadataType metadataType() const override { return MetadataType::CallArg; } + explicit CallArgRecord(uint64_t A) + : MetadataRecord(RecordKind::RK_Metadata_CallArg, MetadataType::CallArg), + Arg(A) {} uint64_t arg() const { return Arg; } Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Metadata_CallArg; + } }; class PIDRecord : public MetadataRecord { @@ -241,14 +316,22 @@ class PIDRecord : public MetadataRecord { friend class RecordInitializer; public: - PIDRecord() = default; - explicit PIDRecord(int32_t P) : MetadataRecord(), PID(P) {} + PIDRecord() + : MetadataRecord(RecordKind::RK_Metadata_PIDEntry, + MetadataType::PIDEntry) {} - MetadataType metadataType() const override { return MetadataType::PIDEntry; } + explicit PIDRecord(int32_t P) + : MetadataRecord(RecordKind::RK_Metadata_PIDEntry, + MetadataType::PIDEntry), + PID(P) {} int32_t pid() const { return PID; } Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Metadata_PIDEntry; + } }; class NewBufferRecord : public MetadataRecord { @@ -256,25 +339,35 @@ class NewBufferRecord : public MetadataRecord { friend class RecordInitializer; public: - NewBufferRecord() = default; - explicit NewBufferRecord(int32_t T) : MetadataRecord(), TID(T) {} + NewBufferRecord() + : MetadataRecord(RecordKind::RK_Metadata_NewBuffer, + MetadataType::NewBuffer) {} - MetadataType metadataType() const override { return MetadataType::NewBuffer; } + explicit NewBufferRecord(int32_t T) + : MetadataRecord(RecordKind::RK_Metadata_NewBuffer, + MetadataType::NewBuffer), + TID(T) {} int32_t tid() const { return TID; } Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Metadata_NewBuffer; + } }; class EndBufferRecord : public MetadataRecord { public: - EndBufferRecord() = default; - - MetadataType metadataType() const override { - return MetadataType::EndOfBuffer; - } + EndBufferRecord() + : MetadataRecord(RecordKind::RK_Metadata_EndOfBuffer, + MetadataType::EndOfBuffer) {} Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Metadata_EndOfBuffer; + } }; class FunctionRecord : public Record { @@ -286,11 +379,10 @@ class FunctionRecord : public Record { static constexpr unsigned kFunctionRecordSize = 8; public: - FunctionRecord() = default; - explicit FunctionRecord(RecordTypes K, int32_t F, uint32_t D) - : Record(), Kind(K), FuncId(F), Delta(D) {} + FunctionRecord() : Record(RecordKind::RK_Function) {} - Type type() const override { return Type::Function; } + explicit FunctionRecord(RecordTypes K, int32_t F, uint32_t D) + : Record(RecordKind::RK_Function), Kind(K), FuncId(F), Delta(D) {} // A function record is a concrete record type which has a number of common // properties. @@ -299,6 +391,10 @@ public: uint32_t delta() const { return Delta; } Error apply(RecordVisitor &V) override; + + static bool classof(const Record *R) { + return R->getRecordType() == RecordKind::RK_Function; + } }; class RecordVisitor { diff --git a/include/llvm/module.extern.modulemap b/include/llvm/module.extern.modulemap new file mode 100644 index 0000000000000000000000000000000000000000..8acda137e0440832905a7730b5965b83a6b2aee8 --- /dev/null +++ b/include/llvm/module.extern.modulemap @@ -0,0 +1,5 @@ +module LLVM_Extern_Config_Def {} +module LLVM_Extern_IR_Attributes_Gen {} +module LLVM_Extern_IR_Intrinsics_Gen {} +module LLVM_Extern_IR_Intrinsics_Enum {} +module LLVM_Extern_Utils_DataTypes {} diff --git a/include/llvm/module.install.modulemap b/include/llvm/module.install.modulemap new file mode 100644 index 0000000000000000000000000000000000000000..ac73a861232657ae0453d8e10fb65c1f014e03d0 --- /dev/null +++ b/include/llvm/module.install.modulemap @@ -0,0 +1,27 @@ + +module LLVM_Extern_Config_Def { + textual header "Config/AsmParsers.def" + textual header "Config/AsmPrinters.def" + textual header "Config/Disassemblers.def" + textual header "Config/Targets.def" + export * +} + +module LLVM_Extern_IR_Attributes_Gen { + textual header "IR/Attributes.gen" + textual header "IR/Attributes.inc" +} + +module LLVM_Extern_IR_Intrinsics_Gen { + textual header "IR/Intrinsics.gen" + textual header "IR/Intrinsics.inc" +} + +module LLVM_Extern_IR_Intrinsics_Enum { + textual header "IR/IntrinsicEnums.inc" +} + +module LLVM_Extern_Utils_DataTypes { + header "Support/DataTypes.h" + export * +} diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap index c918eff2b9766db41f6415d09a5e18a5251b0c9b..3b6e2a8a0ba42d51e4eaa3273bf6d9750abf5c39 100644 --- a/include/llvm/module.modulemap +++ b/include/llvm/module.modulemap @@ -36,6 +36,7 @@ module LLVM_Backend { module LLVM_Bitcode { requires cplusplus umbrella "Bitcode" module * { export * } } + module LLVM_BinaryFormat { requires cplusplus umbrella "BinaryFormat" module * { export * } @@ -63,7 +64,12 @@ module LLVM_BinaryFormat { textual header "BinaryFormat/MsgPack.def" } -module LLVM_Config { requires cplusplus umbrella "Config" module * { export * } } +module LLVM_Config { + requires cplusplus + umbrella "Config" + extern module LLVM_Extern_Config_Def "module.extern.modulemap" + module * { export * } +} module LLVM_DebugInfo { requires cplusplus @@ -181,7 +187,11 @@ module LLVM_intrinsic_gen { // Attributes.h module IR_Argument { header "IR/Argument.h" export * } - module IR_Attributes { header "IR/Attributes.h" export * } + module IR_Attributes { + header "IR/Attributes.h" + extern module LLVM_Extern_IR_Attributes_Gen "module.extern.modulemap" + export * + } module IR_CallSite { header "IR/CallSite.h" export * } module IR_ConstantFolder { header "IR/ConstantFolder.h" export * } module IR_GlobalVariable { header "IR/GlobalVariable.h" export * } @@ -207,7 +217,12 @@ module LLVM_intrinsic_gen { module IR_Verifier { header "IR/Verifier.h" export * } module IR_InstIterator { header "IR/InstIterator.h" export * } module IR_InstVisitor { header "IR/InstVisitor.h" export * } - module IR_Intrinsics { header "IR/Intrinsics.h" export * } + module IR_Intrinsics { + header "IR/Intrinsics.h" + extern module LLVM_Extern_IR_Intricsics_Gen "module.extern.modulemap" + extern module LLVM_Extern_IR_Intrinsics_Enum "module.extern.modulemap" + export * + } module IR_IntrinsicInst { header "IR/IntrinsicInst.h" export * } module IR_PatternMatch { header "IR/PatternMatch.h" export * } module IR_Statepoint { header "IR/Statepoint.h" export * } @@ -284,6 +299,8 @@ module LLVM_Transforms { module * { export * } } +extern module LLVM_Extern_Utils_DataTypes "module.extern.modulemap" + // A module covering ADT/ and Support/. These are intertwined and // codependent, and notionally form a single module. module LLVM_Utils { diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp index 49b1bdb2e779610abfaa4b99ea4ea161a93fcb35..bb8742123a0f08beafedbf39f46899304d963c0d 100644 --- a/lib/Analysis/Analysis.cpp +++ b/lib/Analysis/Analysis.cpp @@ -77,6 +77,8 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeRegionOnlyPrinterPass(Registry); initializeSCEVAAWrapperPassPass(Registry); initializeScalarEvolutionWrapperPassPass(Registry); + initializeStackSafetyGlobalInfoWrapperPassPass(Registry); + initializeStackSafetyInfoWrapperPassPass(Registry); initializeTargetTransformInfoWrapperPassPass(Registry); initializeTypeBasedAAWrapperPassPass(Registry); initializeScopedNoAliasAAWrapperPassPass(Registry); diff --git a/lib/Analysis/CGSCCPassManager.cpp b/lib/Analysis/CGSCCPassManager.cpp index 6965235326df3fe2eb7c683475323919ea547d2a..fd2292ced017d4415ef7746fc094e75974d87763 100644 --- a/lib/Analysis/CGSCCPassManager.cpp +++ b/lib/Analysis/CGSCCPassManager.cpp @@ -79,7 +79,10 @@ PassManagerrun(*C, AM, G, UR); - PI.runAfterPass(*Pass, *C); + if (UR.InvalidatedSCCs.count(C)) + PI.runAfterPassInvalidated(*Pass); + else + PI.runAfterPass(*Pass, *C); // Update the SCC if necessary. C = UR.UpdatedC ? UR.UpdatedC : C; diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index c33e2a88127261af7347223329d7f85cd6a9ed67..c57d8ef69d69b7d45a931af2176d6c4a08de0c04 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -81,6 +81,7 @@ add_llvm_library(LLVMAnalysis ScalarEvolutionAliasAnalysis.cpp ScalarEvolutionExpander.cpp ScalarEvolutionNormalization.cpp + StackSafetyAnalysis.cpp SyncDependenceAnalysis.cpp SyntheticCountsUtils.cpp TargetLibraryInfo.cpp diff --git a/lib/Analysis/CallGraphSCCPass.cpp b/lib/Analysis/CallGraphSCCPass.cpp index 80d0427529a5dfb432b8dfeaff3508f18d5d7ac3..0aed57a39387cc89b2ea218c1fd5515868b14737 100644 --- a/lib/Analysis/CallGraphSCCPass.cpp +++ b/lib/Analysis/CallGraphSCCPass.cpp @@ -633,23 +633,40 @@ namespace { bool runOnSCC(CallGraphSCC &SCC) override { bool BannerPrinted = false; - auto PrintBannerOnce = [&] () { + auto PrintBannerOnce = [&]() { if (BannerPrinted) return; OS << Banner; BannerPrinted = true; - }; + }; + + bool NeedModule = llvm::forcePrintModuleIR(); + if (isFunctionInPrintList("*") && NeedModule) { + PrintBannerOnce(); + OS << "\n"; + SCC.getCallGraph().getModule().print(OS, nullptr); + return false; + } + bool FoundFunction = false; for (CallGraphNode *CGN : SCC) { if (Function *F = CGN->getFunction()) { if (!F->isDeclaration() && isFunctionInPrintList(F->getName())) { - PrintBannerOnce(); - F->print(OS); + FoundFunction = true; + if (!NeedModule) { + PrintBannerOnce(); + F->print(OS); + } } } else if (isFunctionInPrintList("*")) { PrintBannerOnce(); OS << "\nPrinting Function\n"; } } + if (NeedModule && FoundFunction) { + PrintBannerOnce(); + OS << "\n"; + SCC.getCallGraph().getModule().print(OS, nullptr); + } return false; } diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp index d4f73bdb4361deb0cf48444e91b362e61ed67aec..5cac5e5ca616a4f9a4706f1ea398d08bda8afbce 100644 --- a/lib/Analysis/CaptureTracking.cpp +++ b/lib/Analysis/CaptureTracking.cpp @@ -158,7 +158,8 @@ namespace { /// storing the value (or part of it) into memory anywhere automatically /// counts as capturing it or not. bool llvm::PointerMayBeCaptured(const Value *V, - bool ReturnCaptures, bool StoreCaptures) { + bool ReturnCaptures, bool StoreCaptures, + unsigned MaxUsesToExplore) { assert(!isa(V) && "It doesn't make sense to ask whether a global is captured."); @@ -169,7 +170,7 @@ bool llvm::PointerMayBeCaptured(const Value *V, (void)StoreCaptures; SimpleCaptureTracker SCT(ReturnCaptures); - PointerMayBeCaptured(V, &SCT); + PointerMayBeCaptured(V, &SCT, MaxUsesToExplore); return SCT.Captured; } @@ -186,13 +187,15 @@ bool llvm::PointerMayBeCaptured(const Value *V, bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures, bool StoreCaptures, const Instruction *I, const DominatorTree *DT, bool IncludeI, - OrderedBasicBlock *OBB) { + OrderedBasicBlock *OBB, + unsigned MaxUsesToExplore) { assert(!isa(V) && "It doesn't make sense to ask whether a global is captured."); bool UseNewOBB = OBB == nullptr; if (!DT) - return PointerMayBeCaptured(V, ReturnCaptures, StoreCaptures); + return PointerMayBeCaptured(V, ReturnCaptures, StoreCaptures, + MaxUsesToExplore); if (UseNewOBB) OBB = new OrderedBasicBlock(I->getParent()); @@ -200,29 +203,25 @@ bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures, // with StoreCaptures. CapturesBefore CB(ReturnCaptures, I, DT, IncludeI, OBB); - PointerMayBeCaptured(V, &CB); + PointerMayBeCaptured(V, &CB, MaxUsesToExplore); if (UseNewOBB) delete OBB; return CB.Captured; } -/// TODO: Write a new FunctionPass AliasAnalysis so that it can keep -/// a cache. Then we can move the code from BasicAliasAnalysis into -/// that path, and remove this threshold. -static int const Threshold = 20; - -void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) { +void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, + unsigned MaxUsesToExplore) { assert(V->getType()->isPointerTy() && "Capture is for pointers only!"); - SmallVector Worklist; - SmallSet Visited; + SmallVector Worklist; + SmallSet Visited; auto AddUses = [&](const Value *V) { - int Count = 0; + unsigned Count = 0; for (const Use &U : V->uses()) { // If there are lots of uses, conservatively say that the value // is captured to avoid taking too much compile time. - if (Count++ >= Threshold) + if (Count++ >= MaxUsesToExplore) return Tracker->tooManyUses(); if (!Visited.insert(&U).second) continue; diff --git a/lib/Analysis/CmpInstAnalysis.cpp b/lib/Analysis/CmpInstAnalysis.cpp index 159c1a2d135ac9d594f3ab6a69c930f64f6becf6..27071babec5c85f0924a177cce0243868c958486 100644 --- a/lib/Analysis/CmpInstAnalysis.cpp +++ b/lib/Analysis/CmpInstAnalysis.cpp @@ -40,28 +40,28 @@ unsigned llvm::getICmpCode(const ICmpInst *ICI, bool InvertPred) { } } -Value *llvm::getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, - CmpInst::Predicate &NewICmpPred) { +Constant *llvm::getPredForICmpCode(unsigned Code, bool Sign, Type *OpTy, + CmpInst::Predicate &Pred) { switch (Code) { default: llvm_unreachable("Illegal ICmp code!"); case 0: // False. - return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); - case 1: NewICmpPred = Sign ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break; - case 2: NewICmpPred = ICmpInst::ICMP_EQ; break; - case 3: NewICmpPred = Sign ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break; - case 4: NewICmpPred = Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break; - case 5: NewICmpPred = ICmpInst::ICMP_NE; break; - case 6: NewICmpPred = Sign ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break; + return ConstantInt::get(CmpInst::makeCmpResultType(OpTy), 0); + case 1: Pred = Sign ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break; + case 2: Pred = ICmpInst::ICMP_EQ; break; + case 3: Pred = Sign ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break; + case 4: Pred = Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break; + case 5: Pred = ICmpInst::ICMP_NE; break; + case 6: Pred = Sign ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break; case 7: // True. - return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1); + return ConstantInt::get(CmpInst::makeCmpResultType(OpTy), 1); } return nullptr; } -bool llvm::PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) { - return (CmpInst::isSigned(p1) == CmpInst::isSigned(p2)) || - (CmpInst::isSigned(p1) && ICmpInst::isEquality(p2)) || - (CmpInst::isSigned(p2) && ICmpInst::isEquality(p1)); +bool llvm::predicatesFoldable(ICmpInst::Predicate P1, ICmpInst::Predicate P2) { + return (CmpInst::isSigned(P1) == CmpInst::isSigned(P2)) || + (CmpInst::isSigned(P1) && ICmpInst::isEquality(P2)) || + (CmpInst::isSigned(P2) && ICmpInst::isEquality(P1)); } bool llvm::decomposeBitTestICmp(Value *LHS, Value *RHS, diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 92b0555913758ff32837ba385c612686ef96f1f2..6180886267470e441ff51e3c00344f18e48bc6b1 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -347,9 +347,20 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy, // We're simulating a load through a pointer that was bitcast to point to // a different type, so we can try to walk down through the initial - // elements of an aggregate to see if some part of th e aggregate is + // elements of an aggregate to see if some part of the aggregate is // castable to implement the "load" semantic model. - C = C->getAggregateElement(0u); + if (SrcTy->isStructTy()) { + // Struct types might have leading zero-length elements like [0 x i32], + // which are certainly not what we are looking for, so skip them. + unsigned Elem = 0; + Constant *ElemC; + do { + ElemC = C->getAggregateElement(Elem++); + } while (ElemC && DL.getTypeSizeInBits(ElemC->getType()) == 0); + C = ElemC; + } else { + C = C->getAggregateElement(0u); + } } while (C); return nullptr; @@ -1399,6 +1410,10 @@ bool llvm::canConstantFoldCallTo(ImmutableCallSite CS, const Function *F) { case Intrinsic::usub_with_overflow: case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: + case Intrinsic::sadd_sat: + case Intrinsic::uadd_sat: + case Intrinsic::ssub_sat: + case Intrinsic::usub_sat: case Intrinsic::convert_from_fp16: case Intrinsic::convert_to_fp16: case Intrinsic::bitreverse: @@ -2019,6 +2034,14 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, }; return ConstantStruct::get(cast(Ty), Ops); } + case Intrinsic::uadd_sat: + return ConstantInt::get(Ty, Op1->getValue().uadd_sat(Op2->getValue())); + case Intrinsic::sadd_sat: + return ConstantInt::get(Ty, Op1->getValue().sadd_sat(Op2->getValue())); + case Intrinsic::usub_sat: + return ConstantInt::get(Ty, Op1->getValue().usub_sat(Op2->getValue())); + case Intrinsic::ssub_sat: + return ConstantInt::get(Ty, Op1->getValue().ssub_sat(Op2->getValue())); case Intrinsic::cttz: if (Op2->isOne() && Op1->isZero()) // cttz(0, 1) is undef. return UndefValue::get(Ty); diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp index 35af4be485f821f3320e583b0ed262aef18480d7..0382787fbefe788e0209bdae08a5427ca01c9bee 100644 --- a/lib/Analysis/DemandedBits.cpp +++ b/lib/Analysis/DemandedBits.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/Pass.h" @@ -50,6 +51,7 @@ #include using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "demanded-bits" @@ -142,6 +144,28 @@ void DemandedBits::determineLiveOperandBits( std::min(BitWidth, Known.countMaxTrailingZeros()+1)); } break; + case Intrinsic::fshl: + case Intrinsic::fshr: { + const APInt *SA; + if (OperandNo == 2) { + // Shift amount is modulo the bitwidth. For powers of two we have + // SA % BW == SA & (BW - 1). + if (isPowerOf2_32(BitWidth)) + AB = BitWidth - 1; + } else if (match(II->getOperand(2), m_APInt(SA))) { + // Normalize to funnel shift left. APInt shifts of BitWidth are well- + // defined, so no need to special-case zero shifts here. + uint64_t ShiftAmt = SA->urem(BitWidth); + if (II->getIntrinsicID() == Intrinsic::fshr) + ShiftAmt = BitWidth - ShiftAmt; + + if (OperandNo == 0) + AB = AOut.lshr(ShiftAmt); + else if (OperandNo == 1) + AB = AOut.shl(BitWidth - ShiftAmt); + } + break; + } } break; case Instruction::Add: @@ -153,8 +177,9 @@ void DemandedBits::determineLiveOperandBits( AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits()); break; case Instruction::Shl: - if (OperandNo == 0) - if (auto *ShiftAmtC = dyn_cast(UserI->getOperand(1))) { + if (OperandNo == 0) { + const APInt *ShiftAmtC; + if (match(UserI->getOperand(1), m_APInt(ShiftAmtC))) { uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1); AB = AOut.lshr(ShiftAmt); @@ -166,10 +191,12 @@ void DemandedBits::determineLiveOperandBits( else if (S->hasNoUnsignedWrap()) AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); } + } break; case Instruction::LShr: - if (OperandNo == 0) - if (auto *ShiftAmtC = dyn_cast(UserI->getOperand(1))) { + if (OperandNo == 0) { + const APInt *ShiftAmtC; + if (match(UserI->getOperand(1), m_APInt(ShiftAmtC))) { uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1); AB = AOut.shl(ShiftAmt); @@ -178,10 +205,12 @@ void DemandedBits::determineLiveOperandBits( if (cast(UserI)->isExact()) AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); } + } break; case Instruction::AShr: - if (OperandNo == 0) - if (auto *ShiftAmtC = dyn_cast(UserI->getOperand(1))) { + if (OperandNo == 0) { + const APInt *ShiftAmtC; + if (match(UserI->getOperand(1), m_APInt(ShiftAmtC))) { uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1); AB = AOut.shl(ShiftAmt); // Because the high input bit is replicated into the @@ -196,6 +225,7 @@ void DemandedBits::determineLiveOperandBits( if (cast(UserI)->isExact()) AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); } + } break; case Instruction::And: AB = AOut; @@ -253,6 +283,15 @@ void DemandedBits::determineLiveOperandBits( if (OperandNo != 0) AB = AOut; break; + case Instruction::ExtractElement: + if (OperandNo == 0) + AB = AOut; + break; + case Instruction::InsertElement: + case Instruction::ShuffleVector: + if (OperandNo == 0 || OperandNo == 1) + AB = AOut; + break; } } @@ -288,8 +327,9 @@ void DemandedBits::performAnalysis() { // bits and add the instruction to the work list. For other instructions // add their operands to the work list (for integer values operands, mark // all bits as live). - if (IntegerType *IT = dyn_cast(I.getType())) { - if (AliveBits.try_emplace(&I, IT->getBitWidth(), 0).second) + Type *T = I.getType(); + if (T->isIntOrIntVectorTy()) { + if (AliveBits.try_emplace(&I, T->getScalarSizeInBits(), 0).second) Worklist.push_back(&I); continue; @@ -298,8 +338,9 @@ void DemandedBits::performAnalysis() { // Non-integer-typed instructions... for (Use &OI : I.operands()) { if (Instruction *J = dyn_cast(OI)) { - if (IntegerType *IT = dyn_cast(J->getType())) - AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth()); + Type *T = J->getType(); + if (T->isIntOrIntVectorTy()) + AliveBits[J] = APInt::getAllOnesValue(T->getScalarSizeInBits()); Worklist.push_back(J); } } @@ -315,13 +356,13 @@ void DemandedBits::performAnalysis() { LLVM_DEBUG(dbgs() << "DemandedBits: Visiting: " << *UserI); APInt AOut; - if (UserI->getType()->isIntegerTy()) { + if (UserI->getType()->isIntOrIntVectorTy()) { AOut = AliveBits[UserI]; LLVM_DEBUG(dbgs() << " Alive Out: " << AOut); } LLVM_DEBUG(dbgs() << "\n"); - if (!UserI->getType()->isIntegerTy()) + if (!UserI->getType()->isIntOrIntVectorTy()) Visited.insert(UserI); KnownBits Known, Known2; @@ -330,10 +371,11 @@ void DemandedBits::performAnalysis() { // operand is added to the work-list. for (Use &OI : UserI->operands()) { if (Instruction *I = dyn_cast(OI)) { - if (IntegerType *IT = dyn_cast(I->getType())) { - unsigned BitWidth = IT->getBitWidth(); + Type *T = I->getType(); + if (T->isIntOrIntVectorTy()) { + unsigned BitWidth = T->getScalarSizeInBits(); APInt AB = APInt::getAllOnesValue(BitWidth); - if (UserI->getType()->isIntegerTy() && !AOut && + if (UserI->getType()->isIntOrIntVectorTy() && !AOut && !isAlwaysLive(UserI)) { AB = APInt(BitWidth, 0); } else { @@ -368,11 +410,13 @@ void DemandedBits::performAnalysis() { APInt DemandedBits::getDemandedBits(Instruction *I) { performAnalysis(); - const DataLayout &DL = I->getModule()->getDataLayout(); auto Found = AliveBits.find(I); if (Found != AliveBits.end()) return Found->second; - return APInt::getAllOnesValue(DL.getTypeSizeInBits(I->getType())); + + const DataLayout &DL = I->getModule()->getDataLayout(); + return APInt::getAllOnesValue( + DL.getTypeSizeInBits(I->getType()->getScalarType())); } bool DemandedBits::isInstructionDead(Instruction *I) { diff --git a/lib/Analysis/DivergenceAnalysis.cpp b/lib/Analysis/DivergenceAnalysis.cpp index de47445c5e01286c39f0c101e88a516bc201b109..7ba23854a3cc53efb3fdd68d7ca4ea78bf87a0fc 100644 --- a/lib/Analysis/DivergenceAnalysis.cpp +++ b/lib/Analysis/DivergenceAnalysis.cpp @@ -422,3 +422,36 @@ void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { OS << "DIVERGENT:" << I << '\n'; } } + +// class GPUDivergenceAnalysis +GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F, + const DominatorTree &DT, + const PostDominatorTree &PDT, + const LoopInfo &LI, + const TargetTransformInfo &TTI) + : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) { + for (auto &I : instructions(F)) { + if (TTI.isSourceOfDivergence(&I)) { + DA.markDivergent(I); + } else if (TTI.isAlwaysUniform(&I)) { + DA.addUniformOverride(I); + } + } + for (auto &Arg : F.args()) { + if (TTI.isSourceOfDivergence(&Arg)) { + DA.markDivergent(Arg); + } + } + + DA.compute(); +} + +bool GPUDivergenceAnalysis::isDivergent(const Value &val) const { + return DA.isDivergent(val); +} + +void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const { + OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; + DA.print(OS, mod); + OS << "}\n"; +} diff --git a/lib/Analysis/IVDescriptors.cpp b/lib/Analysis/IVDescriptors.cpp index 854a95573e902bb0643aa68bfa9970c1b911678e..aaebc4a481ec5db351a12432f313a44356774859 100644 --- a/lib/Analysis/IVDescriptors.cpp +++ b/lib/Analysis/IVDescriptors.cpp @@ -299,9 +299,17 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, return false; } + bool IsASelect = isa(Cur); + + // A conditional reduction operation must only have 2 or less uses in + // VisitedInsts. + if (IsASelect && (Kind == RK_FloatAdd || Kind == RK_FloatMult) && + hasMultipleUsesOf(Cur, VisitedInsts, 2)) + return false; + // A reduction operation must only have one use of the reduction value. - if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && - hasMultipleUsesOf(Cur, VisitedInsts)) + if (!IsAPhi && !IsASelect && Kind != RK_IntegerMinMax && + Kind != RK_FloatMinMax && hasMultipleUsesOf(Cur, VisitedInsts, 1)) return false; // All inputs to a PHI node must be a reduction value. @@ -362,7 +370,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, } else if (!isa(UI) && ((!isa(UI) && !isa(UI) && !isa(UI)) || - !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence())) + (!isConditionalRdxPattern(Kind, UI).isRecurrence() && + !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence()))) return false; // Remember that we completed the cycle. @@ -491,6 +500,53 @@ RecurrenceDescriptor::isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev) { return InstDesc(false, I); } +/// Returns true if the select instruction has users in the compare-and-add +/// reduction pattern below. The select instruction argument is the last one +/// in the sequence. +/// +/// %sum.1 = phi ... +/// ... +/// %cmp = fcmp pred %0, %CFP +/// %add = fadd %0, %sum.1 +/// %sum.2 = select %cmp, %add, %sum.1 +RecurrenceDescriptor::InstDesc +RecurrenceDescriptor::isConditionalRdxPattern( + RecurrenceKind Kind, Instruction *I) { + SelectInst *SI = dyn_cast(I); + if (!SI) + return InstDesc(false, I); + + CmpInst *CI = dyn_cast(SI->getCondition()); + // Only handle single use cases for now. + if (!CI || !CI->hasOneUse()) + return InstDesc(false, I); + + Value *TrueVal = SI->getTrueValue(); + Value *FalseVal = SI->getFalseValue(); + // Handle only when either of operands of select instruction is a PHI + // node for now. + if ((isa(*TrueVal) && isa(*FalseVal)) || + (!isa(*TrueVal) && !isa(*FalseVal))) + return InstDesc(false, I); + + Instruction *I1 = + isa(*TrueVal) ? dyn_cast(FalseVal) + : dyn_cast(TrueVal); + if (!I1 || !I1->isBinaryOp()) + return InstDesc(false, I); + + Value *Op1, *Op2; + if ((m_FAdd(m_Value(Op1), m_Value(Op2)).match(I1) || + m_FSub(m_Value(Op1), m_Value(Op2)).match(I1)) && + I1->isFast()) + return InstDesc(Kind == RK_FloatAdd, SI); + + if (m_FMul(m_Value(Op1), m_Value(Op2)).match(I1) && (I1->isFast())) + return InstDesc(Kind == RK_FloatMult, SI); + + return InstDesc(false, I); +} + RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, InstDesc &Prev, bool HasFunNoNaNAttr) { @@ -520,9 +576,12 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, case Instruction::FSub: case Instruction::FAdd: return InstDesc(Kind == RK_FloatAdd, I, UAI); + case Instruction::Select: + if (Kind == RK_FloatAdd || Kind == RK_FloatMult) + return isConditionalRdxPattern(Kind, I); + LLVM_FALLTHROUGH; case Instruction::FCmp: case Instruction::ICmp: - case Instruction::Select: if (Kind != RK_IntegerMinMax && (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) return InstDesc(false, I); @@ -531,13 +590,14 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, } bool RecurrenceDescriptor::hasMultipleUsesOf( - Instruction *I, SmallPtrSetImpl &Insts) { + Instruction *I, SmallPtrSetImpl &Insts, + unsigned MaxNumUses) { unsigned NumUses = 0; for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { if (Insts.count(dyn_cast(*Use))) ++NumUses; - if (NumUses > 1) + if (NumUses > MaxNumUses) return true; } diff --git a/lib/Analysis/InstructionPrecedenceTracking.cpp b/lib/Analysis/InstructionPrecedenceTracking.cpp index c667c28228183fcd3c58a828f40f8ce6cea70183..b98975b006a445bd1f0f255f1e560294b3e26fa1 100644 --- a/lib/Analysis/InstructionPrecedenceTracking.cpp +++ b/lib/Analysis/InstructionPrecedenceTracking.cpp @@ -142,3 +142,8 @@ bool ImplicitControlFlowTracking::isSpecialInstruction( } return true; } + +bool MemoryWriteTracking::isSpecialInstruction( + const Instruction *Insn) const { + return Insn->mayWriteToMemory(); +} diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index fd6f4ba476eedf329ab48c639309c2905e63c7c9..ccf907c144f0d1a7535e667a1f788a819b0122aa 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -2630,6 +2630,70 @@ static void setLimitsForBinOp(BinaryOperator &BO, APInt &Lower, APInt &Upper, } } +/// Some intrinsics with a constant operand have an easy-to-compute range of +/// outputs. This can be used to fold a comparison to always true or always +/// false. +static void setLimitsForIntrinsic(IntrinsicInst &II, APInt &Lower, + APInt &Upper) { + unsigned Width = Lower.getBitWidth(); + const APInt *C; + switch (II.getIntrinsicID()) { + case Intrinsic::uadd_sat: + // uadd.sat(x, C) produces [C, UINT_MAX]. + if (match(II.getOperand(0), m_APInt(C)) || + match(II.getOperand(1), m_APInt(C))) + Lower = *C; + break; + case Intrinsic::sadd_sat: + if (match(II.getOperand(0), m_APInt(C)) || + match(II.getOperand(1), m_APInt(C))) { + if (C->isNegative()) { + // sadd.sat(x, -C) produces [SINT_MIN, SINT_MAX + (-C)]. + Lower = APInt::getSignedMinValue(Width); + Upper = APInt::getSignedMaxValue(Width) + *C + 1; + } else { + // sadd.sat(x, +C) produces [SINT_MIN + C, SINT_MAX]. + Lower = APInt::getSignedMinValue(Width) + *C; + Upper = APInt::getSignedMaxValue(Width) + 1; + } + } + break; + case Intrinsic::usub_sat: + // usub.sat(C, x) produces [0, C]. + if (match(II.getOperand(0), m_APInt(C))) + Upper = *C + 1; + // usub.sat(x, C) produces [0, UINT_MAX - C]. + else if (match(II.getOperand(1), m_APInt(C))) + Upper = APInt::getMaxValue(Width) - *C + 1; + break; + case Intrinsic::ssub_sat: + if (match(II.getOperand(0), m_APInt(C))) { + if (C->isNegative()) { + // ssub.sat(-C, x) produces [SINT_MIN, -SINT_MIN + (-C)]. + Lower = APInt::getSignedMinValue(Width); + Upper = *C - APInt::getSignedMinValue(Width) + 1; + } else { + // ssub.sat(+C, x) produces [-SINT_MAX + C, SINT_MAX]. + Lower = *C - APInt::getSignedMaxValue(Width); + Upper = APInt::getSignedMaxValue(Width) + 1; + } + } else if (match(II.getOperand(1), m_APInt(C))) { + if (C->isNegative()) { + // ssub.sat(x, -C) produces [SINT_MIN - (-C), SINT_MAX]: + Lower = APInt::getSignedMinValue(Width) - *C; + Upper = APInt::getSignedMaxValue(Width) + 1; + } else { + // ssub.sat(x, +C) produces [SINT_MIN, SINT_MAX - C]. + Lower = APInt::getSignedMinValue(Width); + Upper = APInt::getSignedMaxValue(Width) - *C + 1; + } + } + break; + default: + break; + } +} + static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const InstrInfoQuery &IIQ) { Type *ITy = GetCompareTy(RHS); // The return type. @@ -2663,6 +2727,8 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, APInt Upper = APInt(Width, 0); if (auto *BO = dyn_cast(LHS)) setLimitsForBinOp(*BO, Lower, Upper, IIQ); + else if (auto *II = dyn_cast(LHS)) + setLimitsForIntrinsic(*II, Lower, Upper); ConstantRange LHS_CR = Lower != Upper ? ConstantRange(Lower, Upper) : ConstantRange(Width, true); @@ -3837,6 +3903,28 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, if (Value *V = simplifySelectBitTest(TrueVal, FalseVal, X, Y, Pred == ICmpInst::ICMP_EQ)) return V; + + // Test for zero-shift-guard-ops around funnel shifts. These are used to + // avoid UB from oversized shifts in raw IR rotate patterns, but the + // intrinsics do not have that problem. + Value *ShAmt; + auto isFsh = m_CombineOr(m_Intrinsic(m_Value(X), m_Value(), + m_Value(ShAmt)), + m_Intrinsic(m_Value(), m_Value(X), + m_Value(ShAmt))); + // (ShAmt != 0) ? fshl(X, *, ShAmt) : X --> fshl(X, *, ShAmt) + // (ShAmt != 0) ? fshr(*, X, ShAmt) : X --> fshr(*, X, ShAmt) + // (ShAmt == 0) ? fshl(X, *, ShAmt) : X --> X + // (ShAmt == 0) ? fshr(*, X, ShAmt) : X --> X + if (match(TrueVal, isFsh) && FalseVal == X && CmpLHS == ShAmt) + return Pred == ICmpInst::ICMP_NE ? TrueVal : X; + + // (ShAmt == 0) ? X : fshl(X, *, ShAmt) --> fshl(X, *, ShAmt) + // (ShAmt == 0) ? X : fshr(*, X, ShAmt) --> fshr(*, X, ShAmt) + // (ShAmt != 0) ? X : fshl(X, *, ShAmt) --> X + // (ShAmt != 0) ? X : fshr(*, X, ShAmt) --> X + if (match(FalseVal, isFsh) && TrueVal == X && CmpLHS == ShAmt) + return Pred == ICmpInst::ICMP_EQ ? FalseVal : X; } // Check for other compares that behave like bit test. @@ -3944,6 +4032,10 @@ static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, if (Value *V = foldSelectWithBinaryOp(Cond, TrueVal, FalseVal)) return V; + Optional Imp = isImpliedByDomCondition(Cond, Q.CxtI, Q.DL); + if (Imp) + return *Imp ? TrueVal : FalseVal; + return nullptr; } @@ -4890,6 +4982,40 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, if (match(Op0, m_Undef()) || match(Op1, m_Undef())) return Constant::getNullValue(ReturnType); break; + case Intrinsic::uadd_sat: + // sat(MAX + X) -> MAX + // sat(X + MAX) -> MAX + if (match(Op0, m_AllOnes()) || match(Op1, m_AllOnes())) + return Constant::getAllOnesValue(ReturnType); + LLVM_FALLTHROUGH; + case Intrinsic::sadd_sat: + // sat(X + undef) -> -1 + // sat(undef + X) -> -1 + // For unsigned: Assume undef is MAX, thus we saturate to MAX (-1). + // For signed: Assume undef is ~X, in which case X + ~X = -1. + if (match(Op0, m_Undef()) || match(Op1, m_Undef())) + return Constant::getAllOnesValue(ReturnType); + + // X + 0 -> X + if (match(Op1, m_Zero())) + return Op0; + // 0 + X -> X + if (match(Op0, m_Zero())) + return Op1; + break; + case Intrinsic::usub_sat: + // sat(0 - X) -> 0, sat(X - MAX) -> 0 + if (match(Op0, m_Zero()) || match(Op1, m_AllOnes())) + return Constant::getNullValue(ReturnType); + LLVM_FALLTHROUGH; + case Intrinsic::ssub_sat: + // X - X -> 0, X - undef -> 0, undef - X -> 0 + if (Op0 == Op1 || match(Op0, m_Undef()) || match(Op1, m_Undef())) + return Constant::getNullValue(ReturnType); + // X - 0 -> X + if (match(Op1, m_Zero())) + return Op0; + break; case Intrinsic::load_relative: if (auto *C0 = dyn_cast(Op0)) if (auto *C1 = dyn_cast(Op1)) @@ -4986,7 +5112,16 @@ static Value *simplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, } case Intrinsic::fshl: case Intrinsic::fshr: { - Value *ShAmtArg = ArgBegin[2]; + Value *Op0 = ArgBegin[0], *Op1 = ArgBegin[1], *ShAmtArg = ArgBegin[2]; + + // If both operands are undef, the result is undef. + if (match(Op0, m_Undef()) && match(Op1, m_Undef())) + return UndefValue::get(F->getReturnType()); + + // If shift amount is undef, assume it is zero. + if (match(ShAmtArg, m_Undef())) + return ArgBegin[IID == Intrinsic::fshl ? 0 : 1]; + const APInt *ShAmtC; if (match(ShAmtArg, m_APInt(ShAmtC))) { // If there's effectively no shift, return the 1st arg or 2nd arg. diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index ee0148e0d7956213f4a2d201edb1a08220eb62a5..110c085d3f353224dac76da6543b969797a6b2f5 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -14,6 +14,7 @@ #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" @@ -420,6 +421,8 @@ namespace { BasicBlock *BB); bool solveBlockValueSelect(ValueLatticeElement &BBLV, SelectInst *S, BasicBlock *BB); + Optional getRangeForOperand(unsigned Op, Instruction *I, + BasicBlock *BB); bool solveBlockValueBinaryOp(ValueLatticeElement &BBLV, BinaryOperator *BBI, BasicBlock *BB); bool solveBlockValueCast(ValueLatticeElement &BBLV, CastInst *CI, @@ -634,8 +637,7 @@ bool LazyValueInfoImpl::solveBlockValueImpl(ValueLatticeElement &Res, if (auto *CI = dyn_cast(BBI)) return solveBlockValueCast(Res, CI, BB); - BinaryOperator *BO = dyn_cast(BBI); - if (BO && isa(BO->getOperand(1))) + if (BinaryOperator *BO = dyn_cast(BBI)) return solveBlockValueBinaryOp(Res, BO, BB); } @@ -951,6 +953,25 @@ bool LazyValueInfoImpl::solveBlockValueSelect(ValueLatticeElement &BBLV, return true; } +Optional LazyValueInfoImpl::getRangeForOperand(unsigned Op, + Instruction *I, + BasicBlock *BB) { + if (!hasBlockValue(I->getOperand(Op), BB)) + if (pushBlockValue(std::make_pair(BB, I->getOperand(Op)))) + return None; + + const unsigned OperandBitWidth = + DL.getTypeSizeInBits(I->getOperand(Op)->getType()); + ConstantRange Range = ConstantRange(OperandBitWidth); + if (hasBlockValue(I->getOperand(Op), BB)) { + ValueLatticeElement Val = getBlockValue(I->getOperand(Op), BB); + intersectAssumeOrGuardBlockValueConstantRange(I->getOperand(Op), Val, I); + if (Val.isConstantRange()) + Range = Val.getConstantRange(); + } + return Range; +} + bool LazyValueInfoImpl::solveBlockValueCast(ValueLatticeElement &BBLV, CastInst *CI, BasicBlock *BB) { @@ -981,21 +1002,11 @@ bool LazyValueInfoImpl::solveBlockValueCast(ValueLatticeElement &BBLV, // Figure out the range of the LHS. If that fails, we still apply the // transfer rule on the full set since we may be able to locally infer // interesting facts. - if (!hasBlockValue(CI->getOperand(0), BB)) - if (pushBlockValue(std::make_pair(BB, CI->getOperand(0)))) - // More work to do before applying this transfer rule. - return false; - - const unsigned OperandBitWidth = - DL.getTypeSizeInBits(CI->getOperand(0)->getType()); - ConstantRange LHSRange = ConstantRange(OperandBitWidth); - if (hasBlockValue(CI->getOperand(0), BB)) { - ValueLatticeElement LHSVal = getBlockValue(CI->getOperand(0), BB); - intersectAssumeOrGuardBlockValueConstantRange(CI->getOperand(0), LHSVal, - CI); - if (LHSVal.isConstantRange()) - LHSRange = LHSVal.getConstantRange(); - } + Optional LHSRes = getRangeForOperand(0, CI, BB); + if (!LHSRes.hasValue()) + // More work to do before applying this transfer rule. + return false; + ConstantRange LHSRange = LHSRes.getValue(); const unsigned ResultBitWidth = CI->getType()->getIntegerBitWidth(); @@ -1037,27 +1048,19 @@ bool LazyValueInfoImpl::solveBlockValueBinaryOp(ValueLatticeElement &BBLV, return true; }; - // Figure out the range of the LHS. If that fails, use a conservative range, - // but apply the transfer rule anyways. This lets us pick up facts from - // expressions like "and i32 (call i32 @foo()), 32" - if (!hasBlockValue(BO->getOperand(0), BB)) - if (pushBlockValue(std::make_pair(BB, BO->getOperand(0)))) - // More work to do before applying this transfer rule. - return false; + // Figure out the ranges of the operands. If that fails, use a + // conservative range, but apply the transfer rule anyways. This + // lets us pick up facts from expressions like "and i32 (call i32 + // @foo()), 32" + Optional LHSRes = getRangeForOperand(0, BO, BB); + Optional RHSRes = getRangeForOperand(1, BO, BB); - const unsigned OperandBitWidth = - DL.getTypeSizeInBits(BO->getOperand(0)->getType()); - ConstantRange LHSRange = ConstantRange(OperandBitWidth); - if (hasBlockValue(BO->getOperand(0), BB)) { - ValueLatticeElement LHSVal = getBlockValue(BO->getOperand(0), BB); - intersectAssumeOrGuardBlockValueConstantRange(BO->getOperand(0), LHSVal, - BO); - if (LHSVal.isConstantRange()) - LHSRange = LHSVal.getConstantRange(); - } + if (!LHSRes.hasValue() || !RHSRes.hasValue()) + // More work to do before applying this transfer rule. + return false; - ConstantInt *RHS = cast(BO->getOperand(1)); - ConstantRange RHSRange = ConstantRange(RHS->getValue()); + ConstantRange LHSRange = LHSRes.getValue(); + ConstantRange RHSRange = RHSRes.getValue(); // NOTE: We're currently limited by the set of operations that ConstantRange // can evaluate symbolically. Enhancing that set will allows us to analyze diff --git a/lib/Analysis/LegacyDivergenceAnalysis.cpp b/lib/Analysis/LegacyDivergenceAnalysis.cpp index 2089d1c53d0de298274ece2ac3549bf465ced6a9..5540859ebddae4e6395aabde4458b58874d920fa 100644 --- a/lib/Analysis/LegacyDivergenceAnalysis.cpp +++ b/lib/Analysis/LegacyDivergenceAnalysis.cpp @@ -1,4 +1,5 @@ -//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis Implementation -==// +//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis +//Implementation -==// // // The LLVM Compiler Infrastructure // @@ -64,6 +65,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" @@ -79,6 +83,12 @@ using namespace llvm; #define DEBUG_TYPE "divergence" +// transparently use the GPUDivergenceAnalysis +static cl::opt UseGPUDA("use-gpu-divergence-analysis", cl::init(false), + cl::Hidden, + cl::desc("turn the LegacyDivergenceAnalysis into " + "a wrapper for GPUDivergenceAnalysis")); + namespace { class DivergencePropagator { @@ -262,16 +272,17 @@ void DivergencePropagator::propagate() { } } -} /// end namespace anonymous +} // namespace // Register this pass. char LegacyDivergenceAnalysis::ID = 0; -INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis", - false, true) +INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", + "Legacy Divergence Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis", - false, true) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", + "Legacy Divergence Analysis", false, true) FunctionPass *llvm::createLegacyDivergenceAnalysisPass() { return new LegacyDivergenceAnalysis(); @@ -280,9 +291,24 @@ FunctionPass *llvm::createLegacyDivergenceAnalysisPass() { void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); + if (UseGPUDA) + AU.addRequired(); AU.setPreservesAll(); } +bool LegacyDivergenceAnalysis::shouldUseGPUDivergenceAnalysis( + const Function &F) const { + if (!UseGPUDA) + return false; + + // GPUDivergenceAnalysis requires a reducible CFG. + auto &LI = getAnalysis().getLoopInfo(); + using RPOTraversal = ReversePostOrderTraversal; + RPOTraversal FuncRPOT(&F); + return !containsIrreducibleCFG(FuncRPOT, LI); +} + bool LegacyDivergenceAnalysis::runOnFunction(Function &F) { auto *TTIWP = getAnalysisIfAvailable(); if (TTIWP == nullptr) @@ -295,36 +321,61 @@ bool LegacyDivergenceAnalysis::runOnFunction(Function &F) { return false; DivergentValues.clear(); + gpuDA = nullptr; + + auto &DT = getAnalysis().getDomTree(); auto &PDT = getAnalysis().getPostDomTree(); - DivergencePropagator DP(F, TTI, - getAnalysis().getDomTree(), - PDT, DivergentValues); - DP.populateWithSourcesOfDivergence(); - DP.propagate(); - LLVM_DEBUG( - dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n"; - print(dbgs(), F.getParent()) - ); + + if (shouldUseGPUDivergenceAnalysis(F)) { + // run the new GPU divergence analysis + auto &LI = getAnalysis().getLoopInfo(); + gpuDA = llvm::make_unique(F, DT, PDT, LI, TTI); + + } else { + // run LLVM's existing DivergenceAnalysis + DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues); + DP.populateWithSourcesOfDivergence(); + DP.propagate(); + } + + LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName() + << ":\n"; + print(dbgs(), F.getParent())); + return false; } +bool LegacyDivergenceAnalysis::isDivergent(const Value *V) const { + if (gpuDA) { + return gpuDA->isDivergent(*V); + } + return DivergentValues.count(V); +} + void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const { - if (DivergentValues.empty()) + if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty()) return; - const Value *FirstDivergentValue = *DivergentValues.begin(); - const Function *F; - if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { - F = Arg->getParent(); - } else if (const Instruction *I = - dyn_cast(FirstDivergentValue)) { - F = I->getParent()->getParent(); - } else { - llvm_unreachable("Only arguments and instructions can be divergent"); + + const Function *F = nullptr; + if (!DivergentValues.empty()) { + const Value *FirstDivergentValue = *DivergentValues.begin(); + if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { + F = Arg->getParent(); + } else if (const Instruction *I = + dyn_cast(FirstDivergentValue)) { + F = I->getParent()->getParent(); + } else { + llvm_unreachable("Only arguments and instructions can be divergent"); + } + } else if (gpuDA) { + F = &gpuDA->getFunction(); } + if (!F) + return; // Dumps all divergent values in F, arguments and then instructions. for (auto &Arg : F->args()) { - OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : " "); + OS << (isDivergent(&Arg) ? "DIVERGENT: " : " "); OS << Arg << "\n"; } // Iterate instructions using instructions() to ensure a deterministic order. @@ -332,7 +383,7 @@ void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const { auto &BB = *BI; OS << "\n " << BB.getName() << ":\n"; for (auto &I : BB.instructionsWithoutDebug()) { - OS << (DivergentValues.count(&I) ? "DIVERGENT: " : " "); + OS << (isDivergent(&I) ? "DIVERGENT: " : " "); OS << I << "\n"; } } diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index 4b8e8afdabbd437c64d058c08ac0ef8f4de5c62a..245f318e308a3658ae9ec6077a06a29b4be2c64e 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -420,7 +420,7 @@ void RuntimePointerChecking::groupChecks( // We've computed the grouped checks for this partition. // Save the results and continue with the next one. - std::copy(Groups.begin(), Groups.end(), std::back_inserter(CheckingGroups)); + llvm::copy(Groups, std::back_inserter(CheckingGroups)); } } @@ -1221,18 +1221,19 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, return X == PtrSCEVB; } -bool MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) { +MemoryDepChecker::VectorizationSafetyStatus +MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) { switch (Type) { case NoDep: case Forward: case BackwardVectorizable: - return true; + return VectorizationSafetyStatus::Safe; case Unknown: case ForwardButPreventsForwarding: case Backward: case BackwardVectorizableButPreventsForwarding: - return false; + return VectorizationSafetyStatus::Unsafe; } llvm_unreachable("unexpected DepType!"); } @@ -1317,6 +1318,11 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance, return false; } +void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) { + if (Status < S) + Status = S; +} + /// Given a non-constant (unknown) dependence-distance \p Dist between two /// memory accesses, that have the same stride whose absolute value is given /// in \p Stride, and that have the same type size \p TypeByteSize, @@ -1652,7 +1658,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets, Dependence::DepType Type = isDependent(*A.first, A.second, *B.first, B.second, Strides); - SafeForVectorization &= Dependence::isSafeForVectorization(Type); + mergeInStatus(Dependence::isSafeForVectorization(Type)); // Gather dependences unless we accumulated MaxDependences // dependences. In that case return as soon as we find the first @@ -1669,7 +1675,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets, << "Too many dependences, stopped recording\n"); } } - if (!RecordDependences && !SafeForVectorization) + if (!RecordDependences && !isSafeForVectorization()) return false; } ++OI; @@ -1679,7 +1685,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets, } LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n"); - return SafeForVectorization; + return isSafeForVectorization(); } SmallVector @@ -1870,7 +1876,7 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI, Value *Ptr = ST->getPointerOperand(); if (isUniform(Ptr)) - HasMultipleStoresToLoopInvariantAddress |= + HasDependenceInvolvingLoopInvariantAddress |= !UniformStores.insert(Ptr).second; // If we did *not* see this pointer before, insert it to the read-write @@ -1914,6 +1920,14 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI, IsReadOnlyPtr = true; } + // See if there is an unsafe dependency between a load to a uniform address and + // store to the same uniform address. + if (UniformStores.count(Ptr)) { + LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform " + "load and uniform store to the same address!\n"); + HasDependenceInvolvingLoopInvariantAddress = true; + } + MemoryLocation Loc = MemoryLocation::get(LD); // The TBAA metadata could have a control dependency on the predication // condition, so we cannot rely on it when determining whether or not we @@ -2272,7 +2286,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, PtrRtChecking(llvm::make_unique(SE)), DepChecker(llvm::make_unique(*PSE, L)), TheLoop(L), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false), - HasMultipleStoresToLoopInvariantAddress(false) { + HasDependenceInvolvingLoopInvariantAddress(false) { if (canAnalyzeLoop()) analyzeLoop(AA, LI, TLI, DT); } @@ -2304,8 +2318,8 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { PtrRtChecking->print(OS, Depth); OS << "\n"; - OS.indent(Depth) << "Multiple stores to invariant address were " - << (HasMultipleStoresToLoopInvariantAddress ? "" : "not ") + OS.indent(Depth) << "Non vectorizable stores to invariant address were " + << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ") << "found in loop.\n"; OS.indent(Depth) << "SCEV assumptions:\n"; diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index 4b174b66d1e12aa55122d17bfdae7286bcb1444b..6c779bf2cca03bb5b694ae8798a87915f10ef059 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -237,23 +237,19 @@ MDNode *Loop::getLoopID() const { } void Loop::setLoopID(MDNode *LoopID) const { - assert(LoopID && "Loop ID should not be null"); - assert(LoopID->getNumOperands() > 0 && "Loop ID needs at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "Loop ID should refer to itself"); + assert((!LoopID || LoopID->getNumOperands() > 0) && + "Loop ID needs at least one operand"); + assert((!LoopID || LoopID->getOperand(0) == LoopID) && + "Loop ID should refer to itself"); - if (BasicBlock *Latch = getLoopLatch()) { - Latch->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID); - return; - } - - assert(!getLoopLatch() && - "The loop should have no single latch at this point"); BasicBlock *H = getHeader(); for (BasicBlock *BB : this->blocks()) { Instruction *TI = BB->getTerminator(); for (BasicBlock *Successor : successors(TI)) { - if (Successor == H) + if (Successor == H) { TI->setMetadata(LLVMContext::MD_loop, LoopID); + break; + } } } } diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index 4c1bd7ab08e675b5917b43c32e351f7bbc10b1db..a68f114b83a0dbebcc13f048c9c6248c2e1fbdb2 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -216,10 +216,12 @@ bool LPPassManager::runOnFunction(Function &F) { initializeAnalysisImpl(P); + bool LocalChanged = false; { PassManagerPrettyStackEntry X(P, *CurrentLoop->getHeader()); TimeRegion PassTimer(getPassTimer(P)); - Changed |= P->runOnLoop(CurrentLoop, *this); + LocalChanged = P->runOnLoop(CurrentLoop, *this); + Changed |= LocalChanged; if (EmitICRemark) { unsigned NewSize = F.getInstructionCount(); // Update the size of the function, emit a remark, and update the @@ -235,7 +237,7 @@ bool LPPassManager::runOnFunction(Function &F) { } } - if (Changed) + if (LocalChanged) dumpPassInfo(P, MODIFICATION_MSG, ON_LOOP_MSG, CurrentLoopDeleted ? "" : CurrentLoop->getName()); diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp index 1e45cc4bdf8ef3a93d50321e9ff00f04bbfaea65..3d98fca823ad492a184bab5950adf21e3025e7ba 100644 --- a/lib/Analysis/MemorySSA.cpp +++ b/lib/Analysis/MemorySSA.cpp @@ -2199,13 +2199,14 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) { return StartingAccess->getOptimized(); const Instruction *I = StartingAccess->getMemoryInst(); - UpwardsMemoryQuery Q(I, StartingAccess); // We can't sanely do anything with a fence, since they conservatively clobber // all memory, and have no locations to get pointers from to try to // disambiguate. - if (!Q.IsCall && I->isFenceLike()) + if (!ImmutableCallSite(I) && I->isFenceLike()) return StartingAccess; + UpwardsMemoryQuery Q(I, StartingAccess); + if (isUseTriviallyOptimizableToLiveOnEntry(*MSSA->AA, I)) { MemoryAccess *LiveOnEntry = MSSA->getLiveOnEntryDef(); StartingAccess->setOptimized(LiveOnEntry); diff --git a/lib/Analysis/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp index 880dc2f278522688bc4e2f310685933b9b514c0d..6c817d20368456b1196c47d9485845154d62c61a 100644 --- a/lib/Analysis/MemorySSAUpdater.cpp +++ b/lib/Analysis/MemorySSAUpdater.cpp @@ -93,7 +93,7 @@ MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive( // FIXME: Figure out whether this is dead code and if so remove it. if (!std::equal(Phi->op_begin(), Phi->op_end(), PhiOps.begin())) { // These will have been filled in by the recursive read we did above. - std::copy(PhiOps.begin(), PhiOps.end(), Phi->op_begin()); + llvm::copy(PhiOps, Phi->op_begin()); std::copy(pred_begin(BB), pred_end(BB), Phi->block_begin()); } } else { @@ -1022,7 +1022,7 @@ void MemorySSAUpdater::wireOldPredecessorsToNewImmediatePredecessor( MemoryPhi *Phi = MSSA->getMemoryAccess(Old); if (!Phi) return; - if (pred_size(Old) == 1) { + if (Old->hasNPredecessors(1)) { assert(pred_size(New) == Preds.size() && "Should have moved all predecessors."); MSSA->moveTo(Phi, New, MemorySSA::Beginning); diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index 29b96ac746b2007fabfbeee7824439a39b7d1e0d..6bda1d1b1a36016ed675d04bdbb745ab53abdbd4 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -220,10 +220,19 @@ static void addIntrinsicToSummary( } } -static void computeFunctionSummary( - ModuleSummaryIndex &Index, const Module &M, const Function &F, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, DominatorTree &DT, - bool HasLocalsInUsedOrAsm, DenseSet &CantBePromoted) { +static bool isNonVolatileLoad(const Instruction *I) { + if (const auto *LI = dyn_cast(I)) + return !LI->isVolatile(); + + return false; +} + +static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, + const Function &F, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, DominatorTree &DT, + bool HasLocalsInUsedOrAsm, + DenseSet &CantBePromoted, + bool IsThinLTO) { // Summary not currently supported for anonymous functions, they should // have been named. assert(F.hasName()); @@ -244,6 +253,7 @@ static void computeFunctionSummary( // Add personality function, prefix data and prologue data to function's ref // list. findRefEdges(Index, &F, RefEdges, Visited); + std::vector NonVolatileLoads; bool HasInlineAsmMaybeReferencingInternal = false; for (const BasicBlock &BB : F) @@ -251,6 +261,13 @@ static void computeFunctionSummary( if (isa(I)) continue; ++NumInsts; + if (isNonVolatileLoad(&I)) { + // Postpone processing of non-volatile load instructions + // See comments below + Visited.insert(&I); + NonVolatileLoads.push_back(&I); + continue; + } findRefEdges(Index, &I, RefEdges, Visited); auto CS = ImmutableCallSite(&I); if (!CS) @@ -340,6 +357,24 @@ static void computeFunctionSummary( } } + // By now we processed all instructions in a function, except + // non-volatile loads. All new refs we add in a loop below + // are obviously constant. All constant refs are grouped in the + // end of RefEdges vector, so we can use a single integer value + // to identify them. + unsigned RefCnt = RefEdges.size(); + for (const Instruction *I : NonVolatileLoads) { + Visited.erase(I); + findRefEdges(Index, I, RefEdges, Visited); + } + std::vector Refs = RefEdges.takeVector(); + // Regular LTO module doesn't participate in ThinLTO import, + // so no reference from it can be readonly, since this would + // require importing variable as local copy + if (IsThinLTO) + for (; RefCnt < Refs.size(); ++RefCnt) + Refs[RefCnt].setReadOnly(); + // Explicit add hot edges to enforce importing for designated GUIDs for // sample PGO, to enable the same inlines as the profiled optimized binary. for (auto &I : F.getImportGUIDs()) @@ -357,13 +392,11 @@ static void computeFunctionSummary( F.hasFnAttribute(Attribute::ReadNone), F.hasFnAttribute(Attribute::ReadOnly), F.hasFnAttribute(Attribute::NoRecurse), F.returnDoesNotAlias(), - // Inliner doesn't handle variadic functions. // FIXME: refactor this to use the same code that inliner is using. - F.isVarArg() || - // Don't try to import functions with noinline attribute. - F.getAttributes().hasFnAttribute(Attribute::NoInline)}; + // Don't try to import functions with noinline attribute. + F.getAttributes().hasFnAttribute(Attribute::NoInline)}; auto FuncSummary = llvm::make_unique( - Flags, NumInsts, FunFlags, RefEdges.takeVector(), + Flags, NumInsts, FunFlags, /*EntryCount=*/0, std::move(Refs), CallGraphEdges.takeVector(), TypeTests.takeVector(), TypeTestAssumeVCalls.takeVector(), TypeCheckedLoadVCalls.takeVector(), TypeTestAssumeConstVCalls.takeVector(), @@ -382,8 +415,13 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, bool NonRenamableLocal = isNonRenamableLocal(V); GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal, /* Live = */ false, V.isDSOLocal()); - auto GVarSummary = - llvm::make_unique(Flags, RefEdges.takeVector()); + + // Don't mark variables we won't be able to internalize as read-only. + GlobalVarSummary::GVarFlags VarFlags( + !V.hasComdat() && !V.hasAppendingLinkage() && !V.isInterposable() && + !V.hasAvailableExternallyLinkage() && !V.hasDLLExportStorageClass()); + auto GVarSummary = llvm::make_unique(Flags, VarFlags, + RefEdges.takeVector()); if (NonRenamableLocal) CantBePromoted.insert(V.getGUID()); if (HasBlockAddress) @@ -471,14 +509,15 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( if (Function *F = dyn_cast(GV)) { std::unique_ptr Summary = llvm::make_unique( - GVFlags, 0, + GVFlags, /*InstCount=*/0, FunctionSummary::FFlags{ F->hasFnAttribute(Attribute::ReadNone), F->hasFnAttribute(Attribute::ReadOnly), F->hasFnAttribute(Attribute::NoRecurse), F->returnDoesNotAlias(), /* NoInline = */ false}, - ArrayRef{}, ArrayRef{}, + /*EntryCount=*/0, ArrayRef{}, + ArrayRef{}, ArrayRef{}, ArrayRef{}, ArrayRef{}, @@ -487,13 +526,19 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( Index.addGlobalValueSummary(*GV, std::move(Summary)); } else { std::unique_ptr Summary = - llvm::make_unique(GVFlags, - ArrayRef{}); + llvm::make_unique( + GVFlags, GlobalVarSummary::GVarFlags(), + ArrayRef{}); Index.addGlobalValueSummary(*GV, std::move(Summary)); } }); } + bool IsThinLTO = true; + if (auto *MD = + mdconst::extract_or_null(M.getModuleFlag("ThinLTO"))) + IsThinLTO = MD->getZExtValue(); + // Compute summaries for all functions defined in module, and save in the // index. for (auto &F : M) { @@ -514,7 +559,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( computeFunctionSummary(Index, M, F, BFI, PSI, DT, !LocalsUsed.empty() || HasLocalInlineAsmSymbol, - CantBePromoted); + CantBePromoted, IsThinLTO); } // Compute summaries for all variables defined in module, and save in the @@ -545,11 +590,6 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( setLiveRoot(Index, "llvm.global_dtors"); setLiveRoot(Index, "llvm.global.annotations"); - bool IsThinLTO = true; - if (auto *MD = - mdconst::extract_or_null(M.getModuleFlag("ThinLTO"))) - IsThinLTO = MD->getZExtValue(); - for (auto &GlobalList : Index) { // Ignore entries for references that are undefined in the current module. if (GlobalList.second.SummaryList.empty()) @@ -619,7 +659,7 @@ ModuleSummaryIndexWrapperPass::ModuleSummaryIndexWrapperPass() } bool ModuleSummaryIndexWrapperPass::runOnModule(Module &M) { - auto &PSI = *getAnalysis().getPSI(); + auto *PSI = &getAnalysis().getPSI(); Index.emplace(buildModuleSummaryIndex( M, [this](const Function &F) { @@ -627,7 +667,7 @@ bool ModuleSummaryIndexWrapperPass::runOnModule(Module &M) { *const_cast(&F)) .getBFI()); }, - &PSI)); + PSI)); return false; } diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp index 23e012626e244df2957531a18510132acc6c4bce..281b8f5ab987e803f8bd615c86ab3436c0838793 100644 --- a/lib/Analysis/MustExecute.cpp +++ b/lib/Analysis/MustExecute.cpp @@ -72,6 +72,7 @@ bool ICFLoopSafetyInfo::anyBlockMayThrow() const { void ICFLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) { assert(CurLoop != nullptr && "CurLoop can't be null"); ICF.clear(); + MW.clear(); MayThrow = false; // Figure out the fact that at least one block may throw. for (auto &BB : CurLoop->blocks()) @@ -84,12 +85,14 @@ void ICFLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) { void ICFLoopSafetyInfo::insertInstructionTo(const BasicBlock *BB) { ICF.invalidateBlock(BB); + MW.invalidateBlock(BB); } void ICFLoopSafetyInfo::removeInstruction(const Instruction *Inst) { // TODO: So far we just conservatively drop cache, but maybe we can not do it // when Inst is not an ICF instruction. Follow-up on that. ICF.invalidateBlock(Inst->getParent()); + MW.invalidateBlock(Inst->getParent()); } void LoopSafetyInfo::computeBlockColors(const Loop *CurLoop) { @@ -256,6 +259,34 @@ bool ICFLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT); } +bool ICFLoopSafetyInfo::doesNotWriteMemoryBefore(const BasicBlock *BB, + const Loop *CurLoop) const { + assert(CurLoop->contains(BB) && "Should only be called for loop blocks!"); + + // Fast path: there are no instructions before header. + if (BB == CurLoop->getHeader()) + return true; + + // Collect all transitive predecessors of BB in the same loop. This set will + // be a subset of the blocks within the loop. + SmallPtrSet Predecessors; + collectTransitivePredecessors(CurLoop, BB, Predecessors); + // Find if there any instruction in either predecessor that could write + // to memory. + for (auto *Pred : Predecessors) + if (MW.mayWriteToMemory(Pred)) + return false; + return true; +} + +bool ICFLoopSafetyInfo::doesNotWriteMemoryBefore(const Instruction &I, + const Loop *CurLoop) const { + auto *BB = I.getParent(); + assert(CurLoop->contains(BB) && "Should only be called for loop blocks!"); + return !MW.isDominatedByMemoryWriteFromSameBlock(&I) && + doesNotWriteMemoryBefore(BB, CurLoop); +} + namespace { struct MustExecutePrinter : public FunctionPass { diff --git a/lib/Analysis/ObjCARCInstKind.cpp b/lib/Analysis/ObjCARCInstKind.cpp index f268e2a9abdde9a5992c34ef946f3c4f539707e7..31c432711834a945d6481cc964085609519b4afa 100644 --- a/lib/Analysis/ObjCARCInstKind.cpp +++ b/lib/Analysis/ObjCARCInstKind.cpp @@ -85,97 +85,73 @@ raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, } ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) { - Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - // No (mandatory) arguments. - if (AI == AE) - return StringSwitch(F->getName()) - .Case("objc_autoreleasePoolPush", ARCInstKind::AutoreleasepoolPush) - .Case("clang.arc.use", ARCInstKind::IntrinsicUser) - .Default(ARCInstKind::CallOrUser); - - // One argument. - const Argument *A0 = &*AI++; - if (AI == AE) { - // Argument is a pointer. - PointerType *PTy = dyn_cast(A0->getType()); - if (!PTy) - return ARCInstKind::CallOrUser; - - Type *ETy = PTy->getElementType(); - // Argument is i8*. - if (ETy->isIntegerTy(8)) - return StringSwitch(F->getName()) - .Case("objc_retain", ARCInstKind::Retain) - .Case("objc_retainAutoreleasedReturnValue", ARCInstKind::RetainRV) - .Case("objc_unsafeClaimAutoreleasedReturnValue", ARCInstKind::ClaimRV) - .Case("objc_retainBlock", ARCInstKind::RetainBlock) - .Case("objc_release", ARCInstKind::Release) - .Case("objc_autorelease", ARCInstKind::Autorelease) - .Case("objc_autoreleaseReturnValue", ARCInstKind::AutoreleaseRV) - .Case("objc_autoreleasePoolPop", ARCInstKind::AutoreleasepoolPop) - .Case("objc_retainedObject", ARCInstKind::NoopCast) - .Case("objc_unretainedObject", ARCInstKind::NoopCast) - .Case("objc_unretainedPointer", ARCInstKind::NoopCast) - .Case("objc_retain_autorelease", ARCInstKind::FusedRetainAutorelease) - .Case("objc_retainAutorelease", ARCInstKind::FusedRetainAutorelease) - .Case("objc_retainAutoreleaseReturnValue", - ARCInstKind::FusedRetainAutoreleaseRV) - .Case("objc_sync_enter", ARCInstKind::User) - .Case("objc_sync_exit", ARCInstKind::User) - .Default(ARCInstKind::CallOrUser); - - // Argument is i8** - if (PointerType *Pte = dyn_cast(ETy)) - if (Pte->getElementType()->isIntegerTy(8)) - return StringSwitch(F->getName()) - .Case("objc_loadWeakRetained", ARCInstKind::LoadWeakRetained) - .Case("objc_loadWeak", ARCInstKind::LoadWeak) - .Case("objc_destroyWeak", ARCInstKind::DestroyWeak) - .Default(ARCInstKind::CallOrUser); - - // Anything else with one argument. + Intrinsic::ID ID = F->getIntrinsicID(); + switch (ID) { + default: return ARCInstKind::CallOrUser; + case Intrinsic::objc_autorelease: + return ARCInstKind::Autorelease; + case Intrinsic::objc_autoreleasePoolPop: + return ARCInstKind::AutoreleasepoolPop; + case Intrinsic::objc_autoreleasePoolPush: + return ARCInstKind::AutoreleasepoolPush; + case Intrinsic::objc_autoreleaseReturnValue: + return ARCInstKind::AutoreleaseRV; + case Intrinsic::objc_copyWeak: + return ARCInstKind::CopyWeak; + case Intrinsic::objc_destroyWeak: + return ARCInstKind::DestroyWeak; + case Intrinsic::objc_initWeak: + return ARCInstKind::InitWeak; + case Intrinsic::objc_loadWeak: + return ARCInstKind::LoadWeak; + case Intrinsic::objc_loadWeakRetained: + return ARCInstKind::LoadWeakRetained; + case Intrinsic::objc_moveWeak: + return ARCInstKind::MoveWeak; + case Intrinsic::objc_release: + return ARCInstKind::Release; + case Intrinsic::objc_retain: + return ARCInstKind::Retain; + case Intrinsic::objc_retainAutorelease: + return ARCInstKind::FusedRetainAutorelease; + case Intrinsic::objc_retainAutoreleaseReturnValue: + return ARCInstKind::FusedRetainAutoreleaseRV; + case Intrinsic::objc_retainAutoreleasedReturnValue: + return ARCInstKind::RetainRV; + case Intrinsic::objc_retainBlock: + return ARCInstKind::RetainBlock; + case Intrinsic::objc_storeStrong: + return ARCInstKind::StoreStrong; + case Intrinsic::objc_storeWeak: + return ARCInstKind::StoreWeak; + case Intrinsic::objc_clang_arc_use: + return ARCInstKind::IntrinsicUser; + case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue: + return ARCInstKind::ClaimRV; + case Intrinsic::objc_retainedObject: + return ARCInstKind::NoopCast; + case Intrinsic::objc_unretainedObject: + return ARCInstKind::NoopCast; + case Intrinsic::objc_unretainedPointer: + return ARCInstKind::NoopCast; + case Intrinsic::objc_retain_autorelease: + return ARCInstKind::FusedRetainAutorelease; + case Intrinsic::objc_sync_enter: + return ARCInstKind::User; + case Intrinsic::objc_sync_exit: + return ARCInstKind::User; + case Intrinsic::objc_arc_annotation_topdown_bbstart: + case Intrinsic::objc_arc_annotation_topdown_bbend: + case Intrinsic::objc_arc_annotation_bottomup_bbstart: + case Intrinsic::objc_arc_annotation_bottomup_bbend: + // Ignore annotation calls. This is important to stop the + // optimizer from treating annotations as uses which would + // make the state of the pointers they are attempting to + // elucidate to be incorrect. + return ARCInstKind::None; } - - // Two arguments, first is i8**. - const Argument *A1 = &*AI++; - if (AI == AE) - if (PointerType *PTy = dyn_cast(A0->getType())) - if (PointerType *Pte = dyn_cast(PTy->getElementType())) - if (Pte->getElementType()->isIntegerTy(8)) - if (PointerType *PTy1 = dyn_cast(A1->getType())) { - Type *ETy1 = PTy1->getElementType(); - // Second argument is i8* - if (ETy1->isIntegerTy(8)) - return StringSwitch(F->getName()) - .Case("objc_storeWeak", ARCInstKind::StoreWeak) - .Case("objc_initWeak", ARCInstKind::InitWeak) - .Case("objc_storeStrong", ARCInstKind::StoreStrong) - .Default(ARCInstKind::CallOrUser); - // Second argument is i8**. - if (PointerType *Pte1 = dyn_cast(ETy1)) - if (Pte1->getElementType()->isIntegerTy(8)) - return StringSwitch(F->getName()) - .Case("objc_moveWeak", ARCInstKind::MoveWeak) - .Case("objc_copyWeak", ARCInstKind::CopyWeak) - // Ignore annotation calls. This is important to stop the - // optimizer from treating annotations as uses which would - // make the state of the pointers they are attempting to - // elucidate to be incorrect. - .Case("llvm.arc.annotation.topdown.bbstart", - ARCInstKind::None) - .Case("llvm.arc.annotation.topdown.bbend", - ARCInstKind::None) - .Case("llvm.arc.annotation.bottomup.bbstart", - ARCInstKind::None) - .Case("llvm.arc.annotation.bottomup.bbend", - ARCInstKind::None) - .Default(ARCInstKind::CallOrUser); - } - - // Anything else. - return ARCInstKind::CallOrUser; } // A whitelist of intrinsics that we know do not use objc pointers or decrement diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp index 7472b6201c2b3e36a08f087c64f95e30d0712392..1d70c75f2e1c9d532edf6531589e155721874fe1 100644 --- a/lib/Analysis/ProfileSummaryInfo.cpp +++ b/lib/Analysis/ProfileSummaryInfo.cpp @@ -39,11 +39,6 @@ static cl::opt ProfileSummaryCutoffCold( cl::desc("A count is cold if it is below the minimum count" " to reach this percentile of total counts.")); -static cl::opt ProfileSampleAccurate( - "profile-sample-accurate", cl::Hidden, cl::init(false), - cl::desc("If the sample profile is accurate, we will mark all un-sampled " - "callsite as cold. Otherwise, treat un-sampled callsites as if " - "we have no profile.")); static cl::opt ProfileSummaryHugeWorkingSetSizeThreshold( "profile-summary-huge-working-set-size-threshold", cl::Hidden, cl::init(15000), cl::ZeroOrMore, @@ -151,7 +146,7 @@ bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F, return true; } for (const auto &BB : *F) - if (isHotBB(&BB, &BFI)) + if (isHotBlock(&BB, &BFI)) return true; return false; } @@ -180,7 +175,7 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F, return false; } for (const auto &BB : *F) - if (!isColdBB(&BB, &BFI)) + if (!isColdBlock(&BB, &BFI)) return false; return true; } @@ -253,14 +248,14 @@ uint64_t ProfileSummaryInfo::getOrCompColdCountThreshold() { return ColdCountThreshold ? ColdCountThreshold.getValue() : 0; } -bool ProfileSummaryInfo::isHotBB(const BasicBlock *B, BlockFrequencyInfo *BFI) { - auto Count = BFI->getBlockProfileCount(B); +bool ProfileSummaryInfo::isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) { + auto Count = BFI->getBlockProfileCount(BB); return Count && isHotCount(*Count); } -bool ProfileSummaryInfo::isColdBB(const BasicBlock *B, +bool ProfileSummaryInfo::isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) { - auto Count = BFI->getBlockProfileCount(B); + auto Count = BFI->getBlockProfileCount(BB); return Count && isColdCount(*Count); } @@ -278,11 +273,7 @@ bool ProfileSummaryInfo::isColdCallSite(const CallSite &CS, // In SamplePGO, if the caller has been sampled, and there is no profile // annotated on the callsite, we consider the callsite as cold. - // If there is no profile for the caller, and we know the profile is - // accurate, we consider the callsite as cold. - return (hasSampleProfile() && - (CS.getCaller()->hasProfileData() || ProfileSampleAccurate || - CS.getCaller()->hasFnAttribute("profile-sample-accurate"))); + return hasSampleProfile() && CS.getCaller()->hasProfileData(); } INITIALIZE_PASS(ProfileSummaryInfoWrapperPass, "profile-summary-info", diff --git a/lib/Analysis/StackSafetyAnalysis.cpp b/lib/Analysis/StackSafetyAnalysis.cpp new file mode 100644 index 0000000000000000000000000000000000000000..49d9b3f57c620c314f76d4d5c0d206ca5ad8473f --- /dev/null +++ b/lib/Analysis/StackSafetyAnalysis.cpp @@ -0,0 +1,676 @@ +//===- StackSafetyAnalysis.cpp - Stack memory safety analysis -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/StackSafetyAnalysis.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "stack-safety" + +static cl::opt StackSafetyMaxIterations("stack-safety-max-iterations", + cl::init(20), cl::Hidden); + +namespace { + +/// Rewrite an SCEV expression for a memory access address to an expression that +/// represents offset from the given alloca. +class AllocaOffsetRewriter : public SCEVRewriteVisitor { + const Value *AllocaPtr; + +public: + AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr) + : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {} + + const SCEV *visit(const SCEV *Expr) { + // Only re-write the expression if the alloca is used in an addition + // expression (it can be used in other types of expressions if it's cast to + // an int and passed as an argument.) + if (!isa(Expr) && !isa(Expr) && + !isa(Expr)) + return Expr; + return SCEVRewriteVisitor::visit(Expr); + } + + const SCEV *visitUnknown(const SCEVUnknown *Expr) { + // FIXME: look through one or several levels of definitions? + // This can be inttoptr(AllocaPtr) and SCEV would not unwrap + // it for us. + if (Expr->getValue() == AllocaPtr) + return SE.getZero(Expr->getType()); + return Expr; + } +}; + +/// Describes use of address in as a function call argument. +struct PassAsArgInfo { + /// Function being called. + const GlobalValue *Callee = nullptr; + /// Index of argument which pass address. + size_t ParamNo = 0; + // Offset range of address from base address (alloca or calling function + // argument). + // Range should never set to empty-set, that is an invalid access range + // that can cause empty-set to be propagated with ConstantRange::add + ConstantRange Offset; + PassAsArgInfo(const GlobalValue *Callee, size_t ParamNo, ConstantRange Offset) + : Callee(Callee), ParamNo(ParamNo), Offset(Offset) {} + + StringRef getName() const { return Callee->getName(); } +}; + +raw_ostream &operator<<(raw_ostream &OS, const PassAsArgInfo &P) { + return OS << "@" << P.getName() << "(arg" << P.ParamNo << ", " << P.Offset + << ")"; +} + +/// Describe uses of address (alloca or parameter) inside of the function. +struct UseInfo { + // Access range if the address (alloca or parameters). + // It is allowed to be empty-set when there are no known accesses. + ConstantRange Range; + + // List of calls which pass address as an argument. + SmallVector Calls; + + explicit UseInfo(unsigned PointerSize) : Range{PointerSize, false} {} + + void updateRange(ConstantRange R) { Range = Range.unionWith(R); } +}; + +raw_ostream &operator<<(raw_ostream &OS, const UseInfo &U) { + OS << U.Range; + for (auto &Call : U.Calls) + OS << ", " << Call; + return OS; +} + +struct AllocaInfo { + const AllocaInst *AI = nullptr; + uint64_t Size = 0; + UseInfo Use; + + AllocaInfo(unsigned PointerSize, const AllocaInst *AI, uint64_t Size) + : AI(AI), Size(Size), Use(PointerSize) {} + + StringRef getName() const { return AI->getName(); } +}; + +raw_ostream &operator<<(raw_ostream &OS, const AllocaInfo &A) { + return OS << A.getName() << "[" << A.Size << "]: " << A.Use; +} + +struct ParamInfo { + const Argument *Arg = nullptr; + UseInfo Use; + + explicit ParamInfo(unsigned PointerSize, const Argument *Arg) + : Arg(Arg), Use(PointerSize) {} + + StringRef getName() const { return Arg ? Arg->getName() : ""; } +}; + +raw_ostream &operator<<(raw_ostream &OS, const ParamInfo &P) { + return OS << P.getName() << "[]: " << P.Use; +} + +/// Calculate the allocation size of a given alloca. Returns 0 if the +/// size can not be statically determined. +uint64_t getStaticAllocaAllocationSize(const AllocaInst *AI) { + const DataLayout &DL = AI->getModule()->getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType()); + if (AI->isArrayAllocation()) { + auto C = dyn_cast(AI->getArraySize()); + if (!C) + return 0; + Size *= C->getZExtValue(); + } + return Size; +} + +} // end anonymous namespace + +/// Describes uses of allocas and parameters inside of a single function. +struct StackSafetyInfo::FunctionInfo { + // May be a Function or a GlobalAlias + const GlobalValue *GV = nullptr; + // Informations about allocas uses. + SmallVector Allocas; + // Informations about parameters uses. + SmallVector Params; + // TODO: describe return value as depending on one or more of its arguments. + + // StackSafetyDataFlowAnalysis counter stored here for faster access. + int UpdateCount = 0; + + FunctionInfo(const StackSafetyInfo &SSI) : FunctionInfo(*SSI.Info) {} + + explicit FunctionInfo(const Function *F) : GV(F){}; + // Creates FunctionInfo that forwards all the parameters to the aliasee. + explicit FunctionInfo(const GlobalAlias *A); + + FunctionInfo(FunctionInfo &&) = default; + + bool IsDSOLocal() const { return GV->isDSOLocal(); }; + + bool IsInterposable() const { return GV->isInterposable(); }; + + StringRef getName() const { return GV->getName(); } + + void print(raw_ostream &O) const { + // TODO: Consider different printout format after + // StackSafetyDataFlowAnalysis. Calls and parameters are irrelevant then. + O << " @" << getName() << (IsDSOLocal() ? "" : " dso_preemptable") + << (IsInterposable() ? " interposable" : "") << "\n"; + O << " args uses:\n"; + for (auto &P : Params) + O << " " << P << "\n"; + O << " allocas uses:\n"; + for (auto &AS : Allocas) + O << " " << AS << "\n"; + } + +private: + FunctionInfo(const FunctionInfo &) = default; +}; + +StackSafetyInfo::FunctionInfo::FunctionInfo(const GlobalAlias *A) : GV(A) { + unsigned PointerSize = A->getParent()->getDataLayout().getPointerSizeInBits(); + const GlobalObject *Aliasee = A->getBaseObject(); + const FunctionType *Type = cast(Aliasee->getValueType()); + // 'Forward' all parameters to this alias to the aliasee + for (unsigned ArgNo = 0; ArgNo < Type->getNumParams(); ArgNo++) { + Params.emplace_back(PointerSize, nullptr); + UseInfo &US = Params.back().Use; + US.Calls.emplace_back(Aliasee, ArgNo, ConstantRange(APInt(PointerSize, 0))); + } +} + +namespace { + +class StackSafetyLocalAnalysis { + const Function &F; + const DataLayout &DL; + ScalarEvolution &SE; + unsigned PointerSize = 0; + + const ConstantRange UnknownRange; + + ConstantRange offsetFromAlloca(Value *Addr, const Value *AllocaPtr); + ConstantRange getAccessRange(Value *Addr, const Value *AllocaPtr, + uint64_t AccessSize); + ConstantRange getMemIntrinsicAccessRange(const MemIntrinsic *MI, const Use &U, + const Value *AllocaPtr); + + bool analyzeAllUses(const Value *Ptr, UseInfo &AS); + + ConstantRange getRange(uint64_t Lower, uint64_t Upper) const { + return ConstantRange(APInt(PointerSize, Lower), APInt(PointerSize, Upper)); + } + +public: + StackSafetyLocalAnalysis(const Function &F, ScalarEvolution &SE) + : F(F), DL(F.getParent()->getDataLayout()), SE(SE), + PointerSize(DL.getPointerSizeInBits()), + UnknownRange(PointerSize, true) {} + + // Run the transformation on the associated function. + StackSafetyInfo run(); +}; + +ConstantRange +StackSafetyLocalAnalysis::offsetFromAlloca(Value *Addr, + const Value *AllocaPtr) { + if (!SE.isSCEVable(Addr->getType())) + return UnknownRange; + + AllocaOffsetRewriter Rewriter(SE, AllocaPtr); + const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr)); + ConstantRange Offset = SE.getUnsignedRange(Expr).zextOrTrunc(PointerSize); + assert(!Offset.isEmptySet()); + return Offset; +} + +ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr, + const Value *AllocaPtr, + uint64_t AccessSize) { + if (!SE.isSCEVable(Addr->getType())) + return UnknownRange; + + AllocaOffsetRewriter Rewriter(SE, AllocaPtr); + const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr)); + + ConstantRange AccessStartRange = + SE.getUnsignedRange(Expr).zextOrTrunc(PointerSize); + ConstantRange SizeRange = getRange(0, AccessSize); + ConstantRange AccessRange = AccessStartRange.add(SizeRange); + assert(!AccessRange.isEmptySet()); + return AccessRange; +} + +ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange( + const MemIntrinsic *MI, const Use &U, const Value *AllocaPtr) { + if (auto MTI = dyn_cast(MI)) { + if (MTI->getRawSource() != U && MTI->getRawDest() != U) + return getRange(0, 1); + } else { + if (MI->getRawDest() != U) + return getRange(0, 1); + } + const auto *Len = dyn_cast(MI->getLength()); + // Non-constant size => unsafe. FIXME: try SCEV getRange. + if (!Len) + return UnknownRange; + ConstantRange AccessRange = getAccessRange(U, AllocaPtr, Len->getZExtValue()); + return AccessRange; +} + +/// The function analyzes all local uses of Ptr (alloca or argument) and +/// calculates local access range and all function calls where it was used. +bool StackSafetyLocalAnalysis::analyzeAllUses(const Value *Ptr, UseInfo &US) { + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(Ptr); + + // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. + while (!WorkList.empty()) { + const Value *V = WorkList.pop_back_val(); + for (const Use &UI : V->uses()) { + auto I = cast(UI.getUser()); + assert(V == UI.get()); + + switch (I->getOpcode()) { + case Instruction::Load: { + US.updateRange( + getAccessRange(UI, Ptr, DL.getTypeStoreSize(I->getType()))); + break; + } + + case Instruction::VAArg: + // "va-arg" from a pointer is safe. + break; + case Instruction::Store: { + if (V == I->getOperand(0)) { + // Stored the pointer - conservatively assume it may be unsafe. + US.updateRange(UnknownRange); + return false; + } + US.updateRange(getAccessRange( + UI, Ptr, DL.getTypeStoreSize(I->getOperand(0)->getType()))); + break; + } + + case Instruction::Ret: + // Information leak. + // FIXME: Process parameters correctly. This is a leak only if we return + // alloca. + US.updateRange(UnknownRange); + return false; + + case Instruction::Call: + case Instruction::Invoke: { + ImmutableCallSite CS(I); + + if (const IntrinsicInst *II = dyn_cast(I)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) + break; + } + + if (const MemIntrinsic *MI = dyn_cast(I)) { + US.updateRange(getMemIntrinsicAccessRange(MI, UI, Ptr)); + break; + } + + // FIXME: consult devirt? + // Do not follow aliases, otherwise we could inadvertently follow + // dso_preemptable aliases or aliases with interposable linkage. + const GlobalValue *Callee = dyn_cast( + CS.getCalledValue()->stripPointerCastsNoFollowAliases()); + if (!Callee) { + US.updateRange(UnknownRange); + return false; + } + + assert(isa(Callee) || isa(Callee)); + + ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); + for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) { + if (A->get() == V) { + ConstantRange Offset = offsetFromAlloca(UI, Ptr); + US.Calls.emplace_back(Callee, A - B, Offset); + } + } + + break; + } + + default: + if (Visited.insert(I).second) + WorkList.push_back(cast(I)); + } + } + } + + return true; +} + +StackSafetyInfo StackSafetyLocalAnalysis::run() { + StackSafetyInfo::FunctionInfo Info(&F); + assert(!F.isDeclaration() && + "Can't run StackSafety on a function declaration"); + + LLVM_DEBUG(dbgs() << "[StackSafety] " << F.getName() << "\n"); + + for (auto &I : instructions(F)) { + if (auto AI = dyn_cast(&I)) { + Info.Allocas.emplace_back(PointerSize, AI, + getStaticAllocaAllocationSize(AI)); + AllocaInfo &AS = Info.Allocas.back(); + analyzeAllUses(AI, AS.Use); + } + } + + for (const Argument &A : make_range(F.arg_begin(), F.arg_end())) { + Info.Params.emplace_back(PointerSize, &A); + ParamInfo &PS = Info.Params.back(); + analyzeAllUses(&A, PS.Use); + } + + LLVM_DEBUG(dbgs() << "[StackSafety] done\n"); + LLVM_DEBUG(Info.print(dbgs())); + return StackSafetyInfo(std::move(Info)); +} + +class StackSafetyDataFlowAnalysis { + using FunctionMap = + std::map; + + FunctionMap Functions; + // Callee-to-Caller multimap. + DenseMap> Callers; + SetVector WorkList; + + unsigned PointerSize = 0; + const ConstantRange UnknownRange; + + ConstantRange getArgumentAccessRange(const GlobalValue *Callee, + unsigned ParamNo) const; + bool updateOneUse(UseInfo &US, bool UpdateToFullSet); + void updateOneNode(const GlobalValue *Callee, + StackSafetyInfo::FunctionInfo &FS); + void updateOneNode(const GlobalValue *Callee) { + updateOneNode(Callee, Functions.find(Callee)->second); + } + void updateAllNodes() { + for (auto &F : Functions) + updateOneNode(F.first, F.second); + } + void runDataFlow(); + void verifyFixedPoint(); + +public: + StackSafetyDataFlowAnalysis( + Module &M, std::function FI); + StackSafetyGlobalInfo run(); +}; + +StackSafetyDataFlowAnalysis::StackSafetyDataFlowAnalysis( + Module &M, std::function FI) + : PointerSize(M.getDataLayout().getPointerSizeInBits()), + UnknownRange(PointerSize, true) { + // Without ThinLTO, run the local analysis for every function in the TU and + // then run the DFA. + for (auto &F : M.functions()) + if (!F.isDeclaration()) + Functions.emplace(&F, FI(F)); + for (auto &A : M.aliases()) + if (isa(A.getBaseObject())) + Functions.emplace(&A, StackSafetyInfo::FunctionInfo(&A)); +} + +ConstantRange +StackSafetyDataFlowAnalysis::getArgumentAccessRange(const GlobalValue *Callee, + unsigned ParamNo) const { + auto IT = Functions.find(Callee); + // Unknown callee (outside of LTO domain or an indirect call). + if (IT == Functions.end()) + return UnknownRange; + const StackSafetyInfo::FunctionInfo &FS = IT->second; + // The definition of this symbol may not be the definition in this linkage + // unit. + if (!FS.IsDSOLocal() || FS.IsInterposable()) + return UnknownRange; + if (ParamNo >= FS.Params.size()) // possibly vararg + return UnknownRange; + return FS.Params[ParamNo].Use.Range; +} + +bool StackSafetyDataFlowAnalysis::updateOneUse(UseInfo &US, + bool UpdateToFullSet) { + bool Changed = false; + for (auto &CS : US.Calls) { + assert(!CS.Offset.isEmptySet() && + "Param range can't be empty-set, invalid offset range"); + + ConstantRange CalleeRange = getArgumentAccessRange(CS.Callee, CS.ParamNo); + CalleeRange = CalleeRange.add(CS.Offset); + if (!US.Range.contains(CalleeRange)) { + Changed = true; + if (UpdateToFullSet) + US.Range = UnknownRange; + else + US.Range = US.Range.unionWith(CalleeRange); + } + } + return Changed; +} + +void StackSafetyDataFlowAnalysis::updateOneNode( + const GlobalValue *Callee, StackSafetyInfo::FunctionInfo &FS) { + bool UpdateToFullSet = FS.UpdateCount > StackSafetyMaxIterations; + bool Changed = false; + for (auto &AS : FS.Allocas) + Changed |= updateOneUse(AS.Use, UpdateToFullSet); + for (auto &PS : FS.Params) + Changed |= updateOneUse(PS.Use, UpdateToFullSet); + + if (Changed) { + LLVM_DEBUG(dbgs() << "=== update [" << FS.UpdateCount + << (UpdateToFullSet ? ", full-set" : "") << "] " + << FS.getName() << "\n"); + // Callers of this function may need updating. + for (auto &CallerID : Callers[Callee]) + WorkList.insert(CallerID); + + ++FS.UpdateCount; + } +} + +void StackSafetyDataFlowAnalysis::runDataFlow() { + Callers.clear(); + WorkList.clear(); + + SmallVector Callees; + for (auto &F : Functions) { + Callees.clear(); + StackSafetyInfo::FunctionInfo &FS = F.second; + for (auto &AS : FS.Allocas) + for (auto &CS : AS.Use.Calls) + Callees.push_back(CS.Callee); + for (auto &PS : FS.Params) + for (auto &CS : PS.Use.Calls) + Callees.push_back(CS.Callee); + + llvm::sort(Callees); + Callees.erase(std::unique(Callees.begin(), Callees.end()), Callees.end()); + + for (auto &Callee : Callees) + Callers[Callee].push_back(F.first); + } + + updateAllNodes(); + + while (!WorkList.empty()) { + const GlobalValue *Callee = WorkList.back(); + WorkList.pop_back(); + updateOneNode(Callee); + } +} + +void StackSafetyDataFlowAnalysis::verifyFixedPoint() { + WorkList.clear(); + updateAllNodes(); + assert(WorkList.empty()); +} + +StackSafetyGlobalInfo StackSafetyDataFlowAnalysis::run() { + runDataFlow(); + LLVM_DEBUG(verifyFixedPoint()); + + StackSafetyGlobalInfo SSI; + for (auto &F : Functions) + SSI.emplace(F.first, std::move(F.second)); + return SSI; +} + +void print(const StackSafetyGlobalInfo &SSI, raw_ostream &O, const Module &M) { + size_t Count = 0; + for (auto &F : M.functions()) + if (!F.isDeclaration()) { + SSI.find(&F)->second.print(O); + O << "\n"; + ++Count; + } + for (auto &A : M.aliases()) { + SSI.find(&A)->second.print(O); + O << "\n"; + ++Count; + } + assert(Count == SSI.size() && "Unexpected functions in the result"); +} + +} // end anonymous namespace + +StackSafetyInfo::StackSafetyInfo() = default; +StackSafetyInfo::StackSafetyInfo(StackSafetyInfo &&) = default; +StackSafetyInfo &StackSafetyInfo::operator=(StackSafetyInfo &&) = default; + +StackSafetyInfo::StackSafetyInfo(FunctionInfo &&Info) + : Info(new FunctionInfo(std::move(Info))) {} + +StackSafetyInfo::~StackSafetyInfo() = default; + +void StackSafetyInfo::print(raw_ostream &O) const { Info->print(O); } + +AnalysisKey StackSafetyAnalysis::Key; + +StackSafetyInfo StackSafetyAnalysis::run(Function &F, + FunctionAnalysisManager &AM) { + StackSafetyLocalAnalysis SSLA(F, AM.getResult(F)); + return SSLA.run(); +} + +PreservedAnalyses StackSafetyPrinterPass::run(Function &F, + FunctionAnalysisManager &AM) { + OS << "'Stack Safety Local Analysis' for function '" << F.getName() << "'\n"; + AM.getResult(F).print(OS); + return PreservedAnalyses::all(); +} + +char StackSafetyInfoWrapperPass::ID = 0; + +StackSafetyInfoWrapperPass::StackSafetyInfoWrapperPass() : FunctionPass(ID) { + initializeStackSafetyInfoWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +void StackSafetyInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.setPreservesAll(); +} + +void StackSafetyInfoWrapperPass::print(raw_ostream &O, const Module *M) const { + SSI.print(O); +} + +bool StackSafetyInfoWrapperPass::runOnFunction(Function &F) { + StackSafetyLocalAnalysis SSLA( + F, getAnalysis().getSE()); + SSI = StackSafetyInfo(SSLA.run()); + return false; +} + +AnalysisKey StackSafetyGlobalAnalysis::Key; + +StackSafetyGlobalInfo +StackSafetyGlobalAnalysis::run(Module &M, ModuleAnalysisManager &AM) { + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + + StackSafetyDataFlowAnalysis SSDFA( + M, [&FAM](Function &F) -> const StackSafetyInfo & { + return FAM.getResult(F); + }); + return SSDFA.run(); +} + +PreservedAnalyses StackSafetyGlobalPrinterPass::run(Module &M, + ModuleAnalysisManager &AM) { + OS << "'Stack Safety Analysis' for module '" << M.getName() << "'\n"; + print(AM.getResult(M), OS, M); + return PreservedAnalyses::all(); +} + +char StackSafetyGlobalInfoWrapperPass::ID = 0; + +StackSafetyGlobalInfoWrapperPass::StackSafetyGlobalInfoWrapperPass() + : ModulePass(ID) { + initializeStackSafetyGlobalInfoWrapperPassPass( + *PassRegistry::getPassRegistry()); +} + +void StackSafetyGlobalInfoWrapperPass::print(raw_ostream &O, + const Module *M) const { + ::print(SSI, O, *M); +} + +void StackSafetyGlobalInfoWrapperPass::getAnalysisUsage( + AnalysisUsage &AU) const { + AU.addRequired(); +} + +bool StackSafetyGlobalInfoWrapperPass::runOnModule(Module &M) { + StackSafetyDataFlowAnalysis SSDFA( + M, [this](Function &F) -> const StackSafetyInfo & { + return getAnalysis(F).getResult(); + }); + SSI = SSDFA.run(); + return false; +} + +static const char LocalPassArg[] = "stack-safety-local"; +static const char LocalPassName[] = "Stack Safety Local Analysis"; +INITIALIZE_PASS_BEGIN(StackSafetyInfoWrapperPass, LocalPassArg, LocalPassName, + false, true) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(StackSafetyInfoWrapperPass, LocalPassArg, LocalPassName, + false, true) + +static const char GlobalPassName[] = "Stack Safety Analysis"; +INITIALIZE_PASS_BEGIN(StackSafetyGlobalInfoWrapperPass, DEBUG_TYPE, + GlobalPassName, false, false) +INITIALIZE_PASS_DEPENDENCY(StackSafetyInfoWrapperPass) +INITIALIZE_PASS_END(StackSafetyGlobalInfoWrapperPass, DEBUG_TYPE, + GlobalPassName, false, false) diff --git a/lib/Analysis/SyntheticCountsUtils.cpp b/lib/Analysis/SyntheticCountsUtils.cpp index b085fa274d7fbad13adbc5642e69d1e37b1b8931..386396bcff364f3cf0222a3f7994fd4c46023686 100644 --- a/lib/Analysis/SyntheticCountsUtils.cpp +++ b/lib/Analysis/SyntheticCountsUtils.cpp @@ -14,12 +14,12 @@ #include "llvm/Analysis/SyntheticCountsUtils.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/ModuleSummaryIndex.h" using namespace llvm; @@ -29,7 +29,7 @@ void SyntheticCountsUtils::propagateFromSCC( const SccTy &SCC, GetRelBBFreqTy GetRelBBFreq, GetCountTy GetCount, AddCountTy AddCount) { - SmallPtrSet SCCNodes; + DenseSet SCCNodes; SmallVector, 8> SCCEdges, NonSCCEdges; for (auto &Node : SCC) @@ -111,3 +111,4 @@ void SyntheticCountsUtils::propagate(const CallGraphType &CG, } template class llvm::SyntheticCountsUtils; +template class llvm::SyntheticCountsUtils; diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 6e4eb8ff0cdf2a47cddd20a38e6c9cf3a568b027..79fe6dc7d87cb26dc4fa2b5989fbf3a6493f434c 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -389,8 +389,7 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const { } TargetTransformInfo::OperandValueKind -TargetTransformInfo::getOperandInfo(Value *V, - OperandValueProperties &OpProps) const { +TargetTransformInfo::getOperandInfo(Value *V, OperandValueProperties &OpProps) { OperandValueKind OpInfo = OK_AnyValue; OpProps = OP_None; @@ -400,6 +399,13 @@ TargetTransformInfo::getOperandInfo(Value *V, return OK_UniformConstantValue; } + // A broadcast shuffle creates a uniform value. + // TODO: Add support for non-zero index broadcasts. + // TODO: Add support for different source vector width. + if (auto *ShuffleInst = dyn_cast(V)) + if (ShuffleInst->isZeroEltSplat()) + OpInfo = OK_UniformValue; + const Value *Splat = getSplatValue(V); // Check for a splat of a constant or for a non uniform vector of constants @@ -1108,14 +1114,20 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const { } case Instruction::ShuffleVector: { const ShuffleVectorInst *Shuffle = cast(I); - // TODO: Identify and add costs for insert/extract subvector, etc. + Type *Ty = Shuffle->getType(); + Type *SrcTy = Shuffle->getOperand(0)->getType(); + + // TODO: Identify and add costs for insert subvector, etc. + int SubIndex; + if (Shuffle->isExtractSubvectorMask(SubIndex)) + return TTIImpl->getShuffleCost(SK_ExtractSubvector, SrcTy, SubIndex, Ty); + if (Shuffle->changesLength()) return -1; if (Shuffle->isIdentity()) return 0; - Type *Ty = Shuffle->getType(); if (Shuffle->isReverse()) return TTIImpl->getShuffleCost(SK_Reverse, Ty, 0, nullptr); diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index ed17441d1e404e2c70f5ea62b191987394ffcd3d..6f0196069f2d64ff9d819049b2f8376dcb8c24a6 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -1506,6 +1506,27 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, // of bits which might be set provided by popcnt KnownOne2. break; } + case Intrinsic::fshr: + case Intrinsic::fshl: { + const APInt *SA; + if (!match(I->getOperand(2), m_APInt(SA))) + break; + + // Normalize to funnel shift left. + uint64_t ShiftAmt = SA->urem(BitWidth); + if (II->getIntrinsicID() == Intrinsic::fshr) + ShiftAmt = BitWidth - ShiftAmt; + + KnownBits Known3(Known); + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(1), Known3, Depth + 1, Q); + + Known.Zero = + Known2.Zero.shl(ShiftAmt) | Known3.Zero.lshr(BitWidth - ShiftAmt); + Known.One = + Known2.One.shl(ShiftAmt) | Known3.One.lshr(BitWidth - ShiftAmt); + break; + } case Intrinsic::x86_sse42_crc32_64_64: Known.Zero.setBitsFrom(32); break; @@ -4001,13 +4022,11 @@ OverflowResult llvm::computeOverflowForUnsignedAdd( if (LHSKnown.isNegative() && RHSKnown.isNegative()) { // The sign bit is set in both cases: this MUST overflow. - // Create a simple add instruction, and insert it into the struct. return OverflowResult::AlwaysOverflows; } if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) { // The sign bit is clear in both cases: this CANNOT overflow. - // Create a simple add instruction, and insert it into the struct. return OverflowResult::NeverOverflows; } } @@ -4124,11 +4143,18 @@ OverflowResult llvm::computeOverflowForUnsignedSub(const Value *LHS, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { - // If the LHS is negative and the RHS is non-negative, no unsigned wrap. KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT); - KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT); - if (LHSKnown.isNegative() && RHSKnown.isNonNegative()) - return OverflowResult::NeverOverflows; + if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) { + KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT); + + // If the LHS is negative and the RHS is non-negative, no unsigned wrap. + if (LHSKnown.isNegative() && RHSKnown.isNonNegative()) + return OverflowResult::NeverOverflows; + + // If the LHS is non-negative and the RHS negative, we always wrap. + if (LHSKnown.isNonNegative() && RHSKnown.isNegative()) + return OverflowResult::AlwaysOverflows; + } return OverflowResult::MayOverflow; } @@ -5373,3 +5399,35 @@ Optional llvm::isImpliedCondition(const Value *LHS, const Value *RHS, } return None; } + +Optional llvm::isImpliedByDomCondition(const Value *Cond, + const Instruction *ContextI, + const DataLayout &DL) { + assert(Cond->getType()->isIntOrIntVectorTy(1) && "Condition must be bool"); + if (!ContextI || !ContextI->getParent()) + return None; + + // TODO: This is a poor/cheap way to determine dominance. Should we use a + // dominator tree (eg, from a SimplifyQuery) instead? + const BasicBlock *ContextBB = ContextI->getParent(); + const BasicBlock *PredBB = ContextBB->getSinglePredecessor(); + if (!PredBB) + return None; + + // We need a conditional branch in the predecessor. + Value *PredCond; + BasicBlock *TrueBB, *FalseBB; + if (!match(PredBB->getTerminator(), m_Br(m_Value(PredCond), TrueBB, FalseBB))) + return None; + + // The branch should get simplified. Don't bother simplifying this condition. + if (TrueBB == FalseBB) + return None; + + assert((TrueBB == ContextBB || FalseBB == ContextBB) && + "Predecessor block does not point to successor?"); + + // Is this condition implied by the predecessor condition? + bool CondIsTrue = TrueBB == ContextBB; + return isImpliedCondition(PredCond, Cond, DL, CondIsTrue); +} diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index 38dca50e82a523b9b503337690e3161c9e234810..9ebb01684c8a5d4ab1568031872abe122569b92e 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -37,13 +37,19 @@ static cl::opt MaxInterleaveGroupFactor( cl::desc("Maximum factor for an interleaved access group (default = 8)"), cl::init(8)); -/// Identify if the intrinsic is trivially vectorizable. -/// This method returns true if the intrinsic's argument types are all -/// scalars for the scalar form of the intrinsic and all vectors for -/// the vector form of the intrinsic. +/// Return true if all of the intrinsic's arguments and return type are scalars +/// for the scalar form of the intrinsic and vectors for the vector form of the +/// intrinsic. bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { switch (ID) { - case Intrinsic::sqrt: + case Intrinsic::bswap: // Begin integer bit-manipulation. + case Intrinsic::bitreverse: + case Intrinsic::ctpop: + case Intrinsic::ctlz: + case Intrinsic::cttz: + case Intrinsic::fshl: + case Intrinsic::fshr: + case Intrinsic::sqrt: // Begin floating-point. case Intrinsic::sin: case Intrinsic::cos: case Intrinsic::exp: @@ -63,14 +69,9 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::rint: case Intrinsic::nearbyint: case Intrinsic::round: - case Intrinsic::bswap: - case Intrinsic::bitreverse: - case Intrinsic::ctpop: case Intrinsic::pow: case Intrinsic::fma: case Intrinsic::fmuladd: - case Intrinsic::ctlz: - case Intrinsic::cttz: case Intrinsic::powi: case Intrinsic::canonicalize: return true; @@ -504,8 +505,9 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef VL) { return Inst; } -Constant *llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF, - const InterleaveGroup &Group) { +Constant * +llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF, + const InterleaveGroup &Group) { // All 1's means mask is not needed. if (Group.getNumMembers() == Group.getFactor()) return nullptr; @@ -719,9 +721,9 @@ void InterleavedAccessInfo::analyzeInterleaving( collectDependences(); // Holds all interleaved store groups temporarily. - SmallSetVector StoreGroups; + SmallSetVector *, 4> StoreGroups; // Holds all interleaved load groups temporarily. - SmallSetVector LoadGroups; + SmallSetVector *, 4> LoadGroups; // Search in bottom-up program order for pairs of accesses (A and B) that can // form interleaved load or store groups. In the algorithm below, access A @@ -743,7 +745,7 @@ void InterleavedAccessInfo::analyzeInterleaving( // Initialize a group for B if it has an allowable stride. Even if we don't // create a group for B, we continue with the bottom-up algorithm to ensure // we don't break any of B's dependences. - InterleaveGroup *Group = nullptr; + InterleaveGroup *Group = nullptr; if (isStrided(DesB.Stride) && (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) { Group = getInterleaveGroup(B); @@ -788,7 +790,7 @@ void InterleavedAccessInfo::analyzeInterleaving( // illegal code motion. A will then be free to form another group with // instructions that precede it. if (isInterleaved(A)) { - InterleaveGroup *StoreGroup = getInterleaveGroup(A); + InterleaveGroup *StoreGroup = getInterleaveGroup(A); StoreGroups.remove(StoreGroup); releaseGroup(StoreGroup); } @@ -867,7 +869,7 @@ void InterleavedAccessInfo::analyzeInterleaving( } // Iteration over B accesses. // Remove interleaved store groups with gaps. - for (InterleaveGroup *Group : StoreGroups) + for (auto *Group : StoreGroups) if (Group->getNumMembers() != Group->getFactor()) { LLVM_DEBUG( dbgs() << "LV: Invalidate candidate interleaved store group due " @@ -888,7 +890,7 @@ void InterleavedAccessInfo::analyzeInterleaving( // This means that we can forcefully peel the loop in order to only have to // check the first pointer for no-wrap. When we'll change to use Assume=true // we'll only need at most one runtime check per interleaved group. - for (InterleaveGroup *Group : LoadGroups) { + for (auto *Group : LoadGroups) { // Case 1: A full group. Can Skip the checks; For full groups, if the wide // load would wrap around the address space we would do a memory access at // nullptr even without the transformation. @@ -946,9 +948,9 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() { return; // Avoid releasing a Group twice. - SmallPtrSet DelSet; + SmallPtrSet *, 4> DelSet; for (auto &I : InterleaveGroupMap) { - InterleaveGroup *Group = I.second; + InterleaveGroup *Group = I.second; if (Group->requiresScalarEpilogue()) DelSet.insert(Group); } @@ -963,3 +965,18 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() { RequiresScalarEpilogue = false; } + +template +void InterleaveGroup::addMetadata(InstT *NewInst) const { + llvm_unreachable("addMetadata can only be used for Instruction"); +} + +namespace llvm { +template <> +void InterleaveGroup::addMetadata(Instruction *NewInst) const { + SmallVector VL; + std::transform(Members.begin(), Members.end(), std::back_inserter(VL), + [](std::pair p) { return p.second; }); + propagateMetadata(NewInst, VL); +} +} diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index af4f43986ef0632c50c7284fa14f4ad9ee156925..eab7ec81953609cc050f7b7af47307ebac74d2ae 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -788,6 +788,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(info); KEYWORD(byte); KEYWORD(bit); + KEYWORD(varFlags); #undef KEYWORD @@ -823,6 +824,8 @@ lltok::Kind LLLexer::LexIdentifier() { } \ } while (false) + INSTKEYWORD(fneg, FNeg); + INSTKEYWORD(add, Add); INSTKEYWORD(fadd, FAdd); INSTKEYWORD(sub, Sub); INSTKEYWORD(fsub, FSub); INSTKEYWORD(mul, Mul); INSTKEYWORD(fmul, FMul); @@ -902,6 +905,11 @@ lltok::Kind LLLexer::LexIdentifier() { return lltok::DIFlag; } + if (Keyword.startswith("DISPFlag")) { + StrVal.assign(Keyword.begin(), Keyword.end()); + return lltok::DISPFlag; + } + if (Keyword.startswith("CSK_")) { StrVal.assign(Keyword.begin(), Keyword.end()); return lltok::ChecksumKind; diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 5fe1e125d48680db35067d3bf9e19275f65a991a..f887372060b5344a2ed11e442c788414573b2bee 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -3295,7 +3295,31 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { ID.Kind = ValID::t_Constant; return false; } - + + // Unary Operators. + case lltok::kw_fneg: { + unsigned Opc = Lex.getUIntVal(); + Constant *Val; + Lex.Lex(); + if (ParseToken(lltok::lparen, "expected '(' in unary constantexpr") || + ParseGlobalTypeAndValue(Val) || + ParseToken(lltok::rparen, "expected ')' in unary constantexpr")) + return true; + + // Check that the type is valid for the operator. + switch (Opc) { + case Instruction::FNeg: + if (!Val->getType()->isFPOrFPVectorTy()) + return Error(ID.Loc, "constexpr requires fp operands"); + break; + default: llvm_unreachable("Unknown unary operator!"); + } + unsigned Flags = 0; + Constant *C = ConstantExpr::get(Opc, Val, Flags); + ID.ConstantVal = C; + ID.Kind = ValID::t_Constant; + return false; + } // Binary Operators. case lltok::kw_add: case lltok::kw_fadd: @@ -3725,6 +3749,10 @@ struct DIFlagField : public MDFieldImpl { DIFlagField() : MDFieldImpl(DINode::FlagZero) {} }; +struct DISPFlagField : public MDFieldImpl { + DISPFlagField() : MDFieldImpl(DISubprogram::SPFlagZero) {} +}; + struct MDSignedField : public MDFieldImpl { int64_t Min; int64_t Max; @@ -4017,6 +4045,46 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DIFlagField &Result) { return false; } +/// DISPFlagField +/// ::= uint32 +/// ::= DISPFlagVector +/// ::= DISPFlagVector '|' DISPFlag* '|' uint32 +template <> +bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DISPFlagField &Result) { + + // Parser for a single flag. + auto parseFlag = [&](DISubprogram::DISPFlags &Val) { + if (Lex.getKind() == lltok::APSInt && !Lex.getAPSIntVal().isSigned()) { + uint32_t TempVal = static_cast(Val); + bool Res = ParseUInt32(TempVal); + Val = static_cast(TempVal); + return Res; + } + + if (Lex.getKind() != lltok::DISPFlag) + return TokError("expected debug info flag"); + + Val = DISubprogram::getFlag(Lex.getStrVal()); + if (!Val) + return TokError(Twine("invalid subprogram debug info flag '") + + Lex.getStrVal() + "'"); + Lex.Lex(); + return false; + }; + + // Parse the flags and combine them together. + DISubprogram::DISPFlags Combined = DISubprogram::SPFlagZero; + do { + DISubprogram::DISPFlags Val; + if (parseFlag(Val)) + return true; + Combined |= Val; + } while (EatIfPresent(lltok::bar)); + + Result.assign(Combined); + return false; +} + template <> bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDSignedField &Result) { @@ -4473,7 +4541,8 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { OPTIONAL(dwoId, MDUnsignedField, ); \ OPTIONAL(splitDebugInlining, MDBoolField, = true); \ OPTIONAL(debugInfoForProfiling, MDBoolField, = false); \ - OPTIONAL(nameTableKind, NameTableKindField, ); + OPTIONAL(nameTableKind, NameTableKindField, ); \ + OPTIONAL(debugBaseAddress, MDBoolField, = false); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS @@ -4481,7 +4550,8 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { Context, language.Val, file.Val, producer.Val, isOptimized.Val, flags.Val, runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val, retainedTypes.Val, globals.Val, imports.Val, macros.Val, dwoId.Val, - splitDebugInlining.Val, debugInfoForProfiling.Val, nameTableKind.Val); + splitDebugInlining.Val, debugInfoForProfiling.Val, nameTableKind.Val, + debugBaseAddress.Val); return false; } @@ -4491,8 +4561,8 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { /// isDefinition: true, scopeLine: 8, containingType: !3, /// virtuality: DW_VIRTUALTIY_pure_virtual, /// virtualIndex: 10, thisAdjustment: 4, flags: 11, -/// isOptimized: false, templateParams: !4, declaration: !5, -/// retainedNodes: !6, thrownTypes: !7) +/// spFlags: 10, isOptimized: false, templateParams: !4, +/// declaration: !5, retainedNodes: !6, thrownTypes: !7) bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) { auto Loc = Lex.getLoc(); #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ @@ -4510,26 +4580,31 @@ bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) { OPTIONAL(virtualIndex, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(thisAdjustment, MDSignedField, (0, INT32_MIN, INT32_MAX)); \ OPTIONAL(flags, DIFlagField, ); \ + OPTIONAL(spFlags, DISPFlagField, ); \ OPTIONAL(isOptimized, MDBoolField, ); \ OPTIONAL(unit, MDField, ); \ OPTIONAL(templateParams, MDField, ); \ OPTIONAL(declaration, MDField, ); \ - OPTIONAL(retainedNodes, MDField, ); \ + OPTIONAL(retainedNodes, MDField, ); \ OPTIONAL(thrownTypes, MDField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - if (isDefinition.Val && !IsDistinct) + // An explicit spFlags field takes precedence over individual fields in + // older IR versions. + DISubprogram::DISPFlags SPFlags = + spFlags.Seen ? spFlags.Val + : DISubprogram::toSPFlags(isLocal.Val, isDefinition.Val, + isOptimized.Val, virtuality.Val); + if ((SPFlags & DISubprogram::SPFlagDefinition) && !IsDistinct) return Lex.Error( Loc, - "missing 'distinct', required for !DISubprogram when 'isDefinition'"); - + "missing 'distinct', required for !DISubprogram that is a Definition"); Result = GET_OR_DISTINCT( DISubprogram, (Context, scope.Val, name.Val, linkageName.Val, file.Val, line.Val, - type.Val, isLocal.Val, isDefinition.Val, scopeLine.Val, - containingType.Val, virtuality.Val, virtualIndex.Val, thisAdjustment.Val, - flags.Val, isOptimized.Val, unit.Val, templateParams.Val, + type.Val, scopeLine.Val, containingType.Val, virtualIndex.Val, + thisAdjustment.Val, flags.Val, SPFlags, unit.Val, templateParams.Val, declaration.Val, retainedNodes.Val, thrownTypes.Val)); return false; } @@ -5492,6 +5567,16 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_catchswitch: return ParseCatchSwitch(Inst, PFS); case lltok::kw_catchpad: return ParseCatchPad(Inst, PFS); case lltok::kw_cleanuppad: return ParseCleanupPad(Inst, PFS); + // Unary Operators. + case lltok::kw_fneg: { + FastMathFlags FMF = EatFastMathFlagsIfPresent(); + int Res = ParseUnaryOp(Inst, PFS, KeywordVal, 2); + if (Res != 0) + return Res; + if (FMF.any()) + Inst->setFastMathFlags(FMF); + return false; + } // Binary Operators. case lltok::kw_add: case lltok::kw_sub: @@ -6063,6 +6148,43 @@ bool LLParser::ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS) { return false; } +//===----------------------------------------------------------------------===// +// Unary Operators. +//===----------------------------------------------------------------------===// + +/// ParseUnaryOp +/// ::= UnaryOp TypeAndValue ',' Value +/// +/// If OperandType is 0, then any FP or integer operand is allowed. If it is 1, +/// then any integer operand is allowed, if it is 2, any fp operand is allowed. +bool LLParser::ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, + unsigned Opc, unsigned OperandType) { + LocTy Loc; Value *LHS; + if (ParseTypeAndValue(LHS, Loc, PFS)) + return true; + + bool Valid; + switch (OperandType) { + default: llvm_unreachable("Unknown operand type!"); + case 0: // int or FP. + Valid = LHS->getType()->isIntOrIntVectorTy() || + LHS->getType()->isFPOrFPVectorTy(); + break; + case 1: + Valid = LHS->getType()->isIntOrIntVectorTy(); + break; + case 2: + Valid = LHS->getType()->isFPOrFPVectorTy(); + break; + } + + if (!Valid) + return Error(Loc, "invalid operand type for instruction"); + + Inst = UnaryOperator::Create((Instruction::UnaryOps)Opc, LHS); + return false; +} + //===----------------------------------------------------------------------===// // Binary Operators. //===----------------------------------------------------------------------===// @@ -7397,8 +7519,14 @@ bool LLParser::ParseArgs(std::vector &Args) { return false; } -static ValueInfo EmptyVI = - ValueInfo(false, (GlobalValueSummaryMapTy::value_type *)-8); +auto FwdVIRef = (GlobalValueSummaryMapTy::value_type *)-8; + +static void resolveFwdRef(ValueInfo *Fwd, ValueInfo &Resolved) { + bool ReadOnly = Fwd->isReadOnly(); + *Fwd = Resolved; + if (ReadOnly) + Fwd->setReadOnly(); +} /// Stores the given Name/GUID and associated summary into the Index. /// Also updates any forward references to the associated entry ID. @@ -7434,9 +7562,9 @@ void LLParser::AddGlobalValueToIndex( auto FwdRefVIs = ForwardRefValueInfos.find(ID); if (FwdRefVIs != ForwardRefValueInfos.end()) { for (auto VIRef : FwdRefVIs->second) { - assert(*VIRef.first == EmptyVI && + assert(VIRef.first->getRef() == FwdVIRef && "Forward referenced ValueInfo expected to be empty"); - *VIRef.first = VI; + resolveFwdRef(VIRef.first, VI); } ForwardRefValueInfos.erase(FwdRefVIs); } @@ -7599,8 +7727,8 @@ bool LLParser::ParseFunctionSummary(std::string Name, GlobalValue::GUID GUID, return true; auto FS = llvm::make_unique( - GVFlags, InstCount, FFlags, std::move(Refs), std::move(Calls), - std::move(TypeIdInfo.TypeTests), + GVFlags, InstCount, FFlags, /*EntryCount=*/0, std::move(Refs), + std::move(Calls), std::move(TypeIdInfo.TypeTests), std::move(TypeIdInfo.TypeTestAssumeVCalls), std::move(TypeIdInfo.TypeCheckedLoadVCalls), std::move(TypeIdInfo.TypeTestAssumeConstVCalls), @@ -7626,11 +7754,14 @@ bool LLParser::ParseVariableSummary(std::string Name, GlobalValue::GUID GUID, GlobalValueSummary::GVFlags GVFlags = GlobalValueSummary::GVFlags( /*Linkage=*/GlobalValue::ExternalLinkage, /*NotEligibleToImport=*/false, /*Live=*/false, /*IsLocal=*/false); + GlobalVarSummary::GVarFlags GVarFlags(/*ReadOnly*/ false); std::vector Refs; if (ParseToken(lltok::colon, "expected ':' here") || ParseToken(lltok::lparen, "expected '(' here") || ParseModuleReference(ModulePath) || - ParseToken(lltok::comma, "expected ',' here") || ParseGVFlags(GVFlags)) + ParseToken(lltok::comma, "expected ',' here") || ParseGVFlags(GVFlags) || + ParseToken(lltok::comma, "expected ',' here") || + ParseGVarFlags(GVarFlags)) return true; // Parse optional refs field @@ -7642,7 +7773,8 @@ bool LLParser::ParseVariableSummary(std::string Name, GlobalValue::GUID GUID, if (ParseToken(lltok::rparen, "expected ')' here")) return true; - auto GS = llvm::make_unique(GVFlags, std::move(Refs)); + auto GS = + llvm::make_unique(GVFlags, GVarFlags, std::move(Refs)); GS->setModulePath(ModulePath); @@ -7687,7 +7819,7 @@ bool LLParser::ParseAliasSummary(std::string Name, GlobalValue::GUID GUID, AS->setModulePath(ModulePath); // Record forward reference if the aliasee is not parsed yet. - if (AliaseeVI == EmptyVI) { + if (AliaseeVI.getRef() == FwdVIRef) { auto FwdRef = ForwardRefAliasees.insert( std::make_pair(GVId, std::vector>())); FwdRef.first->second.push_back(std::make_pair(AS.get(), Loc)); @@ -7809,7 +7941,7 @@ bool LLParser::ParseOptionalCalls(std::vector &Calls) { // Keep track of the Call array index needing a forward reference. // We will save the location of the ValueInfo needing an update, but // can only do so once the std::vector is finalized. - if (VI == EmptyVI) + if (VI.getRef() == FwdVIRef) IdToIndexMap[GVId].push_back(std::make_pair(Calls.size(), Loc)); Calls.push_back(FunctionSummary::EdgeTy{VI, CalleeInfo(Hotness, RelBF)}); @@ -7821,7 +7953,7 @@ bool LLParser::ParseOptionalCalls(std::vector &Calls) { // of any forward GV references that need updating later. for (auto I : IdToIndexMap) { for (auto P : I.second) { - assert(Calls[P.first].first == EmptyVI && + assert(Calls[P.first].first.getRef() == FwdVIRef && "Forward referenced ValueInfo expected to be empty"); auto FwdRef = ForwardRefValueInfos.insert(std::make_pair( I.first, std::vector>())); @@ -7872,28 +8004,42 @@ bool LLParser::ParseOptionalRefs(std::vector &Refs) { ParseToken(lltok::lparen, "expected '(' in refs")) return true; - IdToIndexMapType IdToIndexMap; - // Parse each ref edge - do { + struct ValueContext { ValueInfo VI; - LocTy Loc = Lex.getLoc(); unsigned GVId; - if (ParseGVReference(VI, GVId)) + LocTy Loc; + }; + std::vector VContexts; + // Parse each ref edge + do { + ValueContext VC; + VC.Loc = Lex.getLoc(); + if (ParseGVReference(VC.VI, VC.GVId)) return true; + VContexts.push_back(VC); + } while (EatIfPresent(lltok::comma)); + + // Sort value contexts so that ones with readonly ValueInfo are at the end + // of VContexts vector. This is needed to match immutableRefCount() behavior. + llvm::sort(VContexts, [](const ValueContext &VC1, const ValueContext &VC2) { + return VC1.VI.isReadOnly() < VC2.VI.isReadOnly(); + }); + IdToIndexMapType IdToIndexMap; + for (auto &VC : VContexts) { // Keep track of the Refs array index needing a forward reference. // We will save the location of the ValueInfo needing an update, but // can only do so once the std::vector is finalized. - if (VI == EmptyVI) - IdToIndexMap[GVId].push_back(std::make_pair(Refs.size(), Loc)); - Refs.push_back(VI); - } while (EatIfPresent(lltok::comma)); + if (VC.VI.getRef() == FwdVIRef) + IdToIndexMap[VC.GVId].push_back(std::make_pair(Refs.size(), VC.Loc)); + Refs.push_back(VC.VI); + } // Now that the Refs vector is finalized, it is safe to save the locations // of any forward GV references that need updating later. for (auto I : IdToIndexMap) { for (auto P : I.second) { - assert(Refs[P.first] == EmptyVI && + assert(Refs[P.first].getRef() == FwdVIRef && "Forward referenced ValueInfo expected to be empty"); auto FwdRef = ForwardRefValueInfos.insert(std::make_pair( I.first, std::vector>())); @@ -8179,6 +8325,27 @@ bool LLParser::ParseGVFlags(GlobalValueSummary::GVFlags &GVFlags) { return false; } +/// GVarFlags +/// ::= 'varFlags' ':' '(' 'readonly' ':' Flag ')' +bool LLParser::ParseGVarFlags(GlobalVarSummary::GVarFlags &GVarFlags) { + assert(Lex.getKind() == lltok::kw_varFlags); + Lex.Lex(); + + unsigned Flag; + if (ParseToken(lltok::colon, "expected ':' here") || + ParseToken(lltok::lparen, "expected '(' here") || + ParseToken(lltok::kw_readonly, "expected 'readonly' here") || + ParseToken(lltok::colon, "expected ':' here")) + return true; + + ParseFlag(Flag); + GVarFlags.ReadOnly = Flag; + + if (ParseToken(lltok::rparen, "expected ')' here")) + return true; + return false; +} + /// ModuleReference /// ::= 'module' ':' UInt bool LLParser::ParseModuleReference(StringRef &ModulePath) { @@ -8199,18 +8366,20 @@ bool LLParser::ParseModuleReference(StringRef &ModulePath) { /// GVReference /// ::= SummaryID bool LLParser::ParseGVReference(ValueInfo &VI, unsigned &GVId) { + bool ReadOnly = EatIfPresent(lltok::kw_readonly); if (ParseToken(lltok::SummaryID, "expected GV ID")) return true; GVId = Lex.getUIntVal(); - // Check if we already have a VI for this GV if (GVId < NumberedValueInfos.size()) { - assert(NumberedValueInfos[GVId] != EmptyVI); + assert(NumberedValueInfos[GVId].getRef() != FwdVIRef); VI = NumberedValueInfos[GVId]; } else // We will create a forward reference to the stored location. - VI = EmptyVI; + VI = ValueInfo(false, FwdVIRef); + if (ReadOnly) + VI.setReadOnly(); return false; } diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h index cec1a8e8f7eb6a27df315fa564fa9896dca2c40b..5a0fc297265d4d1dc649bc31084de1f1dcbdd3f9 100644 --- a/lib/AsmParser/LLParser.h +++ b/lib/AsmParser/LLParser.h @@ -352,6 +352,7 @@ namespace llvm { bool ParseVariableSummary(std::string Name, GlobalValue::GUID, unsigned ID); bool ParseAliasSummary(std::string Name, GlobalValue::GUID, unsigned ID); bool ParseGVFlags(GlobalValueSummary::GVFlags &GVFlags); + bool ParseGVarFlags(GlobalVarSummary::GVarFlags &GVarFlags); bool ParseOptionalFFlags(FunctionSummary::FFlags &FFlags); bool ParseOptionalCalls(std::vector &Calls); bool ParseHotness(CalleeInfo::HotnessType &Hotness); @@ -571,6 +572,8 @@ namespace llvm { bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS); bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS); + bool ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc, + unsigned OperandType); bool ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc, unsigned OperandType); bool ParseLogical(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index f8f5955a16c89efd3281990f868f7e9ad847e41b..c2e2795a9467be6c4026aa5ea29091dd3c242163 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -270,6 +270,7 @@ enum Kind { kw_umin, // Instruction Opcodes (Opcode in UIntVal). + kw_fneg, kw_add, kw_fadd, kw_sub, @@ -417,6 +418,7 @@ enum Kind { kw_info, kw_byte, kw_bit, + kw_varFlags, // Unsigned Valued tokens (UIntVal). GlobalID, // @42 @@ -440,6 +442,7 @@ enum Kind { NameTableKind, // GNU DwarfOp, // DW_OP_foo DIFlag, // DIFlagFoo + DISPFlag, // DISPFlagFoo DwarfMacinfo, // DW_MACINFO_foo ChecksumKind, // CSK_foo diff --git a/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b789f646b5f6f920880ff76c912d1f31c8730987 --- /dev/null +++ b/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -0,0 +1,324 @@ +//===- AMDGPUMetadataVerifier.cpp - MsgPack Types ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Implements a verifier for AMDGPU HSA metadata. +// +//===----------------------------------------------------------------------===// + +#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h" +#include "llvm/Support/AMDGPUMetadata.h" + +namespace llvm { +namespace AMDGPU { +namespace HSAMD { +namespace V3 { + +bool MetadataVerifier::verifyScalar( + msgpack::Node &Node, msgpack::ScalarNode::ScalarKind SKind, + function_ref verifyValue) { + auto ScalarPtr = dyn_cast(&Node); + if (!ScalarPtr) + return false; + auto &Scalar = *ScalarPtr; + // Do not output extraneous tags for types we know from the spec. + Scalar.IgnoreTag = true; + if (Scalar.getScalarKind() != SKind) { + if (Strict) + return false; + // If we are not strict, we interpret string values as "implicitly typed" + // and attempt to coerce them to the expected type here. + if (Scalar.getScalarKind() != msgpack::ScalarNode::SK_String) + return false; + std::string StringValue = Scalar.getString(); + Scalar.setScalarKind(SKind); + if (Scalar.inputYAML(StringValue) != StringRef()) + return false; + } + if (verifyValue) + return verifyValue(Scalar); + return true; +} + +bool MetadataVerifier::verifyInteger(msgpack::Node &Node) { + if (!verifyScalar(Node, msgpack::ScalarNode::SK_UInt)) + if (!verifyScalar(Node, msgpack::ScalarNode::SK_Int)) + return false; + return true; +} + +bool MetadataVerifier::verifyArray( + msgpack::Node &Node, function_ref verifyNode, + Optional Size) { + auto ArrayPtr = dyn_cast(&Node); + if (!ArrayPtr) + return false; + auto &Array = *ArrayPtr; + if (Size && Array.size() != *Size) + return false; + for (auto &Item : Array) + if (!verifyNode(*Item.get())) + return false; + + return true; +} + +bool MetadataVerifier::verifyEntry( + msgpack::MapNode &MapNode, StringRef Key, bool Required, + function_ref verifyNode) { + auto Entry = MapNode.find(Key); + if (Entry == MapNode.end()) + return !Required; + return verifyNode(*Entry->second.get()); +} + +bool MetadataVerifier::verifyScalarEntry( + msgpack::MapNode &MapNode, StringRef Key, bool Required, + msgpack::ScalarNode::ScalarKind SKind, + function_ref verifyValue) { + return verifyEntry(MapNode, Key, Required, [=](msgpack::Node &Node) { + return verifyScalar(Node, SKind, verifyValue); + }); +} + +bool MetadataVerifier::verifyIntegerEntry(msgpack::MapNode &MapNode, + StringRef Key, bool Required) { + return verifyEntry(MapNode, Key, Required, [this](msgpack::Node &Node) { + return verifyInteger(Node); + }); +} + +bool MetadataVerifier::verifyKernelArgs(msgpack::Node &Node) { + auto ArgsMapPtr = dyn_cast(&Node); + if (!ArgsMapPtr) + return false; + auto &ArgsMap = *ArgsMapPtr; + + if (!verifyScalarEntry(ArgsMap, ".name", false, + msgpack::ScalarNode::SK_String)) + return false; + if (!verifyScalarEntry(ArgsMap, ".type_name", false, + msgpack::ScalarNode::SK_String)) + return false; + if (!verifyIntegerEntry(ArgsMap, ".size", true)) + return false; + if (!verifyIntegerEntry(ArgsMap, ".offset", true)) + return false; + if (!verifyScalarEntry(ArgsMap, ".value_kind", true, + msgpack::ScalarNode::SK_String, + [](msgpack::ScalarNode &SNode) { + return StringSwitch(SNode.getString()) + .Case("by_value", true) + .Case("global_buffer", true) + .Case("dynamic_shared_pointer", true) + .Case("sampler", true) + .Case("image", true) + .Case("pipe", true) + .Case("queue", true) + .Case("hidden_global_offset_x", true) + .Case("hidden_global_offset_y", true) + .Case("hidden_global_offset_z", true) + .Case("hidden_none", true) + .Case("hidden_printf_buffer", true) + .Case("hidden_default_queue", true) + .Case("hidden_completion_action", true) + .Default(false); + })) + return false; + if (!verifyScalarEntry(ArgsMap, ".value_type", true, + msgpack::ScalarNode::SK_String, + [](msgpack::ScalarNode &SNode) { + return StringSwitch(SNode.getString()) + .Case("struct", true) + .Case("i8", true) + .Case("u8", true) + .Case("i16", true) + .Case("u16", true) + .Case("f16", true) + .Case("i32", true) + .Case("u32", true) + .Case("f32", true) + .Case("i64", true) + .Case("u64", true) + .Case("f64", true) + .Default(false); + })) + return false; + if (!verifyIntegerEntry(ArgsMap, ".pointee_align", false)) + return false; + if (!verifyScalarEntry(ArgsMap, ".address_space", false, + msgpack::ScalarNode::SK_String, + [](msgpack::ScalarNode &SNode) { + return StringSwitch(SNode.getString()) + .Case("private", true) + .Case("global", true) + .Case("constant", true) + .Case("local", true) + .Case("generic", true) + .Case("region", true) + .Default(false); + })) + return false; + if (!verifyScalarEntry(ArgsMap, ".access", false, + msgpack::ScalarNode::SK_String, + [](msgpack::ScalarNode &SNode) { + return StringSwitch(SNode.getString()) + .Case("read_only", true) + .Case("write_only", true) + .Case("read_write", true) + .Default(false); + })) + return false; + if (!verifyScalarEntry(ArgsMap, ".actual_access", false, + msgpack::ScalarNode::SK_String, + [](msgpack::ScalarNode &SNode) { + return StringSwitch(SNode.getString()) + .Case("read_only", true) + .Case("write_only", true) + .Case("read_write", true) + .Default(false); + })) + return false; + if (!verifyScalarEntry(ArgsMap, ".is_const", false, + msgpack::ScalarNode::SK_Boolean)) + return false; + if (!verifyScalarEntry(ArgsMap, ".is_restrict", false, + msgpack::ScalarNode::SK_Boolean)) + return false; + if (!verifyScalarEntry(ArgsMap, ".is_volatile", false, + msgpack::ScalarNode::SK_Boolean)) + return false; + if (!verifyScalarEntry(ArgsMap, ".is_pipe", false, + msgpack::ScalarNode::SK_Boolean)) + return false; + + return true; +} + +bool MetadataVerifier::verifyKernel(msgpack::Node &Node) { + auto KernelMapPtr = dyn_cast(&Node); + if (!KernelMapPtr) + return false; + auto &KernelMap = *KernelMapPtr; + + if (!verifyScalarEntry(KernelMap, ".name", true, + msgpack::ScalarNode::SK_String)) + return false; + if (!verifyScalarEntry(KernelMap, ".symbol", true, + msgpack::ScalarNode::SK_String)) + return false; + if (!verifyScalarEntry(KernelMap, ".language", false, + msgpack::ScalarNode::SK_String, + [](msgpack::ScalarNode &SNode) { + return StringSwitch(SNode.getString()) + .Case("OpenCL C", true) + .Case("OpenCL C++", true) + .Case("HCC", true) + .Case("HIP", true) + .Case("OpenMP", true) + .Case("Assembler", true) + .Default(false); + })) + return false; + if (!verifyEntry( + KernelMap, ".language_version", false, [this](msgpack::Node &Node) { + return verifyArray( + Node, + [this](msgpack::Node &Node) { return verifyInteger(Node); }, 2); + })) + return false; + if (!verifyEntry(KernelMap, ".args", false, [this](msgpack::Node &Node) { + return verifyArray(Node, [this](msgpack::Node &Node) { + return verifyKernelArgs(Node); + }); + })) + return false; + if (!verifyEntry(KernelMap, ".reqd_workgroup_size", false, + [this](msgpack::Node &Node) { + return verifyArray(Node, + [this](msgpack::Node &Node) { + return verifyInteger(Node); + }, + 3); + })) + return false; + if (!verifyEntry(KernelMap, ".workgroup_size_hint", false, + [this](msgpack::Node &Node) { + return verifyArray(Node, + [this](msgpack::Node &Node) { + return verifyInteger(Node); + }, + 3); + })) + return false; + if (!verifyScalarEntry(KernelMap, ".vec_type_hint", false, + msgpack::ScalarNode::SK_String)) + return false; + if (!verifyScalarEntry(KernelMap, ".device_enqueue_symbol", false, + msgpack::ScalarNode::SK_String)) + return false; + if (!verifyIntegerEntry(KernelMap, ".kernarg_segment_size", true)) + return false; + if (!verifyIntegerEntry(KernelMap, ".group_segment_fixed_size", true)) + return false; + if (!verifyIntegerEntry(KernelMap, ".private_segment_fixed_size", true)) + return false; + if (!verifyIntegerEntry(KernelMap, ".kernarg_segment_align", true)) + return false; + if (!verifyIntegerEntry(KernelMap, ".wavefront_size", true)) + return false; + if (!verifyIntegerEntry(KernelMap, ".sgpr_count", true)) + return false; + if (!verifyIntegerEntry(KernelMap, ".vgpr_count", true)) + return false; + if (!verifyIntegerEntry(KernelMap, ".max_flat_workgroup_size", true)) + return false; + if (!verifyIntegerEntry(KernelMap, ".sgpr_spill_count", false)) + return false; + if (!verifyIntegerEntry(KernelMap, ".vgpr_spill_count", false)) + return false; + + return true; +} + +bool MetadataVerifier::verify(msgpack::Node &HSAMetadataRoot) { + auto RootMapPtr = dyn_cast(&HSAMetadataRoot); + if (!RootMapPtr) + return false; + auto &RootMap = *RootMapPtr; + + if (!verifyEntry( + RootMap, "amdhsa.version", true, [this](msgpack::Node &Node) { + return verifyArray( + Node, + [this](msgpack::Node &Node) { return verifyInteger(Node); }, 2); + })) + return false; + if (!verifyEntry( + RootMap, "amdhsa.printf", false, [this](msgpack::Node &Node) { + return verifyArray(Node, [this](msgpack::Node &Node) { + return verifyScalar(Node, msgpack::ScalarNode::SK_String); + }); + })) + return false; + if (!verifyEntry(RootMap, "amdhsa.kernels", true, + [this](msgpack::Node &Node) { + return verifyArray(Node, [this](msgpack::Node &Node) { + return verifyKernel(Node); + }); + })) + return false; + + return true; +} + +} // end namespace V3 +} // end namespace HSAMD +} // end namespace AMDGPU +} // end namespace llvm diff --git a/lib/BinaryFormat/CMakeLists.txt b/lib/BinaryFormat/CMakeLists.txt index 93bb8b07763f0541310b89585569df2e7738ac5d..d645279d0ac59e547acd7ea5f9a82be0502f997b 100644 --- a/lib/BinaryFormat/CMakeLists.txt +++ b/lib/BinaryFormat/CMakeLists.txt @@ -1,7 +1,9 @@ add_llvm_library(LLVMBinaryFormat + AMDGPUMetadataVerifier.cpp Dwarf.cpp Magic.cpp MsgPackReader.cpp + MsgPackTypes.cpp MsgPackWriter.cpp Wasm.cpp diff --git a/lib/BinaryFormat/Dwarf.cpp b/lib/BinaryFormat/Dwarf.cpp index 5984de73ae633d94a8b1646db850c6c300ee1a3a..fe8ce2bd7aa36558d586415516f86abc7066517a 100644 --- a/lib/BinaryFormat/Dwarf.cpp +++ b/lib/BinaryFormat/Dwarf.cpp @@ -13,6 +13,7 @@ #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -455,14 +456,32 @@ StringRef llvm::dwarf::RangeListEncodingString(unsigned Encoding) { } } -StringRef llvm::dwarf::CallFrameString(unsigned Encoding) { +StringRef llvm::dwarf::CallFrameString(unsigned Encoding, + Triple::ArchType Arch) { + assert(Arch != llvm::Triple::ArchType::UnknownArch); +#define SELECT_AARCH64 (Arch == llvm::Triple::aarch64_be || Arch == llvm::Triple::aarch64) +#define SELECT_MIPS64 Arch == llvm::Triple::mips64 +#define SELECT_SPARC (Arch == llvm::Triple::sparc || Arch == llvm::Triple::sparcv9) +#define SELECT_X86 (Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64) +#define HANDLE_DW_CFA(ID, NAME) +#define HANDLE_DW_CFA_PRED(ID, NAME, PRED) \ + if (ID == Encoding && PRED) \ + return "DW_CFA_" #NAME; +#include "llvm/BinaryFormat/Dwarf.def" + switch (Encoding) { default: return StringRef(); +#define HANDLE_DW_CFA_PRED(ID, NAME, PRED) #define HANDLE_DW_CFA(ID, NAME) \ case DW_CFA_##NAME: \ return "DW_CFA_" #NAME; #include "llvm/BinaryFormat/Dwarf.def" + +#undef SELECT_X86 +#undef SELECT_SPARC +#undef SELECT_MIPS64 +#undef SELECT_AARCH64 } } diff --git a/lib/BinaryFormat/Magic.cpp b/lib/BinaryFormat/Magic.cpp index 5a339583fca1a78f8c2dcd00db214ce607688b20..78efa6ec87be8c614d22a565c47eae296466e31b 100644 --- a/lib/BinaryFormat/Magic.cpp +++ b/lib/BinaryFormat/Magic.cpp @@ -206,7 +206,7 @@ file_magic llvm::identify_magic(StringRef Magic) { } std::error_code llvm::identify_magic(const Twine &Path, file_magic &Result) { - auto FileOrError = MemoryBuffer::getFile(Path); + auto FileOrError = MemoryBuffer::getFile(Path, -1LL, false); if (!FileOrError) return FileOrError.getError(); diff --git a/lib/BinaryFormat/MsgPackTypes.cpp b/lib/BinaryFormat/MsgPackTypes.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4a8f70b10fb83e4c32cc01996fe1f2fc080dbf31 --- /dev/null +++ b/lib/BinaryFormat/MsgPackTypes.cpp @@ -0,0 +1,303 @@ +//===- MsgPackTypes.cpp - MsgPack Types -------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Implementation of types representing MessagePack "documents". +// +//===----------------------------------------------------------------------===// + +#include "llvm/BinaryFormat/MsgPackTypes.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace msgpack; + +namespace llvm { +namespace msgpack { +void ScalarNode::anchor() {} +void ArrayNode::anchor() {} +void MapNode::anchor() {} +} +} + +Expected Node::readArray(Reader &MPReader, size_t Length) { + auto A = std::make_shared(); + for (size_t I = 0; I < Length; ++I) { + auto OptNodeOrErr = Node::read(MPReader); + if (auto Err = OptNodeOrErr.takeError()) + return std::move(Err); + if (!*OptNodeOrErr) + return make_error( + "Insufficient array elements", + std::make_error_code(std::errc::invalid_argument)); + A->push_back(std::move(**OptNodeOrErr)); + } + return OptNodePtr(std::move(A)); +} + +Expected Node::readMap(Reader &MPReader, size_t Length) { + auto M = std::make_shared(); + for (size_t I = 0; I < Length; ++I) { + auto OptKeyOrErr = Node::read(MPReader); + if (auto Err = OptKeyOrErr.takeError()) + return std::move(Err); + if (!*OptKeyOrErr) + return make_error( + "Insufficient map elements", + std::make_error_code(std::errc::invalid_argument)); + auto OptValOrErr = Node::read(MPReader); + if (auto Err = OptValOrErr.takeError()) + return std::move(Err); + if (!*OptValOrErr) + return make_error( + "Insufficient map elements", + std::make_error_code(std::errc::invalid_argument)); + auto *Key = dyn_cast((*OptKeyOrErr)->get()); + if (!Key) + return make_error( + "Only string map keys are supported", + std::make_error_code(std::errc::invalid_argument)); + if (Key->getScalarKind() != ScalarNode::SK_String) + return make_error( + "Only string map keys are supported", + std::make_error_code(std::errc::invalid_argument)); + M->try_emplace(Key->getString(), std::move(**OptValOrErr)); + } + return OptNodePtr(std::move(M)); +} + +Expected Node::read(Reader &MPReader) { + Object Obj; + + auto ContinueOrErr = MPReader.read(Obj); + if (auto Err = ContinueOrErr.takeError()) + return std::move(Err); + if (!*ContinueOrErr) + return None; + + switch (Obj.Kind) { + case Type::Int: + return OptNodePtr(std::make_shared(Obj.Int)); + case Type::UInt: + return OptNodePtr(std::make_shared(Obj.UInt)); + case Type::Nil: + return OptNodePtr(std::make_shared()); + case Type::Boolean: + return OptNodePtr(std::make_shared(Obj.Bool)); + case Type::Float: + return OptNodePtr(std::make_shared(Obj.Float)); + case Type::String: + return OptNodePtr(std::make_shared(Obj.Raw)); + case Type::Binary: + return OptNodePtr(std::make_shared(Obj.Raw)); + case Type::Array: + return Node::readArray(MPReader, Obj.Length); + case Type::Map: + return Node::readMap(MPReader, Obj.Length); + case Type::Extension: + return make_error( + "Extension types are not supported", + std::make_error_code(std::errc::invalid_argument)); + } + llvm_unreachable("msgpack::Type not handled"); +} + +void ScalarNode::destroy() { + switch (SKind) { + case SK_String: + case SK_Binary: + StringValue.~basic_string(); + break; + default: + // POD types do not require destruction + break; + } +} + +ScalarNode::ScalarNode(int64_t IntValue) + : Node(NK_Scalar), SKind(SK_Int), IntValue(IntValue) {} + +ScalarNode::ScalarNode(int32_t IntValue) + : ScalarNode(static_cast(IntValue)) {} + +ScalarNode::ScalarNode(uint64_t UIntValue) + : Node(NK_Scalar), SKind(SK_UInt), UIntValue(UIntValue) {} + +ScalarNode::ScalarNode(uint32_t IntValue) + : ScalarNode(static_cast(IntValue)) {} + +ScalarNode::ScalarNode() : Node(NK_Scalar), SKind(SK_Nil) {} + +ScalarNode::ScalarNode(bool BoolValue) + : Node(NK_Scalar), SKind(SK_Boolean), BoolValue(BoolValue) {} + +ScalarNode::ScalarNode(double FloatValue) + : Node(NK_Scalar), SKind(SK_Float), BoolValue(FloatValue) {} + +ScalarNode::ScalarNode(StringRef StringValue) + : Node(NK_Scalar), SKind(SK_String) { + new (&this->StringValue) std::string(StringValue); +} + +ScalarNode::ScalarNode(const char *StringValue) + : ScalarNode(StringRef(StringValue)) {} + +ScalarNode::ScalarNode(std::string &&StringValue) + : Node(NK_Scalar), SKind(SK_String) { + new (&this->StringValue) std::string(StringValue); +} + +ScalarNode::ScalarNode(MemoryBufferRef BinaryValue) + : Node(NK_Scalar), SKind(SK_Binary) { + new (&StringValue) std::string(BinaryValue.getBuffer()); +} + +ScalarNode::~ScalarNode() { destroy(); } + +ScalarNode &ScalarNode::operator=(ScalarNode &&RHS) { + destroy(); + switch (SKind = RHS.SKind) { + case SK_Int: + IntValue = RHS.IntValue; + break; + case SK_UInt: + UIntValue = RHS.UIntValue; + break; + case SK_Boolean: + BoolValue = RHS.BoolValue; + break; + case SK_Float: + FloatValue = RHS.FloatValue; + break; + case SK_String: + case SK_Binary: + new (&StringValue) std::string(std::move(RHS.StringValue)); + break; + case SK_Nil: + // pass + break; + } + return *this; +} + +StringRef ScalarNode::inputYAML(StringRef ScalarStr) { + switch (SKind) { + case SK_Int: + return yaml::ScalarTraits::input(ScalarStr, nullptr, IntValue); + case SK_UInt: + return yaml::ScalarTraits::input(ScalarStr, nullptr, UIntValue); + case SK_Nil: + return StringRef(); + case SK_Boolean: + return yaml::ScalarTraits::input(ScalarStr, nullptr, BoolValue); + case SK_Float: + return yaml::ScalarTraits::input(ScalarStr, nullptr, FloatValue); + case SK_Binary: + case SK_String: + return yaml::ScalarTraits::input(ScalarStr, nullptr, + StringValue); + } + llvm_unreachable("unrecognized ScalarKind"); +} + +void ScalarNode::outputYAML(raw_ostream &OS) const { + switch (SKind) { + case SK_Int: + yaml::ScalarTraits::output(IntValue, nullptr, OS); + break; + case SK_UInt: + yaml::ScalarTraits::output(UIntValue, nullptr, OS); + break; + case SK_Nil: + yaml::ScalarTraits::output("", nullptr, OS); + break; + case SK_Boolean: + yaml::ScalarTraits::output(BoolValue, nullptr, OS); + break; + case SK_Float: + yaml::ScalarTraits::output(FloatValue, nullptr, OS); + break; + case SK_Binary: + case SK_String: + yaml::ScalarTraits::output(StringValue, nullptr, OS); + break; + } +} + +yaml::QuotingType ScalarNode::mustQuoteYAML(StringRef ScalarStr) const { + switch (SKind) { + case SK_Int: + return yaml::ScalarTraits::mustQuote(ScalarStr); + case SK_UInt: + return yaml::ScalarTraits::mustQuote(ScalarStr); + case SK_Nil: + return yaml::ScalarTraits::mustQuote(ScalarStr); + case SK_Boolean: + return yaml::ScalarTraits::mustQuote(ScalarStr); + case SK_Float: + return yaml::ScalarTraits::mustQuote(ScalarStr); + case SK_Binary: + case SK_String: + return yaml::ScalarTraits::mustQuote(ScalarStr); + } + llvm_unreachable("unrecognized ScalarKind"); +} + +const char *ScalarNode::IntTag = "!int"; +const char *ScalarNode::NilTag = "!nil"; +const char *ScalarNode::BooleanTag = "!bool"; +const char *ScalarNode::FloatTag = "!float"; +const char *ScalarNode::StringTag = "!str"; +const char *ScalarNode::BinaryTag = "!bin"; + +StringRef ScalarNode::getYAMLTag() const { + switch (SKind) { + case SK_Int: + return IntTag; + case SK_UInt: + return IntTag; + case SK_Nil: + return NilTag; + case SK_Boolean: + return BooleanTag; + case SK_Float: + return FloatTag; + case SK_String: + return StringTag; + case SK_Binary: + return BinaryTag; + } + llvm_unreachable("unrecognized ScalarKind"); +} + +void ScalarNode::write(Writer &MPWriter) { + switch (SKind) { + case SK_Int: + MPWriter.write(IntValue); + break; + case SK_UInt: + MPWriter.write(UIntValue); + break; + case SK_Nil: + MPWriter.writeNil(); + break; + case SK_Boolean: + MPWriter.write(BoolValue); + break; + case SK_Float: + MPWriter.write(FloatValue); + break; + case SK_String: + MPWriter.write(StringValue); + break; + case SK_Binary: + MPWriter.write(MemoryBufferRef(StringValue, "")); + break; + } +} diff --git a/lib/BinaryFormat/Wasm.cpp b/lib/BinaryFormat/Wasm.cpp index ba8de34a6d22efef1c0ee838c0c362c99da9ad13..94d40bf02a390c0438d7ec2e1f72375c91964a6e 100644 --- a/lib/BinaryFormat/Wasm.cpp +++ b/lib/BinaryFormat/Wasm.cpp @@ -19,6 +19,8 @@ std::string llvm::wasm::toString(wasm::WasmSymbolType type) { return "WASM_SYMBOL_TYPE_DATA"; case wasm::WASM_SYMBOL_TYPE_SECTION: return "WASM_SYMBOL_TYPE_SECTION"; + case wasm::WASM_SYMBOL_TYPE_EVENT: + return "WASM_SYMBOL_TYPE_EVENT"; } llvm_unreachable("unknown symbol type"); } diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 56e05f8f085564153c86aa9c93dfb91eb5ca3ddb..846ce3a4f7aa72042cf5d80022f9b626c7ab074a 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -898,6 +898,11 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags, return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local); } +// Decode the flags for GlobalVariable in the summary +static GlobalVarSummary::GVarFlags getDecodedGVarFlags(uint64_t RawFlags) { + return GlobalVarSummary::GVarFlags((RawFlags & 0x1) ? true : false); +} + static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) { switch (Val) { default: // Map unknown visibilities to default. @@ -964,6 +969,20 @@ static int getDecodedCastOpcode(unsigned Val) { } } +static int getDecodedUnaryOpcode(unsigned Val, Type *Ty) { + bool IsFP = Ty->isFPOrFPVectorTy(); + // UnOps are only valid for int/fp or vector of int/fp types + if (!IsFP && !Ty->isIntOrIntVectorTy()) + return -1; + + switch (Val) { + default: + return -1; + case bitc::UNOP_NEG: + return IsFP ? Instruction::FNeg : -1; + } +} + static int getDecodedBinaryOpcode(unsigned Val, Type *Ty) { bool IsFP = Ty->isFPOrFPVectorTy(); // BinOps are only valid for int/fp or vector of int/fp types @@ -2317,6 +2336,19 @@ Error BitcodeReader::parseConstants() { } break; } + case bitc::CST_CODE_CE_UNOP: { // CE_UNOP: [opcode, opval] + if (Record.size() < 2) + return error("Invalid record"); + int Opc = getDecodedUnaryOpcode(Record[0], CurTy); + if (Opc < 0) { + V = UndefValue::get(CurTy); // Unknown unop. + } else { + Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy); + unsigned Flags = 0; + V = ConstantExpr::get(Opc, LHS, Flags); + } + break; + } case bitc::CST_CODE_CE_BINOP: { // CE_BINOP: [opcode, opval, opval] if (Record.size() < 3) return error("Invalid record"); @@ -3521,12 +3553,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) { MDNode *Scope = nullptr, *IA = nullptr; if (ScopeID) { - Scope = MDLoader->getMDNodeFwdRefOrNull(ScopeID - 1); + Scope = dyn_cast_or_null( + MDLoader->getMetadataFwdRefOrLoad(ScopeID - 1)); if (!Scope) return error("Invalid record"); } if (IAID) { - IA = MDLoader->getMDNodeFwdRefOrNull(IAID - 1); + IA = dyn_cast_or_null( + MDLoader->getMetadataFwdRefOrLoad(IAID - 1)); if (!IA) return error("Invalid record"); } @@ -3535,7 +3569,27 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = nullptr; continue; } + case bitc::FUNC_CODE_INST_UNOP: { // UNOP: [opval, ty, opcode] + unsigned OpNum = 0; + Value *LHS; + if (getValueTypePair(Record, OpNum, NextValueNo, LHS) || + OpNum+1 > Record.size()) + return error("Invalid record"); + int Opc = getDecodedUnaryOpcode(Record[OpNum++], LHS->getType()); + if (Opc == -1) + return error("Invalid record"); + I = UnaryOperator::Create((Instruction::UnaryOps)Opc, LHS); + InstructionList.push_back(I); + if (OpNum < Record.size()) { + if (isa(I)) { + FastMathFlags FMF = getDecodedFastMathFlags(Record[OpNum]); + if (FMF.any()) + I->setFastMathFlags(FMF); + } + } + break; + } case bitc::FUNC_CODE_INST_BINOP: { // BINOP: [opval, ty, opval, opcode] unsigned OpNum = 0; Value *LHS, *RHS; @@ -4864,7 +4918,7 @@ void ModuleSummaryIndexBitcodeReader::setValueGUID( ValueIdToValueInfoMap[ValueID] = std::make_pair( TheIndex.getOrInsertValueInfo( ValueGUID, - UseStrtab ? ValueName : TheIndex.saveString(ValueName.str())), + UseStrtab ? ValueName : TheIndex.saveString(ValueName)), OriginalNameID); } @@ -5170,6 +5224,12 @@ static void parseTypeIdSummaryRecord(ArrayRef Record, parseWholeProgramDevirtResolution(Record, Strtab, Slot, TypeId); } +static void setImmutableRefs(std::vector &Refs, unsigned Count) { + // Read-only refs are in the end of the refs list. + for (unsigned RefNo = Refs.size() - Count; RefNo < Refs.size(); ++RefNo) + Refs[RefNo].setReadOnly(); +} + // Eagerly parse the entire summary block. This populates the GlobalValueSummary // objects in the index. Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { @@ -5187,9 +5247,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { } const uint64_t Version = Record[0]; const bool IsOldProfileFormat = Version == 1; - if (Version < 1 || Version > 4) + if (Version < 1 || Version > 6) return error("Invalid summary version " + Twine(Version) + - ", 1, 2, 3 or 4 expected"); + ". Version should be in the range [1-6]."); Record.clear(); // Keep around the last seen summary to be used when we see an optional @@ -5243,6 +5303,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { // 1 bit: SkipModuleByDistributedBackend flag. if (Flags & 0x2) TheIndex.setSkipModuleByDistributedBackend(); + // 1 bit: HasSyntheticEntryCounts flag. + if (Flags & 0x4) + TheIndex.setHasSyntheticEntryCounts(); break; } case bitc::FS_VALUE_GUID: { // [valueid, refguid] @@ -5268,11 +5331,16 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { unsigned InstCount = Record[2]; uint64_t RawFunFlags = 0; unsigned NumRefs = Record[3]; + unsigned NumImmutableRefs = 0; int RefListStartIndex = 4; if (Version >= 4) { RawFunFlags = Record[3]; NumRefs = Record[4]; RefListStartIndex = 5; + if (Version >= 5) { + NumImmutableRefs = Record[5]; + RefListStartIndex = 6; + } } auto Flags = getDecodedGVSummaryFlags(RawFlags, Version); @@ -5291,9 +5359,10 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { std::vector Calls = makeCallList( ArrayRef(Record).slice(CallGraphEdgeStartIndex), IsOldProfileFormat, HasProfile, HasRelBF); + setImmutableRefs(Refs, NumImmutableRefs); auto FS = llvm::make_unique( - Flags, InstCount, getDecodedFFlags(RawFunFlags), std::move(Refs), - std::move(Calls), std::move(PendingTypeTests), + Flags, InstCount, getDecodedFFlags(RawFunFlags), /*EntryCount=*/0, + std::move(Refs), std::move(Calls), std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls), std::move(PendingTypeCheckedLoadVCalls), std::move(PendingTypeTestAssumeConstVCalls), @@ -5339,14 +5408,21 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { TheIndex.addGlobalValueSummary(GUID.first, std::move(AS)); break; } - // FS_PERMODULE_GLOBALVAR_INIT_REFS: [valueid, flags, n x valueid] + // FS_PERMODULE_GLOBALVAR_INIT_REFS: [valueid, flags, varflags, n x valueid] case bitc::FS_PERMODULE_GLOBALVAR_INIT_REFS: { unsigned ValueID = Record[0]; uint64_t RawFlags = Record[1]; + unsigned RefArrayStart = 2; + GlobalVarSummary::GVarFlags GVF; auto Flags = getDecodedGVSummaryFlags(RawFlags, Version); + if (Version >= 5) { + GVF = getDecodedGVarFlags(Record[2]); + RefArrayStart = 3; + } std::vector Refs = - makeRefList(ArrayRef(Record).slice(2)); - auto FS = llvm::make_unique(Flags, std::move(Refs)); + makeRefList(ArrayRef(Record).slice(RefArrayStart)); + auto FS = + llvm::make_unique(Flags, GVF, std::move(Refs)); FS->setModulePath(getThisModule()->first()); auto GUID = getValueInfoFromValueId(ValueID); FS->setOriginalName(GUID.second); @@ -5364,13 +5440,25 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { uint64_t RawFlags = Record[2]; unsigned InstCount = Record[3]; uint64_t RawFunFlags = 0; + uint64_t EntryCount = 0; unsigned NumRefs = Record[4]; + unsigned NumImmutableRefs = 0; int RefListStartIndex = 5; if (Version >= 4) { RawFunFlags = Record[4]; - NumRefs = Record[5]; RefListStartIndex = 6; + size_t NumRefsIndex = 5; + if (Version >= 5) { + RefListStartIndex = 7; + if (Version >= 6) { + NumRefsIndex = 6; + EntryCount = Record[5]; + RefListStartIndex = 8; + } + NumImmutableRefs = Record[RefListStartIndex - 1]; + } + NumRefs = Record[NumRefsIndex]; } auto Flags = getDecodedGVSummaryFlags(RawFlags, Version); @@ -5384,9 +5472,10 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { ArrayRef(Record).slice(CallGraphEdgeStartIndex), IsOldProfileFormat, HasProfile, false); ValueInfo VI = getValueInfoFromValueId(ValueID).first; + setImmutableRefs(Refs, NumImmutableRefs); auto FS = llvm::make_unique( - Flags, InstCount, getDecodedFFlags(RawFunFlags), std::move(Refs), - std::move(Edges), std::move(PendingTypeTests), + Flags, InstCount, getDecodedFFlags(RawFunFlags), EntryCount, + std::move(Refs), std::move(Edges), std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls), std::move(PendingTypeCheckedLoadVCalls), std::move(PendingTypeTestAssumeConstVCalls), @@ -5432,10 +5521,17 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { unsigned ValueID = Record[0]; uint64_t ModuleId = Record[1]; uint64_t RawFlags = Record[2]; + unsigned RefArrayStart = 3; + GlobalVarSummary::GVarFlags GVF; auto Flags = getDecodedGVSummaryFlags(RawFlags, Version); + if (Version >= 5) { + GVF = getDecodedGVarFlags(Record[3]); + RefArrayStart = 4; + } std::vector Refs = - makeRefList(ArrayRef(Record).slice(3)); - auto FS = llvm::make_unique(Flags, std::move(Refs)); + makeRefList(ArrayRef(Record).slice(RefArrayStart)); + auto FS = + llvm::make_unique(Flags, GVF, std::move(Refs)); LastSeenSummary = FS.get(); FS->setModulePath(ModuleIdMap[ModuleId]); ValueInfo VI = getValueInfoFromValueId(ValueID).first; diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index e77965be216a1aff6bd17c380b634092b9d852b2..3289aa0acddd9bc7ce9136df03496a7c74dec803 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -650,10 +650,6 @@ public: return MetadataList.getMetadataFwdRef(ID); } - MDNode *getMDNodeFwdRefOrNull(unsigned Idx) { - return MetadataList.getMDNodeFwdRefOrNull(Idx); - } - DISubprogram *lookupSubprogramForFunction(Function *F) { return FunctionsWithSPs.lookup(F); } @@ -772,7 +768,7 @@ MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() { // It is acknowledged by 'TODO: Inherit from Metadata' in the // NamedMDNode class definition. MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]); - assert(MD && "Invalid record"); + assert(MD && "Invalid metadata: expect fwd ref to MDNode"); NMD->addOperand(MD); } break; @@ -1049,7 +1045,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( for (unsigned i = 0; i != Size; ++i) { MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]); if (!MD) - return error("Invalid record"); + return error("Invalid named metadata: expect fwd ref to MDNode"); NMD->addOperand(MD); } break; @@ -1395,7 +1391,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( Record.size() <= 14 ? 0 : Record[14], Record.size() <= 16 ? true : Record[16], Record.size() <= 17 ? false : Record[17], - Record.size() <= 18 ? 0 : Record[18]); + Record.size() <= 18 ? 0 : Record[18], + Record.size() <= 19 ? 0 : Record[19]); MetadataList.assignValue(CU, NextMetadataNo); NextMetadataNo++; @@ -1409,20 +1406,43 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( if (Record.size() < 18 || Record.size() > 21) return error("Invalid record"); - IsDistinct = - (Record[0] & 1) || Record[8]; // All definitions should be distinct. + bool HasSPFlags = Record[0] & 4; + DISubprogram::DISPFlags SPFlags = + HasSPFlags + ? static_cast(Record[9]) + : DISubprogram::toSPFlags( + /*IsLocalToUnit=*/Record[7], /*IsDefinition=*/Record[8], + /*IsOptimized=*/Record[14], /*Virtuality=*/Record[11]); + + // All definitions should be distinct. + IsDistinct = (Record[0] & 1) || (SPFlags & DISubprogram::SPFlagDefinition); // Version 1 has a Function as Record[15]. // Version 2 has removed Record[15]. // Version 3 has the Unit as Record[15]. // Version 4 added thisAdjustment. - bool HasUnit = Record[0] >= 2; - if (HasUnit && Record.size() < 19) + // Version 5 repacked flags into DISPFlags, changing many element numbers. + bool HasUnit = Record[0] & 2; + if (!HasSPFlags && HasUnit && Record.size() < 19) + return error("Invalid record"); + if (HasSPFlags && !HasUnit) return error("Invalid record"); - Metadata *CUorFn = getMDOrNull(Record[15]); - unsigned Offset = Record.size() >= 19 ? 1 : 0; - bool HasFn = Offset && !HasUnit; - bool HasThisAdj = Record.size() >= 20; - bool HasThrownTypes = Record.size() >= 21; + // Accommodate older formats. + bool HasFn = false; + bool HasThisAdj = true; + bool HasThrownTypes = true; + unsigned OffsetA = 0; + unsigned OffsetB = 0; + if (!HasSPFlags) { + OffsetA = 2; + OffsetB = 2; + if (Record.size() >= 19) { + HasFn = !HasUnit; + OffsetB++; + } + HasThisAdj = Record.size() >= 20; + HasThrownTypes = Record.size() >= 21; + } + Metadata *CUorFn = getMDOrNull(Record[12 + OffsetB]); DISubprogram *SP = GET_OR_DISTINCT( DISubprogram, (Context, @@ -1432,20 +1452,18 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( getMDOrNull(Record[4]), // file Record[5], // line getMDOrNull(Record[6]), // type - Record[7], // isLocal - Record[8], // isDefinition - Record[9], // scopeLine - getDITypeRefOrNull(Record[10]), // containingType - Record[11], // virtuality - Record[12], // virtualIndex - HasThisAdj ? Record[19] : 0, // thisAdjustment - static_cast(Record[13]), // flags - Record[14], // isOptimized + Record[7 + OffsetA], // scopeLine + getDITypeRefOrNull(Record[8 + OffsetA]), // containingType + Record[10 + OffsetA], // virtualIndex + HasThisAdj ? Record[16 + OffsetB] : 0, // thisAdjustment + static_cast(Record[11 + OffsetA]),// flags + SPFlags, // SPFlags HasUnit ? CUorFn : nullptr, // unit - getMDOrNull(Record[15 + Offset]), // templateParams - getMDOrNull(Record[16 + Offset]), // declaration - getMDOrNull(Record[17 + Offset]), // retainedNodes - HasThrownTypes ? getMDOrNull(Record[20]) : nullptr // thrownTypes + getMDOrNull(Record[13 + OffsetB]), // templateParams + getMDOrNull(Record[14 + OffsetB]), // declaration + getMDOrNull(Record[15 + OffsetB]), // retainedNodes + HasThrownTypes ? getMDOrNull(Record[17 + OffsetB]) + : nullptr // thrownTypes )); MetadataList.assignValue(SP, NextMetadataNo); NextMetadataNo++; @@ -1833,7 +1851,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseGlobalObjectAttachment( return error("Invalid ID"); MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[I + 1]); if (!MD) - return error("Invalid metadata attachment"); + return error("Invalid metadata attachment: expect fwd ref to MDNode"); GO.addMetadata(K->second, *MD); } return Error::success(); @@ -2003,10 +2021,6 @@ Metadata *MetadataLoader::getMetadataFwdRefOrLoad(unsigned Idx) { return Pimpl->getMetadataFwdRefOrLoad(Idx); } -MDNode *MetadataLoader::getMDNodeFwdRefOrNull(unsigned Idx) { - return Pimpl->getMDNodeFwdRefOrNull(Idx); -} - DISubprogram *MetadataLoader::lookupSubprogramForFunction(Function *F) { return Pimpl->lookupSubprogramForFunction(F); } diff --git a/lib/Bitcode/Reader/MetadataLoader.h b/lib/Bitcode/Reader/MetadataLoader.h index 752eb35ebb43003c1a3df57442292415af9c0bcc..07a77a086f3240b1c85e1dfb5ca4200157fa7bb3 100644 --- a/lib/Bitcode/Reader/MetadataLoader.h +++ b/lib/Bitcode/Reader/MetadataLoader.h @@ -65,8 +65,6 @@ public: /// necessary. Metadata *getMetadataFwdRefOrLoad(unsigned Idx); - MDNode *getMDNodeFwdRefOrNull(unsigned Idx); - /// Return the DISubprogram metadata for a Function if any, null otherwise. DISubprogram *lookupSubprogramForFunction(Function *F); diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index f4634c9d3f4e39b9a5f7392f497115f55ea59167..68d79edceafb5509659c20c9afa02ef6b9a4c56b 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -112,6 +112,8 @@ enum { // FUNCTION_BLOCK abbrev id's. FUNCTION_INST_LOAD_ABBREV = bitc::FIRST_APPLICATION_ABBREV, + FUNCTION_INST_UNOP_ABBREV, + FUNCTION_INST_UNOP_FLAGS_ABBREV, FUNCTION_INST_BINOP_ABBREV, FUNCTION_INST_BINOP_FLAGS_ABBREV, FUNCTION_INST_CAST_ABBREV, @@ -513,6 +515,13 @@ static unsigned getEncodedCastOpcode(unsigned Opcode) { } } +static unsigned getEncodedUnaryOpcode(unsigned Opcode) { + switch (Opcode) { + default: llvm_unreachable("Unknown binary instruction!"); + case Instruction::FNeg: return bitc::UNOP_NEG; + } +} + static unsigned getEncodedBinaryOpcode(unsigned Opcode) { switch (Opcode) { default: llvm_unreachable("Unknown binary instruction!"); @@ -991,6 +1000,11 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) { return RawFlags; } +static uint64_t getEncodedGVarFlags(GlobalVarSummary::GVarFlags Flags) { + uint64_t RawFlags = Flags.ReadOnly; + return RawFlags; +} + static unsigned getEncodedVisibility(const GlobalValue &GV) { switch (GV.getVisibility()) { case GlobalValue::DefaultVisibility: return 0; @@ -1619,22 +1633,20 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N, void ModuleBitcodeWriter::writeDISubprogram(const DISubprogram *N, SmallVectorImpl &Record, unsigned Abbrev) { - uint64_t HasUnitFlag = 1 << 1; - Record.push_back(N->isDistinct() | HasUnitFlag); + const uint64_t HasUnitFlag = 1 << 1; + const uint64_t HasSPFlagsFlag = 1 << 2; + Record.push_back(uint64_t(N->isDistinct()) | HasUnitFlag | HasSPFlagsFlag); Record.push_back(VE.getMetadataOrNullID(N->getScope())); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); Record.push_back(VE.getMetadataOrNullID(N->getRawLinkageName())); Record.push_back(VE.getMetadataOrNullID(N->getFile())); Record.push_back(N->getLine()); Record.push_back(VE.getMetadataOrNullID(N->getType())); - Record.push_back(N->isLocalToUnit()); - Record.push_back(N->isDefinition()); Record.push_back(N->getScopeLine()); Record.push_back(VE.getMetadataOrNullID(N->getContainingType())); - Record.push_back(N->getVirtuality()); + Record.push_back(N->getSPFlags()); Record.push_back(N->getVirtualIndex()); Record.push_back(N->getFlags()); - Record.push_back(N->isOptimized()); Record.push_back(VE.getMetadataOrNullID(N->getRawUnit())); Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams().get())); Record.push_back(VE.getMetadataOrNullID(N->getDeclaration())); @@ -2384,6 +2396,16 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, Record.push_back(Flags); } break; + case Instruction::FNeg: { + assert(CE->getNumOperands() == 1 && "Unknown constant expr!"); + Code = bitc::CST_CODE_CE_UNOP; + Record.push_back(getEncodedUnaryOpcode(CE->getOpcode())); + Record.push_back(VE.getValueID(C->getOperand(0))); + uint64_t Flags = getOptimizationFlags(CE); + if (Flags != 0) + Record.push_back(Flags); + break; + } case Instruction::GetElementPtr: { Code = bitc::CST_CODE_CE_GEP; const auto *GO = cast(C); @@ -2556,7 +2578,19 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, } } break; - + case Instruction::FNeg: { + Code = bitc::FUNC_CODE_INST_UNOP; + if (!pushValueAndType(I.getOperand(0), InstID, Vals)) + AbbrevToUse = FUNCTION_INST_UNOP_ABBREV; + Vals.push_back(getEncodedUnaryOpcode(I.getOpcode())); + uint64_t Flags = getOptimizationFlags(&I); + if (Flags != 0) { + if (AbbrevToUse == FUNCTION_INST_UNOP_ABBREV) + AbbrevToUse = FUNCTION_INST_UNOP_FLAGS_ABBREV; + Vals.push_back(Flags); + } + break; + } case Instruction::GetElementPtr: { Code = bitc::FUNC_CODE_INST_GEP; AbbrevToUse = FUNCTION_INST_GEP_ABBREV; @@ -3217,6 +3251,25 @@ void ModuleBitcodeWriter::writeBlockInfo() { FUNCTION_INST_LOAD_ABBREV) llvm_unreachable("Unexpected abbrev ordering!"); } + { // INST_UNOP abbrev for FUNCTION_BLOCK. + auto Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNOP)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc + if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != + FUNCTION_INST_UNOP_ABBREV) + llvm_unreachable("Unexpected abbrev ordering!"); + } + { // INST_UNOP_FLAGS abbrev for FUNCTION_BLOCK. + auto Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNOP)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // flags + if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != + FUNCTION_INST_UNOP_FLAGS_ABBREV) + llvm_unreachable("Unexpected abbrev ordering!"); + } { // INST_BINOP abbrev for FUNCTION_BLOCK. auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP)); @@ -3489,6 +3542,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord( NameVals.push_back(FS->instCount()); NameVals.push_back(getEncodedFFlags(FS->fflags())); NameVals.push_back(FS->refs().size()); + NameVals.push_back(FS->immutableRefCount()); for (auto &RI : FS->refs()) NameVals.push_back(VE.getValueID(RI.getValue())); @@ -3530,6 +3584,7 @@ void ModuleBitcodeWriterBase::writeModuleLevelReferences( NameVals.push_back(VE.getValueID(&V)); GlobalVarSummary *VS = cast(Summary); NameVals.push_back(getEncodedGVSummaryFlags(VS->flags())); + NameVals.push_back(getEncodedGVarFlags(VS->varflags())); unsigned SizeBeforeRefs = NameVals.size(); for (auto &RI : VS->refs()) @@ -3546,7 +3601,7 @@ void ModuleBitcodeWriterBase::writeModuleLevelReferences( // Current version for the summary. // This is bumped whenever we introduce changes in the way some record are // interpreted, like flags for instance. -static const uint64_t INDEX_VERSION = 4; +static const uint64_t INDEX_VERSION = 6; /// Emit the per-module summary section alongside the rest of /// the module's bitcode. @@ -3581,6 +3636,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // immutablerefcnt // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); @@ -3597,6 +3653,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // immutablerefcnt // numrefs x valueid, n x (valueid [, rel_block_freq]) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); @@ -3675,6 +3732,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { Flags |= 0x1; if (Index.skipModuleByDistributedBackend()) Flags |= 0x2; + if (Index.hasSyntheticEntryCounts()) + Flags |= 0x4; Stream.EmitRecord(bitc::FS_FLAGS, ArrayRef{Flags}); for (const auto &GVI : valueIds()) { @@ -3690,7 +3749,9 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // entrycount Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // immutablerefcnt // numrefs x valueid, n x (valueid) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); @@ -3705,6 +3766,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // immutablerefcnt // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); @@ -3777,6 +3839,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { NameVals.push_back(*ValueId); NameVals.push_back(Index.getModuleId(VS->modulePath())); NameVals.push_back(getEncodedGVSummaryFlags(VS->flags())); + NameVals.push_back(getEncodedGVarFlags(VS->varflags())); for (auto &RI : VS->refs()) { auto RefValueId = getValueId(RI.getGUID()); if (!RefValueId) @@ -3801,18 +3864,24 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { NameVals.push_back(getEncodedGVSummaryFlags(FS->flags())); NameVals.push_back(FS->instCount()); NameVals.push_back(getEncodedFFlags(FS->fflags())); + NameVals.push_back(FS->entryCount()); + // Fill in below - NameVals.push_back(0); + NameVals.push_back(0); // numrefs + NameVals.push_back(0); // immutablerefcnt - unsigned Count = 0; + unsigned Count = 0, ImmutableRefCnt = 0; for (auto &RI : FS->refs()) { auto RefValueId = getValueId(RI.getGUID()); if (!RefValueId) continue; NameVals.push_back(*RefValueId); + if (RI.isReadOnly()) + ImmutableRefCnt++; Count++; } - NameVals[5] = Count; + NameVals[6] = Count; + NameVals[7] = ImmutableRefCnt; bool HasProfileData = false; for (auto &EI : FS->calls()) { @@ -3960,7 +4029,7 @@ void ModuleBitcodeWriter::writeModuleHash(size_t BlockStartPos) { if (ModHash) // Save the written hash value. - std::copy(std::begin(Vals), std::end(Vals), std::begin(*ModHash)); + llvm::copy(Vals, std::begin(*ModHash)); } } diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 1f54c611bad66a74b5726f384b9c04967ef89416..f25c7a098558574e40d97f8e55102df491b3865d 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -12,6 +12,7 @@ add_subdirectory(Linker) add_subdirectory(Analysis) add_subdirectory(LTO) add_subdirectory(MC) +add_subdirectory(MCA) add_subdirectory(Object) add_subdirectory(ObjectYAML) add_subdirectory(Option) @@ -23,6 +24,7 @@ add_subdirectory(AsmParser) add_subdirectory(LineEditor) add_subdirectory(ProfileData) add_subdirectory(Passes) +add_subdirectory(TextAPI) add_subdirectory(ToolDrivers) add_subdirectory(XRay) add_subdirectory(Testing) diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 526f7ce30831518d2156b7aee4a4c2bf82d3f0d7..7070451e333073a7be91b788450ce8c9569c8976 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/AsmPrinter.h" -#include "AsmPrinterHandler.h" #include "CodeViewDebug.h" #include "DwarfDebug.h" #include "DwarfException.h" @@ -36,6 +35,7 @@ #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/CodeGen/AsmPrinterHandler.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" #include "llvm/CodeGen/GCStrategy.h" @@ -54,6 +54,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -262,7 +263,7 @@ bool AsmPrinter::doInitialization(Module &M) { // use the directive, where it would need the same conditionalization // anyway. const Triple &Target = TM.getTargetTriple(); - OutStreamer->EmitVersionForTarget(Target); + OutStreamer->EmitVersionForTarget(Target, M.getSDKVersion()); // Allow the target to emit any magic that it wants at the start of the file. EmitStartOfAsmFile(M); @@ -629,8 +630,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { /// /// \p Value - The value to emit. /// \p Size - The size of the integer (in bytes) to emit. -void AsmPrinter::EmitDebugThreadLocal(const MCExpr *Value, - unsigned Size) const { +void AsmPrinter::EmitDebugValue(const MCExpr *Value, unsigned Size) const { OutStreamer->EmitValue(Value, Size); } @@ -1499,6 +1499,9 @@ bool AsmPrinter::doFinalization(Module &M) { // Emit llvm.ident metadata in an '.ident' directive. EmitModuleIdents(M); + // Emit bytes for llvm.commandline metadata. + EmitModuleCommandLines(M); + // Emit __morestack address if needed for indirect calls. if (MMI->usesMorestackAddr()) { unsigned Align = 1; @@ -2008,6 +2011,29 @@ void AsmPrinter::EmitModuleIdents(Module &M) { } } +void AsmPrinter::EmitModuleCommandLines(Module &M) { + MCSection *CommandLine = getObjFileLowering().getSectionForCommandLines(); + if (!CommandLine) + return; + + const NamedMDNode *NMD = M.getNamedMetadata("llvm.commandline"); + if (!NMD || !NMD->getNumOperands()) + return; + + OutStreamer->PushSection(); + OutStreamer->SwitchSection(CommandLine); + OutStreamer->EmitZeros(1); + for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) { + const MDNode *N = NMD->getOperand(i); + assert(N->getNumOperands() == 1 && + "llvm.commandline metadata entry can have only one operand"); + const MDString *S = cast(N->getOperand(0)); + OutStreamer->EmitBytes(S->getString()); + OutStreamer->EmitZeros(1); + } + OutStreamer->PopSection(); +} + //===--------------------------------------------------------------------===// // Emission and print routines // @@ -2977,11 +3003,6 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) { if (!S.usesMetadata()) return nullptr; - assert(!S.useStatepoints() && "statepoints do not currently support custom" - " stackmap formats, please see the documentation for a description of" - " the default format. If you really need a custom serialized format," - " please file a bug"); - gcp_map_type &GCMap = getGCMap(GCMetadataPrinters); gcp_map_type::iterator GCPI = GCMap.find(&S); if (GCPI != GCMap.end()) @@ -3002,6 +3023,27 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) { report_fatal_error("no GCMetadataPrinter registered for GC: " + Twine(Name)); } +void AsmPrinter::emitStackMaps(StackMaps &SM) { + GCModuleInfo *MI = getAnalysisIfAvailable(); + assert(MI && "AsmPrinter didn't require GCModuleInfo?"); + bool NeedsDefault = false; + if (MI->begin() == MI->end()) + // No GC strategy, use the default format. + NeedsDefault = true; + else + for (auto &I : *MI) { + if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*I)) + if (MP->emitStackMaps(SM, *this)) + continue; + // The strategy doesn't have printer or doesn't emit custom stack maps. + // Use the default format. + NeedsDefault = true; + } + + if (NeedsDefault) + SM.serializeToStackMapSection(); +} + /// Pin vtable to this file. AsmPrinterHandler::~AsmPrinterHandler() = default; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index 6055884706700ad94187c11742197ad49bbcd4a2..afce3ad3133b3376ae4445a639842cb4e5a15452 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -212,6 +212,9 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const { case MCCFIInstruction::OpWindowSave: OutStreamer->EmitCFIWindowSave(); break; + case MCCFIInstruction::OpNegateRAState: + OutStreamer->EmitCFINegateRAState(); + break; case MCCFIInstruction::OpSameValue: OutStreamer->EmitCFISameValue(Inst.getRegister()); break; diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 01d018fdde37bad3c844ca5f11e758e37866bcb7..4132b1a30c42ab537812bf581792ae8b003e761b 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -44,6 +44,7 @@ #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/ContinuationRecordBuilder.h" #include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h" +#include "llvm/DebugInfo/CodeView/EnumTables.h" #include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h" @@ -92,9 +93,6 @@ using namespace llvm; using namespace llvm::codeview; -static cl::opt EmitDebugGlobalHashes("emit-codeview-ghash-section", - cl::ReallyHidden, cl::init(false)); - static CPUType mapArchToCVCPUType(Triple::ArchType Type) { switch (Type) { case Triple::ArchType::x86: @@ -125,6 +123,11 @@ CodeViewDebug::CodeViewDebug(AsmPrinter *AP) TheCPU = mapArchToCVCPUType(Triple(MMI->getModule()->getTargetTriple()).getArch()); + + // Check if we should emit type record hashes. + ConstantInt *GH = mdconst::extract_or_null( + MMI->getModule()->getModuleFlag("CodeViewGHash")); + EmitDebugGlobalHashes = GH && !GH->isZero(); } StringRef CodeViewDebug::getFullFilepath(const DIFile *File) { @@ -729,14 +732,7 @@ static Version parseVersion(StringRef Name) { } void CodeViewDebug::emitCompilerInformation() { - MCContext &Context = MMI->getContext(); - MCSymbol *CompilerBegin = Context.createTempSymbol(), - *CompilerEnd = Context.createTempSymbol(); - OS.AddComment("Record length"); - OS.emitAbsoluteSymbolDiff(CompilerEnd, CompilerBegin, 2); - OS.EmitLabel(CompilerBegin); - OS.AddComment("Record kind: S_COMPILE3"); - OS.EmitIntValue(SymbolKind::S_COMPILE3, 2); + MCSymbol *CompilerEnd = beginSymbolRecord(SymbolKind::S_COMPILE3); uint32_t Flags = 0; NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu"); @@ -775,7 +771,7 @@ void CodeViewDebug::emitCompilerInformation() { OS.AddComment("Null-terminated compiler version string"); emitNullTerminatedSymbolName(OS, CompilerVersion); - OS.EmitLabel(CompilerEnd); + endSymbolRecord(CompilerEnd); } static TypeIndex getStringIdTypeIdx(GlobalTypeTableBuilder &TypeTable, @@ -811,14 +807,12 @@ void CodeViewDebug::emitBuildInfo() { // Make a new .debug$S subsection for the S_BUILDINFO record, which points // from the module symbols into the type stream. - MCSymbol *BuildInfoEnd = beginCVSubsection(DebugSubsectionKind::Symbols); - OS.AddComment("Record length"); - OS.EmitIntValue(6, 2); - OS.AddComment("Record kind: S_BUILDINFO"); - OS.EmitIntValue(unsigned(SymbolKind::S_BUILDINFO), 2); + MCSymbol *BISubsecEnd = beginCVSubsection(DebugSubsectionKind::Symbols); + MCSymbol *BIEnd = beginSymbolRecord(SymbolKind::S_BUILDINFO); OS.AddComment("LF_BUILDINFO index"); OS.EmitIntValue(BuildInfoIndex.getIndex(), 4); - endCVSubsection(BuildInfoEnd); + endSymbolRecord(BIEnd); + endCVSubsection(BISubsecEnd); } void CodeViewDebug::emitInlineeLinesSubsection() { @@ -858,18 +852,11 @@ void CodeViewDebug::emitInlineeLinesSubsection() { void CodeViewDebug::emitInlinedCallSite(const FunctionInfo &FI, const DILocation *InlinedAt, const InlineSite &Site) { - MCSymbol *InlineBegin = MMI->getContext().createTempSymbol(), - *InlineEnd = MMI->getContext().createTempSymbol(); - assert(TypeIndices.count({Site.Inlinee, nullptr})); TypeIndex InlineeIdx = TypeIndices[{Site.Inlinee, nullptr}]; // SymbolRecord - OS.AddComment("Record length"); - OS.emitAbsoluteSymbolDiff(InlineEnd, InlineBegin, 2); // RecordLength - OS.EmitLabel(InlineBegin); - OS.AddComment("Record kind: S_INLINESITE"); - OS.EmitIntValue(SymbolKind::S_INLINESITE, 2); // RecordKind + MCSymbol *InlineEnd = beginSymbolRecord(SymbolKind::S_INLINESITE); OS.AddComment("PtrParent"); OS.EmitIntValue(0, 4); @@ -884,7 +871,7 @@ void CodeViewDebug::emitInlinedCallSite(const FunctionInfo &FI, OS.EmitCVInlineLinetableDirective(Site.SiteFuncId, FileId, StartLineNum, FI.Begin, FI.End); - OS.EmitLabel(InlineEnd); + endSymbolRecord(InlineEnd); emitLocalVariableList(FI, Site.InlinedLocals); @@ -897,10 +884,7 @@ void CodeViewDebug::emitInlinedCallSite(const FunctionInfo &FI, } // Close the scope. - OS.AddComment("Record length"); - OS.EmitIntValue(2, 2); // RecordLength - OS.AddComment("Record kind: S_INLINESITE_END"); - OS.EmitIntValue(SymbolKind::S_INLINESITE_END, 2); // RecordKind + emitEndSymbolRecord(SymbolKind::S_INLINESITE_END); } void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) { @@ -935,13 +919,7 @@ void CodeViewDebug::emitDebugInfoForThunk(const Function *GV, MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols); // Emit S_THUNK32 - MCSymbol *ThunkRecordBegin = MMI->getContext().createTempSymbol(), - *ThunkRecordEnd = MMI->getContext().createTempSymbol(); - OS.AddComment("Record length"); - OS.emitAbsoluteSymbolDiff(ThunkRecordEnd, ThunkRecordBegin, 2); - OS.EmitLabel(ThunkRecordBegin); - OS.AddComment("Record kind: S_THUNK32"); - OS.EmitIntValue(unsigned(SymbolKind::S_THUNK32), 2); + MCSymbol *ThunkRecordEnd = beginSymbolRecord(SymbolKind::S_THUNK32); OS.AddComment("PtrParent"); OS.EmitIntValue(0, 4); OS.AddComment("PtrEnd"); @@ -959,17 +937,13 @@ void CodeViewDebug::emitDebugInfoForThunk(const Function *GV, OS.AddComment("Function name"); emitNullTerminatedSymbolName(OS, FuncName); // Additional fields specific to the thunk ordinal would go here. - OS.EmitLabel(ThunkRecordEnd); + endSymbolRecord(ThunkRecordEnd); // Local variables/inlined routines are purposely omitted here. The point of // marking this as a thunk is so Visual Studio will NOT stop in this routine. // Emit S_PROC_ID_END - const unsigned RecordLengthForSymbolEnd = 2; - OS.AddComment("Record length"); - OS.EmitIntValue(RecordLengthForSymbolEnd, 2); - OS.AddComment("Record kind: S_PROC_ID_END"); - OS.EmitIntValue(unsigned(SymbolKind::S_PROC_ID_END), 2); + emitEndSymbolRecord(SymbolKind::S_PROC_ID_END); endCVSubsection(SymbolsEnd); } @@ -1012,19 +986,9 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, OS.AddComment("Symbol subsection for " + Twine(FuncName)); MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols); { - MCSymbol *ProcRecordBegin = MMI->getContext().createTempSymbol(), - *ProcRecordEnd = MMI->getContext().createTempSymbol(); - OS.AddComment("Record length"); - OS.emitAbsoluteSymbolDiff(ProcRecordEnd, ProcRecordBegin, 2); - OS.EmitLabel(ProcRecordBegin); - - if (GV->hasLocalLinkage()) { - OS.AddComment("Record kind: S_LPROC32_ID"); - OS.EmitIntValue(unsigned(SymbolKind::S_LPROC32_ID), 2); - } else { - OS.AddComment("Record kind: S_GPROC32_ID"); - OS.EmitIntValue(unsigned(SymbolKind::S_GPROC32_ID), 2); - } + SymbolKind ProcKind = GV->hasLocalLinkage() ? SymbolKind::S_LPROC32_ID + : SymbolKind::S_GPROC32_ID; + MCSymbol *ProcRecordEnd = beginSymbolRecord(ProcKind); // These fields are filled in by tools like CVPACK which run after the fact. OS.AddComment("PtrParent"); @@ -1053,15 +1017,9 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, OS.AddComment("Function name"); // Truncate the name so we won't overflow the record length field. emitNullTerminatedSymbolName(OS, FuncName); - OS.EmitLabel(ProcRecordEnd); - - MCSymbol *FrameProcBegin = MMI->getContext().createTempSymbol(), - *FrameProcEnd = MMI->getContext().createTempSymbol(); - OS.AddComment("Record length"); - OS.emitAbsoluteSymbolDiff(FrameProcEnd, FrameProcBegin, 2); - OS.EmitLabel(FrameProcBegin); - OS.AddComment("Record kind: S_FRAMEPROC"); - OS.EmitIntValue(unsigned(SymbolKind::S_FRAMEPROC), 2); + endSymbolRecord(ProcRecordEnd); + + MCSymbol *FrameProcEnd = beginSymbolRecord(SymbolKind::S_FRAMEPROC); // Subtract out the CSR size since MSVC excludes that and we include it. OS.AddComment("FrameSize"); OS.EmitIntValue(FI.FrameSize - FI.CSRSize, 4); @@ -1077,7 +1035,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, OS.EmitIntValue(0, 2); OS.AddComment("Flags (defines frame register)"); OS.EmitIntValue(uint32_t(FI.FrameProcOpts), 4); - OS.EmitLabel(FrameProcEnd); + endSymbolRecord(FrameProcEnd); emitLocalVariableList(FI, FI.Locals); emitLexicalBlockList(FI.ChildBlocks, FI); @@ -1095,13 +1053,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, for (auto Annot : FI.Annotations) { MCSymbol *Label = Annot.first; MDTuple *Strs = cast(Annot.second); - MCSymbol *AnnotBegin = MMI->getContext().createTempSymbol(), - *AnnotEnd = MMI->getContext().createTempSymbol(); - OS.AddComment("Record length"); - OS.emitAbsoluteSymbolDiff(AnnotEnd, AnnotBegin, 2); - OS.EmitLabel(AnnotBegin); - OS.AddComment("Record kind: S_ANNOTATION"); - OS.EmitIntValue(SymbolKind::S_ANNOTATION, 2); + MCSymbol *AnnotEnd = beginSymbolRecord(SymbolKind::S_ANNOTATION); OS.EmitCOFFSecRel32(Label, /*Offset=*/0); // FIXME: Make sure we don't overflow the max record size. OS.EmitCOFFSectionIndex(Label); @@ -1113,17 +1065,14 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, assert(Str.data()[Str.size()] == '\0' && "non-nullterminated MDString"); OS.EmitBytes(StringRef(Str.data(), Str.size() + 1)); } - OS.EmitLabel(AnnotEnd); + endSymbolRecord(AnnotEnd); } if (SP != nullptr) emitDebugInfoForUDTs(LocalUDTs); // We're done with this function. - OS.AddComment("Record length"); - OS.EmitIntValue(0x0002, 2); - OS.AddComment("Record kind: S_PROC_ID_END"); - OS.EmitIntValue(unsigned(SymbolKind::S_PROC_ID_END), 2); + emitEndSymbolRecord(SymbolKind::S_PROC_ID_END); } endCVSubsection(SymbolsEnd); @@ -1713,6 +1662,9 @@ TypeIndex CodeViewDebug::lowerTypePointer(const DIDerivedType *Ty, break; } + if (Ty->isObjectPointer()) + PO |= PointerOptions::Const; + PointerRecord PR(PointeeTI, PK, PM, PO, Ty->getSizeInBits() / 8); return TypeTable.writeLeafType(PR); } @@ -1877,27 +1829,22 @@ TypeIndex CodeViewDebug::lowerTypeMemberFunction(const DISubroutineType *Ty, // Lower the containing class type. TypeIndex ClassType = getTypeIndex(ClassTy); - SmallVector ReturnAndArgTypeIndices; - for (DITypeRef ArgTypeRef : Ty->getTypeArray()) - ReturnAndArgTypeIndices.push_back(getTypeIndex(ArgTypeRef)); + DITypeRefArray ReturnAndArgs = Ty->getTypeArray(); + + unsigned Index = 0; + SmallVector ArgTypeIndices; + TypeIndex ReturnTypeIndex = getTypeIndex(ReturnAndArgs[Index++]); - // MSVC uses type none for variadic argument. - if (ReturnAndArgTypeIndices.size() > 1 && - ReturnAndArgTypeIndices.back() == TypeIndex::Void()) { - ReturnAndArgTypeIndices.back() = TypeIndex::None(); - } - TypeIndex ReturnTypeIndex = TypeIndex::Void(); - ArrayRef ArgTypeIndices = None; - if (!ReturnAndArgTypeIndices.empty()) { - auto ReturnAndArgTypesRef = makeArrayRef(ReturnAndArgTypeIndices); - ReturnTypeIndex = ReturnAndArgTypesRef.front(); - ArgTypeIndices = ReturnAndArgTypesRef.drop_front(); - } TypeIndex ThisTypeIndex; - if (!IsStaticMethod && !ArgTypeIndices.empty()) { - ThisTypeIndex = ArgTypeIndices.front(); - ArgTypeIndices = ArgTypeIndices.drop_front(); - } + if (!IsStaticMethod && ReturnAndArgs.size() > 1) + ThisTypeIndex = getTypeIndexForThisPtr(ReturnAndArgs[Index++], Ty); + + while (Index < ReturnAndArgs.size()) + ArgTypeIndices.push_back(getTypeIndex(ReturnAndArgs[Index++])); + + // MSVC uses type none for variadic argument. + if (!ArgTypeIndices.empty() && ArgTypeIndices.back() == TypeIndex::Void()) + ArgTypeIndices.back() = TypeIndex::None(); ArgListRecord ArgListRec(TypeRecordKind::ArgList, ArgTypeIndices); TypeIndex ArgListIndex = TypeTable.writeLeafType(ArgListRec); @@ -1987,8 +1934,8 @@ static ClassOptions getCommonClassOptions(const DICompositeType *Ty) { CO |= ClassOptions::Nested; // Put the Scoped flag on function-local types. MSVC puts this flag for enum - // type only when it has an immediate function scope. Clang never puts enums - // inside DILexicalBlock scopes. Enum types, as generated by clang, are + // type only when it has an immediate function scope. Clang never puts enums + // inside DILexicalBlock scopes. Enum types, as generated by clang, are // always in function, class, or file scopes. if (Ty->getTag() == dwarf::DW_TAG_enumeration_type) { if (ImmediateScope && isa(ImmediateScope)) @@ -2444,6 +2391,31 @@ TypeIndex CodeViewDebug::getTypeIndex(DITypeRef TypeRef, DITypeRef ClassTyRef) { return recordTypeIndexForDINode(Ty, TI, ClassTy); } +codeview::TypeIndex +CodeViewDebug::getTypeIndexForThisPtr(DITypeRef TypeRef, + const DISubroutineType *SubroutineTy) { + const DIType *Ty = TypeRef.resolve(); + + PointerOptions Options = PointerOptions::None; + if (SubroutineTy->getFlags() & DINode::DIFlags::FlagLValueReference) + Options = PointerOptions::LValueRefThisPointer; + else if (SubroutineTy->getFlags() & DINode::DIFlags::FlagRValueReference) + Options = PointerOptions::RValueRefThisPointer; + + // Check if we've already translated this type. If there is no ref qualifier + // on the function then we look up this pointer type with no associated class + // so that the TypeIndex for the this pointer can be shared with the type + // index for other pointers to this class type. If there is a ref qualifier + // then we lookup the pointer using the subroutine as the parent type. + auto I = TypeIndices.find({Ty, SubroutineTy}); + if (I != TypeIndices.end()) + return I->second; + + TypeLoweringScope S(*this); + TypeIndex TI = lowerTypePointer(cast(Ty), Options); + return recordTypeIndexForDINode(Ty, TI, SubroutineTy); +} + TypeIndex CodeViewDebug::getTypeIndexForReferenceTo(DITypeRef TypeRef) { DIType *Ty = TypeRef.resolve(); PointerRecord PR(getTypeIndex(Ty), @@ -2461,6 +2433,14 @@ TypeIndex CodeViewDebug::getCompleteTypeIndex(DITypeRef TypeRef) { if (!Ty) return TypeIndex::Void(); + // Look through typedefs when getting the complete type index. Call + // getTypeIndex on the typdef to ensure that any UDTs are accumulated and are + // emitted only once. + if (Ty->getTag() == dwarf::DW_TAG_typedef) + (void)getTypeIndex(Ty); + while (Ty->getTag() == dwarf::DW_TAG_typedef) + Ty = cast(Ty)->getBaseType().resolve(); + // If this is a non-record type, the complete type index is the same as the // normal type index. Just call getTypeIndex. switch (Ty->getTag()) { @@ -2562,14 +2542,7 @@ static void copyBytesForDefRange(SmallString<20> &BytePrefix, void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI, const LocalVariable &Var) { // LocalSym record, see SymbolRecord.h for more info. - MCSymbol *LocalBegin = MMI->getContext().createTempSymbol(), - *LocalEnd = MMI->getContext().createTempSymbol(); - OS.AddComment("Record length"); - OS.emitAbsoluteSymbolDiff(LocalEnd, LocalBegin, 2); - OS.EmitLabel(LocalBegin); - - OS.AddComment("Record kind: S_LOCAL"); - OS.EmitIntValue(unsigned(SymbolKind::S_LOCAL), 2); + MCSymbol *LocalEnd = beginSymbolRecord(SymbolKind::S_LOCAL); LocalSymFlags Flags = LocalSymFlags::None; if (Var.DIVar->isParameter()) @@ -2586,7 +2559,7 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI, OS.EmitIntValue(static_cast(Flags), 2); // Truncate the name so we won't overflow the record length field. emitNullTerminatedSymbolName(OS, Var.DIVar->getName()); - OS.EmitLabel(LocalEnd); + endSymbolRecord(LocalEnd); // Calculate the on disk prefix of the appropriate def range record. The // records and on disk formats are described in SymbolRecords.h. BytePrefix @@ -2658,15 +2631,7 @@ void CodeViewDebug::emitLexicalBlockList(ArrayRef Blocks, /// lexical block scope. void CodeViewDebug::emitLexicalBlock(const LexicalBlock &Block, const FunctionInfo& FI) { - MCSymbol *RecordBegin = MMI->getContext().createTempSymbol(), - *RecordEnd = MMI->getContext().createTempSymbol(); - - // Lexical block symbol record. - OS.AddComment("Record length"); - OS.emitAbsoluteSymbolDiff(RecordEnd, RecordBegin, 2); // Record Length - OS.EmitLabel(RecordBegin); - OS.AddComment("Record kind: S_BLOCK32"); - OS.EmitIntValue(SymbolKind::S_BLOCK32, 2); // Record Kind + MCSymbol *RecordEnd = beginSymbolRecord(SymbolKind::S_BLOCK32); OS.AddComment("PtrParent"); OS.EmitIntValue(0, 4); // PtrParent OS.AddComment("PtrEnd"); @@ -2679,7 +2644,7 @@ void CodeViewDebug::emitLexicalBlock(const LexicalBlock &Block, OS.EmitCOFFSectionIndex(FI.Begin); // Func Symbol OS.AddComment("Lexical block name"); emitNullTerminatedSymbolName(OS, Block.Name); // Name - OS.EmitLabel(RecordEnd); + endSymbolRecord(RecordEnd); // Emit variables local to this lexical block. emitLocalVariableList(FI, Block.Locals); @@ -2688,10 +2653,7 @@ void CodeViewDebug::emitLexicalBlock(const LexicalBlock &Block, emitLexicalBlockList(Block.Children, FI); // Close the lexical block scope. - OS.AddComment("Record length"); - OS.EmitIntValue(2, 2); // Record Length - OS.AddComment("Record kind: S_END"); - OS.EmitIntValue(SymbolKind::S_END, 2); // Record Kind + emitEndSymbolRecord(SymbolKind::S_END); } /// Convenience routine for collecting lexical block information for a list @@ -2849,26 +2811,53 @@ void CodeViewDebug::endCVSubsection(MCSymbol *EndLabel) { OS.EmitValueToAlignment(4); } +static StringRef getSymbolName(SymbolKind SymKind) { + for (const EnumEntry &EE : getSymbolTypeNames()) + if (EE.Value == SymKind) + return EE.Name; + return ""; +} + +MCSymbol *CodeViewDebug::beginSymbolRecord(SymbolKind SymKind) { + MCSymbol *BeginLabel = MMI->getContext().createTempSymbol(), + *EndLabel = MMI->getContext().createTempSymbol(); + OS.AddComment("Record length"); + OS.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 2); + OS.EmitLabel(BeginLabel); + if (OS.isVerboseAsm()) + OS.AddComment("Record kind: " + getSymbolName(SymKind)); + OS.EmitIntValue(unsigned(SymKind), 2); + return EndLabel; +} + +void CodeViewDebug::endSymbolRecord(MCSymbol *SymEnd) { + // MSVC does not pad out symbol records to four bytes, but LLVM does to avoid + // an extra copy of every symbol record in LLD. This increases object file + // size by less than 1% in the clang build, and is compatible with the Visual + // C++ linker. + OS.EmitValueToAlignment(4); + OS.EmitLabel(SymEnd); +} + +void CodeViewDebug::emitEndSymbolRecord(SymbolKind EndKind) { + OS.AddComment("Record length"); + OS.EmitIntValue(2, 2); + if (OS.isVerboseAsm()) + OS.AddComment("Record kind: " + getSymbolName(EndKind)); + OS.EmitIntValue(unsigned(EndKind), 2); // Record Kind +} + void CodeViewDebug::emitDebugInfoForUDTs( ArrayRef> UDTs) { for (const auto &UDT : UDTs) { const DIType *T = UDT.second; assert(shouldEmitUdt(T)); - MCSymbol *UDTRecordBegin = MMI->getContext().createTempSymbol(), - *UDTRecordEnd = MMI->getContext().createTempSymbol(); - OS.AddComment("Record length"); - OS.emitAbsoluteSymbolDiff(UDTRecordEnd, UDTRecordBegin, 2); - OS.EmitLabel(UDTRecordBegin); - - OS.AddComment("Record kind: S_UDT"); - OS.EmitIntValue(unsigned(SymbolKind::S_UDT), 2); - + MCSymbol *UDTRecordEnd = beginSymbolRecord(SymbolKind::S_UDT); OS.AddComment("Type"); OS.EmitIntValue(getCompleteTypeIndex(T).getIndex(), 4); - emitNullTerminatedSymbolName(OS, UDT.first); - OS.EmitLabel(UDTRecordEnd); + endSymbolRecord(UDTRecordEnd); } } @@ -2939,31 +2928,14 @@ void CodeViewDebug::emitDebugInfoForRetainedTypes() { void CodeViewDebug::emitDebugInfoForGlobal(const DIGlobalVariable *DIGV, const GlobalVariable *GV, MCSymbol *GVSym) { - // DataSym record, see SymbolRecord.h for more info. - // FIXME: Thread local data, etc - MCSymbol *DataBegin = MMI->getContext().createTempSymbol(), - *DataEnd = MMI->getContext().createTempSymbol(); - const unsigned FixedLengthOfThisRecord = 12; - OS.AddComment("Record length"); - OS.emitAbsoluteSymbolDiff(DataEnd, DataBegin, 2); - OS.EmitLabel(DataBegin); - if (DIGV->isLocalToUnit()) { - if (GV->isThreadLocal()) { - OS.AddComment("Record kind: S_LTHREAD32"); - OS.EmitIntValue(unsigned(SymbolKind::S_LTHREAD32), 2); - } else { - OS.AddComment("Record kind: S_LDATA32"); - OS.EmitIntValue(unsigned(SymbolKind::S_LDATA32), 2); - } - } else { - if (GV->isThreadLocal()) { - OS.AddComment("Record kind: S_GTHREAD32"); - OS.EmitIntValue(unsigned(SymbolKind::S_GTHREAD32), 2); - } else { - OS.AddComment("Record kind: S_GDATA32"); - OS.EmitIntValue(unsigned(SymbolKind::S_GDATA32), 2); - } - } + // DataSym record, see SymbolRecord.h for more info. Thread local data + // happens to have the same format as global data. + SymbolKind DataSym = GV->isThreadLocal() + ? (DIGV->isLocalToUnit() ? SymbolKind::S_LTHREAD32 + : SymbolKind::S_GTHREAD32) + : (DIGV->isLocalToUnit() ? SymbolKind::S_LDATA32 + : SymbolKind::S_GDATA32); + MCSymbol *DataEnd = beginSymbolRecord(DataSym); OS.AddComment("Type"); OS.EmitIntValue(getCompleteTypeIndex(DIGV->getType()).getIndex(), 4); OS.AddComment("DataOffset"); @@ -2971,6 +2943,7 @@ void CodeViewDebug::emitDebugInfoForGlobal(const DIGlobalVariable *DIGV, OS.AddComment("Segment"); OS.EmitCOFFSectionIndex(GVSym); OS.AddComment("Name"); - emitNullTerminatedSymbolName(OS, DIGV->getName(), FixedLengthOfThisRecord); - OS.EmitLabel(DataEnd); + const unsigned LengthOfDataRecord = 12; + emitNullTerminatedSymbolName(OS, DIGV->getName(), LengthOfDataRecord); + endSymbolRecord(DataEnd); } diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h index ef0f0c3635e501b8b5e73df307ef0dd3bfc5eede..fcc7c3290eb2c4aa1388dacf638cded494bb67b9 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -14,14 +14,14 @@ #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H #define LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H -#include "DbgEntityHistoryCalculator.h" -#include "DebugHandlerBase.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/DbgEntityHistoryCalculator.h" +#include "llvm/CodeGen/DebugHandlerBase.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" @@ -54,6 +54,9 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { BumpPtrAllocator Allocator; codeview::GlobalTypeTableBuilder TypeTable; + /// Whether to emit type record hashes into .debug$H. + bool EmitDebugGlobalHashes = false; + /// The codeview CPU type used by the translation unit. codeview::CPUType TheCPU; @@ -299,9 +302,18 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { /// Returns an end label for use with endCVSubsection when the subsection is /// finished. MCSymbol *beginCVSubsection(codeview::DebugSubsectionKind Kind); - void endCVSubsection(MCSymbol *EndLabel); + /// Opens a symbol record of the given kind. Returns an end label for use with + /// endSymbolRecord. + MCSymbol *beginSymbolRecord(codeview::SymbolKind Kind); + void endSymbolRecord(MCSymbol *SymEnd); + + /// Emits an S_END, S_INLINESITE_END, or S_PROC_ID_END record. These records + /// are empty, so we emit them with a simpler assembly sequence that doesn't + /// involve labels. + void emitEndSymbolRecord(codeview::SymbolKind EndKind); + void emitInlinedCallSite(const FunctionInfo &FI, const DILocation *InlinedAt, const InlineSite &Site); @@ -343,6 +355,10 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { codeview::TypeIndex getTypeIndex(DITypeRef TypeRef, DITypeRef ClassTyRef = DITypeRef()); + codeview::TypeIndex + getTypeIndexForThisPtr(DITypeRef TypeRef, + const DISubroutineType *SubroutineTy); + codeview::TypeIndex getTypeIndexForReferenceTo(DITypeRef TypeRef); codeview::TypeIndex getMemberFunctionType(const DISubprogram *SP, diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index 301fd9ef81b3cfabb952c1deb76e003b0c47ba69..e27659494f08125d0125ed860fed1435037c0d2f 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -465,7 +465,7 @@ void DIEInteger::print(raw_ostream &O) const { /// EmitValue - Emit expression value. /// void DIEExpr::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const { - AP->EmitDebugThreadLocal(Expr, SizeOf(AP, Form)); + AP->EmitDebugValue(Expr, SizeOf(AP, Form)); } /// SizeOf - Determine size of expression value in bytes. diff --git a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp index bb2fa7d9c72069bace064c968fc10745a052604b..09867822c30ac9f1c281cdc0108911edc96a3377 100644 --- a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp +++ b/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "DbgEntityHistoryCalculator.h" +#include "llvm/CodeGen/DbgEntityHistoryCalculator.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index a362dd40e3b16121df0df7534fb5a74bb164e99b..551cd36d19843d6cf1c0a68a37a5365ef5341008 100644 --- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "DebugHandlerBase.h" +#include "llvm/CodeGen/DebugHandlerBase.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index d93c7f6c8459ea038b083b5109c986b770b4989d..1dca3f0fce5baf647f26da923f5d14e9746812a0 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1125,3 +1125,12 @@ bool DwarfCompileUnit::includeMinimalInlineScopes() const { return getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly || (DD->useSplitDwarf() && !Skeleton); } + +void DwarfCompileUnit::addAddrTableBase() { + const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); + MCSymbol *Label = DD->getAddressPool().getLabel(); + addSectionLabel(getUnitDie(), + getDwarfVersion() >= 5 ? dwarf::DW_AT_addr_base + : dwarf::DW_AT_GNU_addr_base, + Label, TLOF.getDwarfAddrSection()->getBeginSymbol()); +} diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 13679c37fe547c45ac7dd833a91e09160305bae4..9ec22f68c12f1a0dc4b69b4f5cca2de589c75143 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -14,7 +14,6 @@ #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H -#include "DbgEntityHistoryCalculator.h" #include "DwarfDebug.h" #include "DwarfUnit.h" #include "llvm/ADT/ArrayRef.h" @@ -23,6 +22,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/CodeGen/DbgEntityHistoryCalculator.h" #include "llvm/CodeGen/DIE.h" #include "llvm/CodeGen/LexicalScopes.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -243,6 +243,9 @@ public: void emitHeader(bool UseOffsets) override; + /// Add the DW_AT_addr_base attribute to the unit DIE. + void addAddrTableBase(); + MCSymbol *getLabelBegin() const { assert(getSection()); return LabelBegin; diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 070b8fe4ec1c4a1b7b1eb6cb4f7539270ba4604d..2d8c9e497388155f705894fe502285fbc0c4dc4b 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -568,28 +568,10 @@ void DwarfDebug::addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const { U.addFlag(D, dwarf::DW_AT_GNU_pubnames); } -// Create new DwarfCompileUnit for the given metadata node with tag -// DW_TAG_compile_unit. -DwarfCompileUnit & -DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { - if (auto *CU = CUMap.lookup(DIUnit)) - return *CU; - StringRef FN = DIUnit->getFilename(); - CompilationDir = DIUnit->getDirectory(); - - auto OwnedUnit = llvm::make_unique( - InfoHolder.getUnits().size(), DIUnit, Asm, this, &InfoHolder); - DwarfCompileUnit &NewCU = *OwnedUnit; +void DwarfDebug::finishUnitAttributes(const DICompileUnit *DIUnit, + DwarfCompileUnit &NewCU) { DIE &Die = NewCU.getUnitDie(); - InfoHolder.addUnit(std::move(OwnedUnit)); - if (useSplitDwarf()) { - NewCU.setSkeleton(constructSkeletonCU(NewCU)); - NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name, - Asm->TM.Options.MCOptions.SplitDwarfFile); - } - - for (auto *IE : DIUnit->getImportedEntities()) - NewCU.addImportedEntity(IE); + StringRef FN = DIUnit->getFilename(); // LTO with assembly output shares a single line table amongst multiple CUs. // To avoid the compilation directory being ambiguous, let the line table @@ -640,11 +622,6 @@ DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { dwarf::DW_FORM_data1, RVer); } - if (useSplitDwarf()) - NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoDWOSection()); - else - NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoSection()); - if (DIUnit->getDWOId()) { // This CU is either a clang module DWO or a skeleton CU. NewCU.addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, @@ -654,9 +631,34 @@ DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name, DIUnit->getSplitDebugFilename()); } +} +// Create new DwarfCompileUnit for the given metadata node with tag +// DW_TAG_compile_unit. +DwarfCompileUnit & +DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { + if (auto *CU = CUMap.lookup(DIUnit)) + return *CU; + + CompilationDir = DIUnit->getDirectory(); + + auto OwnedUnit = llvm::make_unique( + InfoHolder.getUnits().size(), DIUnit, Asm, this, &InfoHolder); + DwarfCompileUnit &NewCU = *OwnedUnit; + InfoHolder.addUnit(std::move(OwnedUnit)); + + for (auto *IE : DIUnit->getImportedEntities()) + NewCU.addImportedEntity(IE); + + if (useSplitDwarf()) { + NewCU.setSkeleton(constructSkeletonCU(NewCU)); + NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoDWOSection()); + } else { + finishUnitAttributes(DIUnit, NewCU); + NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoSection()); + } CUMap.insert({DIUnit, &NewCU}); - CUDieMap.insert({&Die, &NewCU}); + CUDieMap.insert({&NewCU.getUnitDie(), &NewCU}); return NewCU; } @@ -851,7 +853,12 @@ void DwarfDebug::finalizeModuleInfo() { // If we're splitting the dwarf out now that we've got the entire // CU then add the dwo id to it. auto *SkCU = TheCU.getSkeleton(); - if (useSplitDwarf()) { + if (useSplitDwarf() && !empty(TheCU.getUnitDie().children())) { + finishUnitAttributes(TheCU.getCUNode(), TheCU); + TheCU.addString(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_name, + Asm->TM.Options.MCOptions.SplitDwarfFile); + SkCU->addString(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_name, + Asm->TM.Options.MCOptions.SplitDwarfFile); // Emit a unique identifier for this CU. uint64_t ID = DIEHash(Asm).computeCUSignature(DWOName, TheCU.getUnitDie()); @@ -870,6 +877,8 @@ void DwarfDebug::finalizeModuleInfo() { SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_ranges_base, Sym, Sym); } + } else if (SkCU) { + finishUnitAttributes(SkCU->getCUNode(), *SkCU); } // If we have code split among multiple sections or non-contiguous @@ -882,7 +891,9 @@ void DwarfDebug::finalizeModuleInfo() { // We don't keep track of which addresses are used in which CU so this // is a bit pessimistic under LTO. - if (!AddrPool.isEmpty()) + if (!AddrPool.isEmpty() && + (getDwarfVersion() >= 5 || + (SkCU && !empty(TheCU.getUnitDie().children())))) U.addAddrTableBase(); if (unsigned NumRanges = TheCU.getRanges().size()) { @@ -1839,8 +1850,8 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name, if (GnuStyle) { dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheU, Entity); Asm->OutStreamer->AddComment( - Twine("Kind: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) + ", " + - dwarf::GDBIndexEntryLinkageString(Desc.Linkage)); + Twine("Attributes: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) + + ", " + dwarf::GDBIndexEntryLinkageString(Desc.Linkage)); Asm->emitInt8(Desc.toBits()); } @@ -2287,7 +2298,7 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, // contributions. auto *Base = CUBase; if (!Base && (P.second.size() > 1 || DwarfVersion < 5) && - (UseDwarfRangesBaseAddressSpecifier || DwarfVersion >= 5)) { + (CU.getCUNode()->getRangesBaseAddress() || DwarfVersion >= 5)) { BaseIsSet = true; // FIXME/use care: This may not be a useful base address if it's not // the lowest address/range in this object. @@ -2483,8 +2494,6 @@ void DwarfDebug::emitDebugMacinfo() { void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die, std::unique_ptr NewU) { - NewU->addString(Die, dwarf::DW_AT_GNU_dwo_name, - Asm->TM.Options.MCOptions.SplitDwarfFile); if (!CompilationDir.empty()) NewU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir); @@ -2609,10 +2618,18 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, NewTU.setTypeSignature(Signature); Ins.first->second = Signature; - if (useSplitDwarf()) - NewTU.setSection(Asm->getObjFileLowering().getDwarfTypesDWOSection()); - else { - NewTU.setSection(Asm->getObjFileLowering().getDwarfTypesSection(Signature)); + if (useSplitDwarf()) { + MCSection *Section = + getDwarfVersion() <= 4 + ? Asm->getObjFileLowering().getDwarfTypesDWOSection() + : Asm->getObjFileLowering().getDwarfInfoDWOSection(); + NewTU.setSection(Section); + } else { + MCSection *Section = + getDwarfVersion() <= 4 + ? Asm->getObjFileLowering().getDwarfTypesSection(Signature) + : Asm->getObjFileLowering().getDwarfInfoSection(Signature); + NewTU.setSection(Section); // Non-split type units reuse the compile unit's line table. CU.applyStmtList(UnitDie); } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index c73d442af2fd6ea9aced119ce57a04d73482ec68..8a31e989b289a4c3c8c87310be2cd9041f3557f3 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -15,8 +15,6 @@ #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H #include "AddressPool.h" -#include "DbgEntityHistoryCalculator.h" -#include "DebugHandlerBase.h" #include "DebugLocStream.h" #include "DwarfFile.h" #include "llvm/ADT/ArrayRef.h" @@ -31,6 +29,8 @@ #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/AccelTable.h" +#include "llvm/CodeGen/DbgEntityHistoryCalculator.h" +#include "llvm/CodeGen/DebugHandlerBase.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" @@ -540,6 +540,8 @@ class DwarfDebug : public DebugHandlerBase { /// Create new DwarfCompileUnit for the given metadata node with tag /// DW_TAG_compile_unit. DwarfCompileUnit &getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit); + void finishUnitAttributes(const DICompileUnit *DIUnit, + DwarfCompileUnit &NewCU); /// Construct imported_module or imported_declaration DIE. void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU, diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp index 4e410bb49beab02e459d05c33eacf058fc4405fb..78ccad4814111b77bfff2801c6d786c19e70fe63 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp @@ -39,13 +39,17 @@ void DwarfFile::emitUnit(DwarfUnit *TheU, bool UseOffsets) { if (TheU->getCUNode()->isDebugDirectivesOnly()) return; - DIE &Die = TheU->getUnitDie(); - MCSection *USection = TheU->getSection(); - Asm->OutStreamer->SwitchSection(USection); + MCSection *S = TheU->getSection(); + if (!S) + return; + + Asm->OutStreamer->SwitchSection(S); TheU->emitHeader(UseOffsets); + Asm->emitDwarfDIE(TheU->getUnitDie()); - Asm->emitDwarfDIE(Die); + if (MCSymbol *EndLabel = TheU->getEndLabel()) + Asm->OutStreamer->EmitLabel(EndLabel); } // Compute the size and offset for each DIE. diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 2053395808f1bcf1f16e5fb2d349a8eeb4d165ee..991dac7cf35b0e96f6d5476abfd2334d26e031e9 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1553,7 +1553,14 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) { void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) { // Emit size of content not including length itself Asm->OutStreamer->AddComment("Length of Unit"); - Asm->emitInt32(getHeaderSize() + getUnitDie().getSize()); + if (!DD->useSectionsAsReferences()) { + StringRef Prefix = isDwoUnit() ? "debug_info_dwo_" : "debug_info_"; + MCSymbol *BeginLabel = Asm->createTempSymbol(Prefix + "start"); + EndLabel = Asm->createTempSymbol(Prefix + "end"); + Asm->EmitLabelDifference(EndLabel, BeginLabel, 4); + Asm->OutStreamer->EmitLabel(BeginLabel); + } else + Asm->emitInt32(getHeaderSize() + getUnitDie().getSize()); Asm->OutStreamer->AddComment("DWARF version number"); unsigned Version = DD->getDwarfVersion(); @@ -1664,12 +1671,3 @@ void DwarfUnit::addLoclistsBase() { DU->getLoclistsTableBaseSym(), TLOF.getDwarfLoclistsSection()->getBeginSymbol()); } - -void DwarfUnit::addAddrTableBase() { - const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); - MCSymbol *Label = DD->getAddressPool().getLabel(); - addSectionLabel(getUnitDie(), - getDwarfVersion() >= 5 ? dwarf::DW_AT_addr_base - : dwarf::DW_AT_GNU_addr_base, - Label, TLOF.getDwarfAddrSection()->getBeginSymbol()); -} diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index 860d165318430a9611bd09a569b41fbc2c1a9498..a59ebb7c1465c93c7cd27f0a0841408de4fe7722 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -49,6 +49,9 @@ protected: /// Target of Dwarf emission. AsmPrinter *Asm; + /// Emitted at the end of the CU and used to compute the CU Length field. + MCSymbol *EndLabel = nullptr; + // Holders for some common dwarf information. DwarfDebug *DD; DwarfFile *DU; @@ -82,6 +85,7 @@ protected: public: // Accessors. AsmPrinter* getAsmPrinter() const { return Asm; } + MCSymbol *getEndLabel() const { return EndLabel; } uint16_t getLanguage() const { return CUNode->getSourceLanguage(); } const DICompileUnit *getCUNode() const { return CUNode; } @@ -275,9 +279,6 @@ public: /// Add the DW_AT_loclists_base attribute to the unit DIE. void addLoclistsBase(); - /// Add the DW_AT_addr_base attribute to the unit DIE. - void addAddrTableBase(); - virtual DwarfCompileUnit &getCU() = 0; void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy); diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h index e3a6f8e9d587268e821f716a0ef2871ec04c5745..ce912d032c6dac48d50709b223f3b89dc5a1125f 100644 --- a/lib/CodeGen/AsmPrinter/EHStreamer.h +++ b/lib/CodeGen/AsmPrinter/EHStreamer.h @@ -14,8 +14,8 @@ #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H #define LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H -#include "AsmPrinterHandler.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/AsmPrinterHandler.h" #include "llvm/Support/Compiler.h" namespace llvm { diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp index 49cc376fcc9837acce8e2239a54f9bfcf8ea8a0e..34677ecc9e6945212e313b03249ad1cc1e1d35e4 100644 --- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp @@ -15,10 +15,10 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/BuiltinGCs.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" #include "llvm/CodeGen/GCStrategy.h" -#include "llvm/CodeGen/GCs.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index 59a57ed30d105104ec6c481494f476d1c4a0b079..3479a00def23b412b35f439e9e416c79d6b744bb 100644 --- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -15,9 +15,9 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/BuiltinGCs.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" -#include "llvm/CodeGen/GCs.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/Mangler.h" diff --git a/lib/CodeGen/AsmPrinter/WasmException.cpp b/lib/CodeGen/AsmPrinter/WasmException.cpp index 46745d08c9f3b7103de89ed30ddf05833c8c4cfc..527e5ae50146a5d98bbbe6d4cd34fa8d9d043c54 100644 --- a/lib/CodeGen/AsmPrinter/WasmException.cpp +++ b/lib/CodeGen/AsmPrinter/WasmException.cpp @@ -13,9 +13,25 @@ //===----------------------------------------------------------------------===// #include "WasmException.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" using namespace llvm; +void WasmException::endModule() { + // This is the symbol used in 'throw' and 'if_except' instruction to denote + // this is a C++ exception. This symbol has to be emitted somewhere once in + // the module. Check if the symbol has already been created, i.e., we have at + // least one 'throw' or 'if_except' instruction in the module, and emit the + // symbol only if so. + SmallString<60> NameStr; + Mangler::getNameWithPrefix(NameStr, "__cpp_exception", Asm->getDataLayout()); + if (Asm->OutContext.lookupSymbol(NameStr)) { + MCSymbol *ExceptionSym = Asm->GetExternalSymbolSymbol("__cpp_exception"); + Asm->OutStreamer->EmitLabel(ExceptionSym); + } +} + void WasmException::markFunctionEnd() { // Get rid of any dead landing pads. if (!Asm->MF->getLandingPads().empty()) { diff --git a/lib/CodeGen/AsmPrinter/WasmException.h b/lib/CodeGen/AsmPrinter/WasmException.h index 09a9a25ce8d0027564e8867f3b6e55801014df68..cbdb42457cf80eaf75eb19b303365a310223e4b6 100644 --- a/lib/CodeGen/AsmPrinter/WasmException.h +++ b/lib/CodeGen/AsmPrinter/WasmException.h @@ -24,7 +24,7 @@ class LLVM_LIBRARY_VISIBILITY WasmException : public EHStreamer { public: WasmException(AsmPrinter *A) : EHStreamer(A) {} - void endModule() override {} + void endModule() override; void beginFunction(const MachineFunction *MF) override {} virtual void markFunctionEnd() override; void endFunction(const MachineFunction *MF) override; diff --git a/lib/CodeGen/AsmPrinter/WinCFGuard.h b/lib/CodeGen/AsmPrinter/WinCFGuard.h index 124e8f04bfad8506beb2a9420cfcdad4232f6af8..28f119e359661f4d2ca89eaeddb9c716d2164f0f 100644 --- a/lib/CodeGen/AsmPrinter/WinCFGuard.h +++ b/lib/CodeGen/AsmPrinter/WinCFGuard.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WINCFGUARD_H #define LLVM_LIB_CODEGEN_ASMPRINTER_WINCFGUARD_H -#include "AsmPrinterHandler.h" +#include "llvm/CodeGen/AsmPrinterHandler.h" #include "llvm/Support/Compiler.h" namespace llvm { diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index c1b9879401527c1b48b7e9ac95a036d2a96faf77..95581c09dd1c46d5487ae2bd60b21fe1f40292cf 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -91,6 +91,7 @@ namespace { AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI); void expandPartwordCmpXchg(AtomicCmpXchgInst *I); void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI); + void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI); AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI); static Value *insertRMWCmpXchgLoop( @@ -944,6 +945,35 @@ void AtomicExpand::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) { AI->eraseFromParent(); } +void AtomicExpand::expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI) { + IRBuilder<> Builder(CI); + + PartwordMaskValues PMV = createMaskInstrs( + Builder, CI, CI->getCompareOperand()->getType(), CI->getPointerOperand(), + TLI->getMinCmpXchgSizeInBits() / 8); + + Value *CmpVal_Shifted = Builder.CreateShl( + Builder.CreateZExt(CI->getCompareOperand(), PMV.WordType), PMV.ShiftAmt, + "CmpVal_Shifted"); + Value *NewVal_Shifted = Builder.CreateShl( + Builder.CreateZExt(CI->getNewValOperand(), PMV.WordType), PMV.ShiftAmt, + "NewVal_Shifted"); + Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic( + Builder, CI, PMV.AlignedAddr, CmpVal_Shifted, NewVal_Shifted, PMV.Mask, + CI->getSuccessOrdering()); + Value *FinalOldVal = Builder.CreateTrunc( + Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType); + + Value *Res = UndefValue::get(CI->getType()); + Res = Builder.CreateInsertValue(Res, FinalOldVal, 0); + Value *Success = Builder.CreateICmpEQ( + CmpVal_Shifted, Builder.CreateAnd(OldVal, PMV.Mask), "Success"); + Res = Builder.CreateInsertValue(Res, Success, 1); + + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); +} + Value *AtomicExpand::insertRMWLLSCLoop( IRBuilder<> &Builder, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder, @@ -1366,8 +1396,8 @@ bool AtomicExpand::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) { return expandAtomicCmpXchg(CI); } case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic: - llvm_unreachable( - "MaskedIntrinsic expansion of cmpxhg not yet implemented"); + expandAtomicCmpXchgToMaskedIntrinsic(CI); + return true; } } diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index fa027eedd4d95979b6e88d21c6d3fc54178a8f6a..efbfd5f4ab2c582e9b06674810eb0111de99ef0f 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -298,7 +298,7 @@ static unsigned HashEndOfMBB(const MachineBasicBlock &MBB) { /// Whether MI should be counted as an instruction when calculating common tail. static bool countsAsInstruction(const MachineInstr &MI) { - return !(MI.isDebugValue() || MI.isCFIInstruction()); + return !(MI.isDebugInstr() || MI.isCFIInstruction()); } /// ComputeCommonTailLength - Given two machine basic blocks, compute the number @@ -1363,9 +1363,9 @@ static void copyDebugInfoToPredecessor(const TargetInstrInfo *TII, MachineBasicBlock &PredMBB) { auto InsertBefore = PredMBB.getFirstTerminator(); for (MachineInstr &MI : MBB.instrs()) - if (MI.isDebugValue()) { + if (MI.isDebugInstr()) { TII->duplicate(PredMBB, InsertBefore, MI); - LLVM_DEBUG(dbgs() << "Copied debug value from empty block to pred: " + LLVM_DEBUG(dbgs() << "Copied debug entity from empty block to pred: " << MI); } } @@ -1375,9 +1375,9 @@ static void copyDebugInfoToSuccessor(const TargetInstrInfo *TII, MachineBasicBlock &SuccMBB) { auto InsertBefore = SuccMBB.SkipPHIsAndLabels(SuccMBB.begin()); for (MachineInstr &MI : MBB.instrs()) - if (MI.isDebugValue()) { + if (MI.isDebugInstr()) { TII->duplicate(SuccMBB, InsertBefore, MI); - LLVM_DEBUG(dbgs() << "Copied debug value from empty block to succ: " + LLVM_DEBUG(dbgs() << "Copied debug entity from empty block to succ: " << MI); } } diff --git a/lib/CodeGen/BuiltinGCs.cpp b/lib/CodeGen/BuiltinGCs.cpp index 3a9b20aa661d64dcf0b59d63c86e4d196e01df5e..93939e573b7baa41052cf40c6be5c82cb8b6b753 100644 --- a/lib/CodeGen/BuiltinGCs.cpp +++ b/lib/CodeGen/BuiltinGCs.cpp @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/BuiltinGCs.h" #include "llvm/CodeGen/GCStrategy.h" -#include "llvm/CodeGen/GCs.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/Casting.h" @@ -28,10 +28,8 @@ namespace { class ErlangGC : public GCStrategy { public: ErlangGC() { - InitRoots = false; - NeededSafePoints = 1 << GC::PostCall; + NeededSafePoints = true; UsesMetadata = true; - CustomRoots = false; } }; @@ -41,7 +39,7 @@ public: class OcamlGC : public GCStrategy { public: OcamlGC() { - NeededSafePoints = 1 << GC::PostCall; + NeededSafePoints = true; UsesMetadata = true; } }; @@ -56,10 +54,7 @@ public: /// while introducing only minor runtime overhead. class ShadowStackGC : public GCStrategy { public: - ShadowStackGC() { - InitRoots = true; - CustomRoots = true; - } + ShadowStackGC() {} }; /// A GCStrategy which serves as an example for the usage of a statepoint based @@ -74,10 +69,8 @@ public: UseStatepoints = true; // These options are all gc.root specific, we specify them so that the // gc.root lowering code doesn't run. - InitRoots = false; - NeededSafePoints = 0; + NeededSafePoints = false; UsesMetadata = false; - CustomRoots = false; } Optional isGCManagedPointer(const Type *Ty) const override { @@ -108,10 +101,8 @@ public: UseStatepoints = true; // These options are all gc.root specific, we specify them so that the // gc.root lowering code doesn't run. - InitRoots = false; - NeededSafePoints = 0; + NeededSafePoints = false; UsesMetadata = false; - CustomRoots = false; } Optional isGCManagedPointer(const Type *Ty) const override { @@ -136,9 +127,5 @@ static GCRegistry::Add D("statepoint-example", "an example strategy for statepoint"); static GCRegistry::Add E("coreclr", "CoreCLR-compatible GC"); -// Provide hooks to ensure the containing library is fully loaded. -void llvm::linkErlangGC() {} -void llvm::linkOcamlGC() {} -void llvm::linkShadowStackGC() {} -void llvm::linkStatepointExampleGC() {} -void llvm::linkCoreCLRGC() {} +// Provide hook to ensure the containing library is fully loaded. +void llvm::linkAllBuiltinGCs() {} diff --git a/lib/CodeGen/CFIInstrInserter.cpp b/lib/CodeGen/CFIInstrInserter.cpp index 4fd119430cff8d335b0f53091f90b99a803daa23..c4799855a2b30eed983974b647511ea9a692da4d 100644 --- a/lib/CodeGen/CFIInstrInserter.cpp +++ b/lib/CodeGen/CFIInstrInserter.cpp @@ -207,6 +207,7 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) { case MCCFIInstruction::OpUndefined: case MCCFIInstruction::OpRegister: case MCCFIInstruction::OpWindowSave: + case MCCFIInstruction::OpNegateRAState: case MCCFIInstruction::OpGnuArgsSize: break; } diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index fbdc511eea7fa8542d0ffb46b7b42169ec65f0c8..e76f9f8ed4e7c90202ff9e6f7685a1fec364a13c 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -39,6 +39,7 @@ add_llvm_library(LLVMCodeGen InlineSpiller.cpp InterferenceCache.cpp InterleavedAccessPass.cpp + InterleavedLoadCombinePass.cpp IntrinsicLowering.cpp LatencyPriorityQueue.cpp LazyMachineBlockFrequencyInfo.cpp @@ -83,7 +84,6 @@ add_llvm_library(LLVMCodeGen MachineOperand.cpp MachineOptimizationRemarkEmitter.cpp MachineOutliner.cpp - MachinePassRegistry.cpp MachinePipeliner.cpp MachinePostDominators.cpp MachineRegionInfo.cpp diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index 2f845354c57027832713d2f55339e7a29c95f2fb..66166482c78bcd7ca31566a3f7b339c8cf929a61 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -42,6 +42,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeIfConverterPass(Registry); initializeImplicitNullChecksPass(Registry); initializeIndirectBrExpandPassPass(Registry); + initializeInterleavedLoadCombinePass(Registry); initializeInterleavedAccessPass(Registry); initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 651873bb91182a1636f9f7278f6828c995c47bc1..392e8bc0db48021a640b87c392969a4343d67846 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -416,7 +416,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { OptSize = F.optForSize(); ProfileSummaryInfo *PSI = - getAnalysis().getPSI(); + &getAnalysis().getPSI(); if (ProfileGuidedSectionPrefix) { if (PSI->isFunctionHotInCallGraph(&F, *BFI)) F.setSectionPrefix(".hot"); @@ -2362,6 +2362,8 @@ class TypePromotionTransaction { /// Keep track of the original uses (pair Instruction, Index). SmallVector OriginalUses; + /// Keep track of the debug users. + SmallVector DbgValues; using use_iterator = SmallVectorImpl::iterator; @@ -2375,6 +2377,10 @@ class TypePromotionTransaction { Instruction *UserI = cast(U.getUser()); OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo())); } + // Record the debug uses separately. They are not in the instruction's + // use list, but they are replaced by RAUW. + findDbgValues(DbgValues, Inst); + // Now, we can replace the uses. Inst->replaceAllUsesWith(New); } @@ -2387,6 +2393,15 @@ class TypePromotionTransaction { UseIt != EndIt; ++UseIt) { UseIt->Inst->setOperand(UseIt->Idx, Inst); } + // RAUW has replaced all original uses with references to the new value, + // including the debug uses. Since we are undoing the replacements, + // the original debug uses must also be reinstated to maintain the + // correctness and utility of debug value instructions. + for (auto *DVI: DbgValues) { + LLVMContext &Ctx = Inst->getType()->getContext(); + auto *MV = MetadataAsValue::get(Ctx, ValueAsMetadata::get(Inst)); + DVI->setOperand(0, MV); + } } }; @@ -2657,15 +2672,159 @@ private: Value *PromotedOperand) const; }; +class PhiNodeSet; + +/// An iterator for PhiNodeSet. +class PhiNodeSetIterator { + PhiNodeSet * const Set; + size_t CurrentIndex = 0; + +public: + /// The constructor. Start should point to either a valid element, or be equal + /// to the size of the underlying SmallVector of the PhiNodeSet. + PhiNodeSetIterator(PhiNodeSet * const Set, size_t Start); + PHINode * operator*() const; + PhiNodeSetIterator& operator++(); + bool operator==(const PhiNodeSetIterator &RHS) const; + bool operator!=(const PhiNodeSetIterator &RHS) const; +}; + +/// Keeps a set of PHINodes. +/// +/// This is a minimal set implementation for a specific use case: +/// It is very fast when there are very few elements, but also provides good +/// performance when there are many. It is similar to SmallPtrSet, but also +/// provides iteration by insertion order, which is deterministic and stable +/// across runs. It is also similar to SmallSetVector, but provides removing +/// elements in O(1) time. This is achieved by not actually removing the element +/// from the underlying vector, so comes at the cost of using more memory, but +/// that is fine, since PhiNodeSets are used as short lived objects. +class PhiNodeSet { + friend class PhiNodeSetIterator; + + using MapType = SmallDenseMap; + using iterator = PhiNodeSetIterator; + + /// Keeps the elements in the order of their insertion in the underlying + /// vector. To achieve constant time removal, it never deletes any element. + SmallVector NodeList; + + /// Keeps the elements in the underlying set implementation. This (and not the + /// NodeList defined above) is the source of truth on whether an element + /// is actually in the collection. + MapType NodeMap; + + /// Points to the first valid (not deleted) element when the set is not empty + /// and the value is not zero. Equals to the size of the underlying vector + /// when the set is empty. When the value is 0, as in the beginning, the + /// first element may or may not be valid. + size_t FirstValidElement = 0; + +public: + /// Inserts a new element to the collection. + /// \returns true if the element is actually added, i.e. was not in the + /// collection before the operation. + bool insert(PHINode *Ptr) { + if (NodeMap.insert(std::make_pair(Ptr, NodeList.size())).second) { + NodeList.push_back(Ptr); + return true; + } + return false; + } + + /// Removes the element from the collection. + /// \returns whether the element is actually removed, i.e. was in the + /// collection before the operation. + bool erase(PHINode *Ptr) { + auto it = NodeMap.find(Ptr); + if (it != NodeMap.end()) { + NodeMap.erase(Ptr); + SkipRemovedElements(FirstValidElement); + return true; + } + return false; + } + + /// Removes all elements and clears the collection. + void clear() { + NodeMap.clear(); + NodeList.clear(); + FirstValidElement = 0; + } + + /// \returns an iterator that will iterate the elements in the order of + /// insertion. + iterator begin() { + if (FirstValidElement == 0) + SkipRemovedElements(FirstValidElement); + return PhiNodeSetIterator(this, FirstValidElement); + } + + /// \returns an iterator that points to the end of the collection. + iterator end() { return PhiNodeSetIterator(this, NodeList.size()); } + + /// Returns the number of elements in the collection. + size_t size() const { + return NodeMap.size(); + } + + /// \returns 1 if the given element is in the collection, and 0 if otherwise. + size_t count(PHINode *Ptr) const { + return NodeMap.count(Ptr); + } + +private: + /// Updates the CurrentIndex so that it will point to a valid element. + /// + /// If the element of NodeList at CurrentIndex is valid, it does not + /// change it. If there are no more valid elements, it updates CurrentIndex + /// to point to the end of the NodeList. + void SkipRemovedElements(size_t &CurrentIndex) { + while (CurrentIndex < NodeList.size()) { + auto it = NodeMap.find(NodeList[CurrentIndex]); + // If the element has been deleted and added again later, NodeMap will + // point to a different index, so CurrentIndex will still be invalid. + if (it != NodeMap.end() && it->second == CurrentIndex) + break; + ++CurrentIndex; + } + } +}; + +PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start) + : Set(Set), CurrentIndex(Start) {} + +PHINode * PhiNodeSetIterator::operator*() const { + assert(CurrentIndex < Set->NodeList.size() && + "PhiNodeSet access out of range"); + return Set->NodeList[CurrentIndex]; +} + +PhiNodeSetIterator& PhiNodeSetIterator::operator++() { + assert(CurrentIndex < Set->NodeList.size() && + "PhiNodeSet access out of range"); + ++CurrentIndex; + Set->SkipRemovedElements(CurrentIndex); + return *this; +} + +bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const { + return CurrentIndex == RHS.CurrentIndex; +} + +bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const { + return !((*this) == RHS); +} + /// Keep track of simplification of Phi nodes. /// Accept the set of all phi nodes and erase phi node from this set /// if it is simplified. class SimplificationTracker { DenseMap Storage; const SimplifyQuery &SQ; - // Tracks newly created Phi nodes. We use a SetVector to get deterministic - // order when iterating over the set in MatchPhiSet. - SmallSetVector AllPhiNodes; + // Tracks newly created Phi nodes. The elements are iterated by insertion + // order. + PhiNodeSet AllPhiNodes; // Tracks newly created Select nodes. SmallPtrSet AllSelectNodes; @@ -2697,7 +2856,7 @@ public: Put(PI, V); PI->replaceAllUsesWith(V); if (auto *PHI = dyn_cast(PI)) - AllPhiNodes.remove(PHI); + AllPhiNodes.erase(PHI); if (auto *Select = dyn_cast(PI)) AllSelectNodes.erase(Select); PI->eraseFromParent(); @@ -2720,11 +2879,11 @@ public: assert(Get(To) == To && "Replacement PHI node is already replaced."); Put(From, To); From->replaceAllUsesWith(To); - AllPhiNodes.remove(From); + AllPhiNodes.erase(From); From->eraseFromParent(); } - SmallSetVector& newPhiNodes() { return AllPhiNodes; } + PhiNodeSet& newPhiNodes() { return AllPhiNodes; } void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); } @@ -2752,8 +2911,7 @@ public: /// A helper class for combining addressing modes. class AddressingModeCombiner { - typedef std::pair ValueInBB; - typedef DenseMap FoldAddrToValueMapping; + typedef DenseMap FoldAddrToValueMapping; typedef std::pair PHIPair; private: @@ -2773,10 +2931,10 @@ private: const SimplifyQuery &SQ; /// Original Address. - ValueInBB Original; + Value *Original; public: - AddressingModeCombiner(const SimplifyQuery &_SQ, ValueInBB OriginalValue) + AddressingModeCombiner(const SimplifyQuery &_SQ, Value *OriginalValue) : CommonType(nullptr), SQ(_SQ), Original(OriginalValue) {} /// Get the combined AddrMode @@ -2872,46 +3030,40 @@ public: } private: - /// Initialize Map with anchor values. For address seen in some BB + /// Initialize Map with anchor values. For address seen /// we set the value of different field saw in this address. - /// If address is not an instruction than basic block is set to null. /// At the same time we find a common type for different field we will /// use to create new Phi/Select nodes. Keep it in CommonType field. /// Return false if there is no common type found. bool initializeMap(FoldAddrToValueMapping &Map) { // Keep track of keys where the value is null. We will need to replace it // with constant null when we know the common type. - SmallVector NullValue; + SmallVector NullValue; Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType()); for (auto &AM : AddrModes) { - BasicBlock *BB = nullptr; - if (Instruction *I = dyn_cast(AM.OriginalValue)) - BB = I->getParent(); - Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy); if (DV) { auto *Type = DV->getType(); if (CommonType && CommonType != Type) return false; CommonType = Type; - Map[{ AM.OriginalValue, BB }] = DV; + Map[AM.OriginalValue] = DV; } else { - NullValue.push_back({ AM.OriginalValue, BB }); + NullValue.push_back(AM.OriginalValue); } } assert(CommonType && "At least one non-null value must be!"); - for (auto VIBB : NullValue) - Map[VIBB] = Constant::getNullValue(CommonType); + for (auto *V : NullValue) + Map[V] = Constant::getNullValue(CommonType); return true; } - /// We have mapping between value A and basic block where value A - /// seen to other value B where B was a field in addressing mode represented - /// by A. Also we have an original value C representing an address in some - /// basic block. Traversing from C through phi and selects we ended up with - /// A's in a map. This utility function tries to find a value V which is a - /// field in addressing mode C and traversing through phi nodes and selects - /// we will end up in corresponded values B in a map. + /// We have mapping between value A and other value B where B was a field in + /// addressing mode represented by A. Also we have an original value C + /// representing an address we start with. Traversing from C through phi and + /// selects we ended up with A's in a map. This utility function tries to find + /// a value V which is a field in addressing mode C and traversing through phi + /// nodes and selects we will end up in corresponded values B in a map. /// The utility will create a new Phi/Selects if needed. // The simple example looks as follows: // BB1: @@ -2924,22 +3076,24 @@ private: // p = phi [p1, BB1], [p2, BB2] // v = load p // Map is - // -> b1 - // -> b2 + // p1 -> b1 + // p2 -> b2 // Request is - // -> ? - // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3 + // p -> ? + // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3. Value *findCommon(FoldAddrToValueMapping &Map) { // Tracks the simplification of newly created phi nodes. The reason we use // this mapping is because we will add new created Phi nodes in AddrToBase. // Simplification of Phi nodes is recursive, so some Phi node may - // be simplified after we added it to AddrToBase. + // be simplified after we added it to AddrToBase. In reality this + // simplification is possible only if original phi/selects were not + // simplified yet. // Using this mapping we can find the current value in AddrToBase. SimplificationTracker ST(SQ); // First step, DFS to create PHI nodes for all intermediate blocks. // Also fill traverse order for the second step. - SmallVector TraverseOrder; + SmallVector TraverseOrder; InsertPlaceholders(Map, TraverseOrder, ST); // Second Step, fill new nodes by merged values and simplify if possible. @@ -2969,7 +3123,7 @@ private: /// Matcher tracks the matched Phi nodes. bool MatchPhiNode(PHINode *PHI, PHINode *Candidate, SmallSetVector &Matcher, - SmallSetVector &PhiNodesToMatch) { + PhiNodeSet &PhiNodesToMatch) { SmallVector WorkList; Matcher.insert({ PHI, Candidate }); WorkList.push_back({ PHI, Candidate }); @@ -3018,11 +3172,12 @@ private: /// Returns false if this matching fails and creation of new Phi is disabled. bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes, unsigned &PhiNotMatchedCount) { - // Use a SetVector for Matched to make sure we do replacements (ReplacePhi) - // in a deterministic order below. + // Matched and PhiNodesToMatch iterate their elements in a deterministic + // order, so the replacements (ReplacePhi) are also done in a deterministic + // order. SmallSetVector Matched; SmallPtrSet WillNotMatch; - SmallSetVector &PhiNodesToMatch = ST.newPhiNodes(); + PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes(); while (PhiNodesToMatch.size()) { PHINode *PHI = *PhiNodesToMatch.begin(); @@ -3057,129 +3212,86 @@ private: // Just remove all seen values in matcher. They will not match anything. PhiNotMatchedCount += WillNotMatch.size(); for (auto *P : WillNotMatch) - PhiNodesToMatch.remove(P); + PhiNodesToMatch.erase(P); } return true; } - /// Fill the placeholder with values from predecessors and simplify it. + /// Fill the placeholders with values from predecessors and simplify them. void FillPlaceholders(FoldAddrToValueMapping &Map, - SmallVectorImpl &TraverseOrder, + SmallVectorImpl &TraverseOrder, SimplificationTracker &ST) { while (!TraverseOrder.empty()) { - auto Current = TraverseOrder.pop_back_val(); + Value *Current = TraverseOrder.pop_back_val(); assert(Map.find(Current) != Map.end() && "No node to fill!!!"); - Value *CurrentValue = Current.first; - BasicBlock *CurrentBlock = Current.second; Value *V = Map[Current]; if (SelectInst *Select = dyn_cast(V)) { // CurrentValue also must be Select. - auto *CurrentSelect = cast(CurrentValue); + auto *CurrentSelect = cast(Current); auto *TrueValue = CurrentSelect->getTrueValue(); - ValueInBB TrueItem = { TrueValue, isa(TrueValue) - ? CurrentBlock - : nullptr }; - assert(Map.find(TrueItem) != Map.end() && "No True Value!"); - Select->setTrueValue(ST.Get(Map[TrueItem])); + assert(Map.find(TrueValue) != Map.end() && "No True Value!"); + Select->setTrueValue(ST.Get(Map[TrueValue])); auto *FalseValue = CurrentSelect->getFalseValue(); - ValueInBB FalseItem = { FalseValue, isa(FalseValue) - ? CurrentBlock - : nullptr }; - assert(Map.find(FalseItem) != Map.end() && "No False Value!"); - Select->setFalseValue(ST.Get(Map[FalseItem])); + assert(Map.find(FalseValue) != Map.end() && "No False Value!"); + Select->setFalseValue(ST.Get(Map[FalseValue])); } else { // Must be a Phi node then. PHINode *PHI = cast(V); + auto *CurrentPhi = dyn_cast(Current); // Fill the Phi node with values from predecessors. - bool IsDefinedInThisBB = - cast(CurrentValue)->getParent() == CurrentBlock; - auto *CurrentPhi = dyn_cast(CurrentValue); - for (auto B : predecessors(CurrentBlock)) { - Value *PV = IsDefinedInThisBB - ? CurrentPhi->getIncomingValueForBlock(B) - : CurrentValue; - ValueInBB item = { PV, isa(PV) ? B : nullptr }; - assert(Map.find(item) != Map.end() && "No predecessor Value!"); - PHI->addIncoming(ST.Get(Map[item]), B); + for (auto B : predecessors(PHI->getParent())) { + Value *PV = CurrentPhi->getIncomingValueForBlock(B); + assert(Map.find(PV) != Map.end() && "No predecessor Value!"); + PHI->addIncoming(ST.Get(Map[PV]), B); } } - // Simplify if possible. Map[Current] = ST.Simplify(V); } } - /// Starting from value recursively iterates over predecessors up to known - /// ending values represented in a map. For each traversed block inserts - /// a placeholder Phi or Select. + /// Starting from original value recursively iterates over def-use chain up to + /// known ending values represented in a map. For each traversed phi/select + /// inserts a placeholder Phi or Select. /// Reports all new created Phi/Select nodes by adding them to set. - /// Also reports and order in what basic blocks have been traversed. + /// Also reports and order in what values have been traversed. void InsertPlaceholders(FoldAddrToValueMapping &Map, - SmallVectorImpl &TraverseOrder, + SmallVectorImpl &TraverseOrder, SimplificationTracker &ST) { - SmallVector Worklist; - assert((isa(Original.first) || isa(Original.first)) && + SmallVector Worklist; + assert((isa(Original) || isa(Original)) && "Address must be a Phi or Select node"); auto *Dummy = UndefValue::get(CommonType); Worklist.push_back(Original); while (!Worklist.empty()) { - auto Current = Worklist.pop_back_val(); - // If value is not an instruction it is something global, constant, - // parameter and we can say that this value is observable in any block. - // Set block to null to denote it. - // Also please take into account that it is how we build anchors. - if (!isa(Current.first)) - Current.second = nullptr; + Value *Current = Worklist.pop_back_val(); // if it is already visited or it is an ending value then skip it. if (Map.find(Current) != Map.end()) continue; TraverseOrder.push_back(Current); - Value *CurrentValue = Current.first; - BasicBlock *CurrentBlock = Current.second; // CurrentValue must be a Phi node or select. All others must be covered // by anchors. - Instruction *CurrentI = cast(CurrentValue); - bool IsDefinedInThisBB = CurrentI->getParent() == CurrentBlock; - - unsigned PredCount = pred_size(CurrentBlock); - // if Current Value is not defined in this basic block we are interested - // in values in predecessors. - if (!IsDefinedInThisBB) { - assert(PredCount && "Unreachable block?!"); - PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi", - &CurrentBlock->front()); - Map[Current] = PHI; - ST.insertNewPhi(PHI); - // Add all predecessors in work list. - for (auto B : predecessors(CurrentBlock)) - Worklist.push_back({ CurrentValue, B }); - continue; - } - // Value is defined in this basic block. - if (SelectInst *OrigSelect = dyn_cast(CurrentI)) { + if (SelectInst *CurrentSelect = dyn_cast(Current)) { // Is it OK to get metadata from OrigSelect?! // Create a Select placeholder with dummy value. - SelectInst *Select = - SelectInst::Create(OrigSelect->getCondition(), Dummy, Dummy, - OrigSelect->getName(), OrigSelect, OrigSelect); + SelectInst *Select = SelectInst::Create( + CurrentSelect->getCondition(), Dummy, Dummy, + CurrentSelect->getName(), CurrentSelect, CurrentSelect); Map[Current] = Select; ST.insertNewSelect(Select); - // We are interested in True and False value in this basic block. - Worklist.push_back({ OrigSelect->getTrueValue(), CurrentBlock }); - Worklist.push_back({ OrigSelect->getFalseValue(), CurrentBlock }); + // We are interested in True and False values. + Worklist.push_back(CurrentSelect->getTrueValue()); + Worklist.push_back(CurrentSelect->getFalseValue()); } else { // It must be a Phi node then. - auto *CurrentPhi = cast(CurrentI); - // Create new Phi node for merge of bases. - assert(PredCount && "Unreachable block?!"); - PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi", - &CurrentBlock->front()); + PHINode *CurrentPhi = cast(Current); + unsigned PredCount = CurrentPhi->getNumIncomingValues(); + PHINode *PHI = + PHINode::Create(CommonType, PredCount, "sunk_phi", CurrentPhi); Map[Current] = PHI; ST.insertNewPhi(PHI); - - // Add all predecessors in work list. - for (auto B : predecessors(CurrentBlock)) - Worklist.push_back({ CurrentPhi->getIncomingValueForBlock(B), B }); + for (Value *P : CurrentPhi->incoming_values()) + Worklist.push_back(P); } } } @@ -4398,7 +4510,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, bool PhiOrSelectSeen = false; SmallVector AddrModeInsts; const SimplifyQuery SQ(*DL, TLInfo); - AddressingModeCombiner AddrModes(SQ, { Addr, MemoryInst->getParent() }); + AddressingModeCombiner AddrModes(SQ, Addr); TypePromotionTransaction TPT(RemovedInsts); TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp index fe3d29657942612c7b684b2ce37824c1adf0d469..1c80556dfef57943142f6cfe8c59b3af1649a4c8 100644 --- a/lib/CodeGen/GCMetadata.cpp +++ b/lib/CodeGen/GCMetadata.cpp @@ -103,16 +103,6 @@ void Printer::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); } -static const char *DescKind(GC::PointKind Kind) { - switch (Kind) { - case GC::PreCall: - return "pre-call"; - case GC::PostCall: - return "post-call"; - } - llvm_unreachable("Invalid point kind"); -} - bool Printer::runOnFunction(Function &F) { if (F.hasGC()) return false; @@ -129,7 +119,7 @@ bool Printer::runOnFunction(Function &F) { for (GCFunctionInfo::iterator PI = FD->begin(), PE = FD->end(); PI != PE; ++PI) { - OS << "\t" << PI->Label->getName() << ": " << DescKind(PI->Kind) + OS << "\t" << PI->Label->getName() << ": " << "post-call" << ", live = {"; for (GCFunctionInfo::live_iterator RI = FD->live_begin(PI), diff --git a/lib/CodeGen/GCRootLowering.cpp b/lib/CodeGen/GCRootLowering.cpp index 31ddeadbd97a13e1576913d346e88f89bbb72ef0..e8ccd84b0b93e4ebe0c8798e49500c78d4916003 100644 --- a/lib/CodeGen/GCRootLowering.cpp +++ b/lib/CodeGen/GCRootLowering.cpp @@ -38,7 +38,7 @@ namespace { /// directed by the GCStrategy. It also performs automatic root initialization /// and custom intrinsic lowering. class LowerIntrinsics : public FunctionPass { - bool PerformDefaultLowering(Function &F, GCStrategy &S); + bool DoLowering(Function &F, GCStrategy &S); public: static char ID; @@ -102,13 +102,6 @@ void LowerIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); } -static bool NeedsDefaultLoweringPass(const GCStrategy &C) { - // Default lowering is necessary only if read or write barriers have a default - // action. The default for roots is no action. - return !C.customWriteBarrier() || !C.customReadBarrier() || - C.initializeRoots(); -} - /// doInitialization - If this module uses the GC intrinsics, find them now. bool LowerIntrinsics::doInitialization(Module &M) { GCModuleInfo *MI = getAnalysisIfAvailable(); @@ -148,8 +141,7 @@ static bool CouldBecomeSafePoint(Instruction *I) { return true; } -static bool InsertRootInitializers(Function &F, AllocaInst **Roots, - unsigned Count) { +static bool InsertRootInitializers(Function &F, ArrayRef Roots) { // Scroll past alloca instructions. BasicBlock::iterator IP = F.getEntryBlock().begin(); while (isa(IP)) @@ -166,12 +158,12 @@ static bool InsertRootInitializers(Function &F, AllocaInst **Roots, // Add root initializers. bool MadeChange = false; - for (AllocaInst **I = Roots, **E = Roots + Count; I != E; ++I) - if (!InitedRoots.count(*I)) { + for (AllocaInst *Root : Roots) + if (!InitedRoots.count(Root)) { StoreInst *SI = new StoreInst( - ConstantPointerNull::get(cast((*I)->getAllocatedType())), - *I); - SI->insertAfter(*I); + ConstantPointerNull::get(cast(Root->getAllocatedType())), + Root); + SI->insertAfter(Root); MadeChange = true; } @@ -188,64 +180,59 @@ bool LowerIntrinsics::runOnFunction(Function &F) { GCFunctionInfo &FI = getAnalysis().getFunctionInfo(F); GCStrategy &S = FI.getStrategy(); - bool MadeChange = false; - - if (NeedsDefaultLoweringPass(S)) - MadeChange |= PerformDefaultLowering(F, S); - - return MadeChange; + return DoLowering(F, S); } -bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) { - bool LowerWr = !S.customWriteBarrier(); - bool LowerRd = !S.customReadBarrier(); - bool InitRoots = S.initializeRoots(); - +/// Lower barriers out of existance (if the associated GCStrategy hasn't +/// already done so...), and insert initializing stores to roots as a defensive +/// measure. Given we're going to report all roots live at all safepoints, we +/// need to be able to ensure each root has been initialized by the point the +/// first safepoint is reached. This really should have been done by the +/// frontend, but the old API made this non-obvious, so we do a potentially +/// redundant store just in case. +bool LowerIntrinsics::DoLowering(Function &F, GCStrategy &S) { SmallVector Roots; bool MadeChange = false; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { - if (IntrinsicInst *CI = dyn_cast(II++)) { - Function *F = CI->getCalledFunction(); - switch (F->getIntrinsicID()) { - case Intrinsic::gcwrite: - if (LowerWr) { - // Replace a write barrier with a simple store. - Value *St = - new StoreInst(CI->getArgOperand(0), CI->getArgOperand(2), CI); - CI->replaceAllUsesWith(St); - CI->eraseFromParent(); - } - break; - case Intrinsic::gcread: - if (LowerRd) { - // Replace a read barrier with a simple load. - Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI); - Ld->takeName(CI); - CI->replaceAllUsesWith(Ld); - CI->eraseFromParent(); - } - break; - case Intrinsic::gcroot: - if (InitRoots) { - // Initialize the GC root, but do not delete the intrinsic. The - // backend needs the intrinsic to flag the stack slot. - Roots.push_back( - cast(CI->getArgOperand(0)->stripPointerCasts())); - } - break; - default: - continue; - } - + for (BasicBlock &BB : F) + for (BasicBlock::iterator II = BB.begin(), E = BB.end(); II != E;) { + IntrinsicInst *CI = dyn_cast(II++); + if (!CI) + continue; + + Function *F = CI->getCalledFunction(); + switch (F->getIntrinsicID()) { + default: break; + case Intrinsic::gcwrite: { + // Replace a write barrier with a simple store. + Value *St = new StoreInst(CI->getArgOperand(0), + CI->getArgOperand(2), CI); + CI->replaceAllUsesWith(St); + CI->eraseFromParent(); MadeChange = true; + break; + } + case Intrinsic::gcread: { + // Replace a read barrier with a simple load. + Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI); + Ld->takeName(CI); + CI->replaceAllUsesWith(Ld); + CI->eraseFromParent(); + MadeChange = true; + break; + } + case Intrinsic::gcroot: { + // Initialize the GC root, but do not delete the intrinsic. The + // backend needs the intrinsic to flag the stack slot. + Roots.push_back( + cast(CI->getArgOperand(0)->stripPointerCasts())); + break; + } } } - } if (Roots.size()) - MadeChange |= InsertRootInitializers(F, Roots.begin(), Roots.size()); + MadeChange |= InsertRootInitializers(F, Roots); return MadeChange; } @@ -276,26 +263,18 @@ MCSymbol *GCMachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB, } void GCMachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) { - // Find the return address (next instruction), too, so as to bracket the call - // instruction. + // Find the return address (next instruction), since that's what will be on + // the stack when the call is suspended and we need to inspect the stack. MachineBasicBlock::iterator RAI = CI; ++RAI; - if (FI->getStrategy().needsSafePoint(GC::PreCall)) { - MCSymbol *Label = InsertLabel(*CI->getParent(), CI, CI->getDebugLoc()); - FI->addSafePoint(GC::PreCall, Label, CI->getDebugLoc()); - } - - if (FI->getStrategy().needsSafePoint(GC::PostCall)) { - MCSymbol *Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc()); - FI->addSafePoint(GC::PostCall, Label, CI->getDebugLoc()); - } + MCSymbol *Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc()); + FI->addSafePoint(Label, CI->getDebugLoc()); } void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) { - for (MachineFunction::iterator BBI = MF.begin(), BBE = MF.end(); BBI != BBE; - ++BBI) - for (MachineBasicBlock::iterator MI = BBI->begin(), ME = BBI->end(); + for (MachineBasicBlock &MBB : MF) + for (MachineBasicBlock::iterator MI = MBB.begin(), ME = MBB.end(); MI != ME; ++MI) if (MI->isCall()) { // Do not treat tail or sibling call sites as safe points. This is diff --git a/lib/CodeGen/GlobalISel/CMakeLists.txt b/lib/CodeGen/GlobalISel/CMakeLists.txt index 4c1da3756b1839ff05821d90fd81c83d4200d627..5f13692bbee1f454a7a442eecf377bb29826191f 100644 --- a/lib/CodeGen/GlobalISel/CMakeLists.txt +++ b/lib/CodeGen/GlobalISel/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMGlobalISel GlobalISel.cpp Combiner.cpp CombinerHelper.cpp + GISelChangeObserver.cpp IRTranslator.cpp InstructionSelect.cpp InstructionSelector.cpp diff --git a/lib/CodeGen/GlobalISel/Combiner.cpp b/lib/CodeGen/GlobalISel/Combiner.cpp index f3f075af4869598c724f9e833d3d4556e9394d58..90fd54ec244ba34b5a45ac892f4a2182dda10e85 100644 --- a/lib/CodeGen/GlobalISel/Combiner.cpp +++ b/lib/CodeGen/GlobalISel/Combiner.cpp @@ -1,4 +1,4 @@ -//===-- lib/CodeGen/GlobalISel/GICombiner.cpp -----------------------===// +//===-- lib/CodeGen/GlobalISel/Combiner.cpp -------------------------------===// // // The LLVM Compiler Infrastructure // @@ -12,12 +12,13 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/GISelWorkList.h" -#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" @@ -28,27 +29,62 @@ using namespace llvm; namespace { /// This class acts as the glue the joins the CombinerHelper to the overall /// Combine algorithm. The CombinerHelper is intended to report the -/// modifications it makes to the MIR to the CombinerChangeObserver and the +/// modifications it makes to the MIR to the GISelChangeObserver and the /// observer subclass will act on these events. In this case, instruction /// erasure will cancel any future visits to the erased instruction and /// instruction creation will schedule that instruction for a future visit. /// Other Combiner implementations may require more complex behaviour from -/// their CombinerChangeObserver subclass. -class WorkListMaintainer : public CombinerChangeObserver { +/// their GISelChangeObserver subclass. +class WorkListMaintainer : public GISelChangeObserver, + public MachineFunction::Delegate { using WorkListTy = GISelWorkList<512>; + MachineFunction &MF; WorkListTy &WorkList; + /// The instructions that have been created but we want to report once they + /// have their operands. This is only maintained if debug output is requested. + SmallPtrSet CreatedInstrs; public: - WorkListMaintainer(WorkListTy &WorkList) : WorkList(WorkList) {} - virtual ~WorkListMaintainer() {} + WorkListMaintainer(MachineFunction &MF, WorkListTy &WorkList) + : GISelChangeObserver(), MF(MF), WorkList(WorkList) { + MF.setDelegate(this); + } + virtual ~WorkListMaintainer() { + MF.resetDelegate(this); + } - void erasedInstr(MachineInstr &MI) override { - LLVM_DEBUG(dbgs() << "Erased: "; MI.print(dbgs()); dbgs() << "\n"); + void erasingInstr(const MachineInstr &MI) override { + LLVM_DEBUG(dbgs() << "Erased: " << MI << "\n"); WorkList.remove(&MI); } - void createdInstr(MachineInstr &MI) override { - LLVM_DEBUG(dbgs() << "Created: "; MI.print(dbgs()); dbgs() << "\n"); + void createdInstr(const MachineInstr &MI) override { + LLVM_DEBUG(dbgs() << "Creating: " << MI << "\n"); WorkList.insert(&MI); + LLVM_DEBUG(CreatedInstrs.insert(&MI)); + } + void changingInstr(const MachineInstr &MI) override { + LLVM_DEBUG(dbgs() << "Changing: " << MI << "\n"); + WorkList.insert(&MI); + } + void changedInstr(const MachineInstr &MI) override { + LLVM_DEBUG(dbgs() << "Changed: " << MI << "\n"); + WorkList.insert(&MI); + } + + void reportFullyCreatedInstrs() { + LLVM_DEBUG(for (const auto *MI + : CreatedInstrs) { + dbgs() << "Created: "; + MI->print(dbgs()); + }); + LLVM_DEBUG(CreatedInstrs.clear()); + } + + void MF_HandleInsertion(const MachineInstr &MI) override { + createdInstr(MI); + } + void MF_HandleRemoval(const MachineInstr &MI) override { + erasingInstr(MI); } }; } @@ -80,8 +116,8 @@ bool Combiner::combineMachineInstrs(MachineFunction &MF) { // insert with list bottom up, so while we pop_back_val, we'll traverse top // down RPOT. Changed = false; - GISelWorkList<512> WorkList; - WorkListMaintainer Observer(WorkList); + GISelWorkList<512> WorkList(&MF); + WorkListMaintainer Observer(MF, WorkList); for (MachineBasicBlock *MBB : post_order(&MF)) { if (MBB->empty()) continue; @@ -100,8 +136,9 @@ bool Combiner::combineMachineInstrs(MachineFunction &MF) { // Main Loop. Process the instructions here. while (!WorkList.empty()) { MachineInstr *CurrInst = WorkList.pop_back_val(); - LLVM_DEBUG(dbgs() << "Try combining " << *CurrInst << "\n";); + LLVM_DEBUG(dbgs() << "\nTry combining " << *CurrInst;); Changed |= CInfo.combine(Observer, *CurrInst, Builder); + Observer.reportFullyCreatedInstrs(); } MFChanged |= Changed; } while (Changed); diff --git a/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 557bc880fd6ce0421dfef8db76b601e16db334c7..b1c5670a6dec3f76398394b1785b180ec4fa2449 100644 --- a/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1,4 +1,4 @@ -//== ---lib/CodeGen/GlobalISel/GICombinerHelper.cpp --------------------- == // +//===-- lib/CodeGen/GlobalISel/GICombinerHelper.cpp -----------------------===// // // The LLVM Compiler Infrastructure // @@ -6,27 +6,44 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" -#define DEBUG_TYPE "gi-combine" +#define DEBUG_TYPE "gi-combiner" using namespace llvm; -CombinerHelper::CombinerHelper(CombinerChangeObserver &Observer, +CombinerHelper::CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B) : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer) {} -void CombinerHelper::eraseInstr(MachineInstr &MI) { - Observer.erasedInstr(MI); +void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, unsigned FromReg, + unsigned ToReg) const { + Observer.changingAllUsesOfReg(MRI, FromReg); + + if (MRI.constrainRegAttrs(ToReg, FromReg)) + MRI.replaceRegWith(FromReg, ToReg); + else + Builder.buildCopy(ToReg, FromReg); + + Observer.finishedChangingAllUsesOfReg(); } -void CombinerHelper::scheduleForVisit(MachineInstr &MI) { - Observer.createdInstr(MI); + +void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI, + MachineOperand &FromRegOp, + unsigned ToReg) const { + assert(FromRegOp.getParent() && "Expected an operand in an MI"); + Observer.changingInstr(*FromRegOp.getParent()); + + FromRegOp.setReg(ToReg); + + Observer.changedInstr(*FromRegOp.getParent()); } bool CombinerHelper::tryCombineCopy(MachineInstr &MI) { @@ -40,7 +57,7 @@ bool CombinerHelper::tryCombineCopy(MachineInstr &MI) { // a(sx) = COPY b(sx) -> Replace all uses of a with b. if (DstTy.isValid() && SrcTy.isValid() && DstTy == SrcTy) { MI.eraseFromParent(); - MRI.replaceRegWith(DstReg, SrcReg); + replaceRegWith(MRI, DstReg, SrcReg); return true; } return false; @@ -193,8 +210,11 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) { // type since by definition the result of an extend is larger. assert(Preferred.Ty != LoadValueTy && "Extending to same type?"); + LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI); + // Rewrite the load to the chosen extending load. unsigned ChosenDstReg = Preferred.MI->getOperand(0).getReg(); + Observer.changingInstr(MI); MI.setDesc( Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT ? TargetOpcode::G_SEXTLOAD @@ -213,7 +233,7 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) { if (UseMI->getOpcode() == Preferred.ExtendOpcode || UseMI->getOpcode() == TargetOpcode::G_ANYEXT) { unsigned UseDstReg = UseMI->getOperand(0).getReg(); - unsigned UseSrcReg = UseMI->getOperand(1).getReg(); + MachineOperand &UseSrcMO = UseMI->getOperand(1); const LLT &UseDstTy = MRI.getType(UseDstReg); if (UseDstReg != ChosenDstReg) { if (Preferred.Ty == UseDstTy) { @@ -226,7 +246,7 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) { // rewrites to: // %2:_(s32) = G_SEXTLOAD ... // ... = ... %2(s32) - MRI.replaceRegWith(UseDstReg, ChosenDstReg); + replaceRegWith(MRI, UseDstReg, ChosenDstReg); ScheduleForErase.push_back(UseMO.getParent()); } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) { // If the preferred size is smaller, then keep the extend but extend @@ -239,7 +259,7 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) { // %2:_(s32) = G_SEXTLOAD ... // %3:_(s64) = G_ANYEXT %2:_(s32) // ... = ... %3(s64) - MRI.replaceRegWith(UseSrcReg, ChosenDstReg); + replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg); } else { // If the preferred size is large, then insert a truncate. For // example: @@ -286,7 +306,9 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) { MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB); if (PreviouslyEmitted) { + Observer.changingInstr(*UseMO->getParent()); UseMO->setReg(PreviouslyEmitted->getOperand(0).getReg()); + Observer.changedInstr(*UseMO->getParent()); continue; } @@ -294,14 +316,14 @@ bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) { unsigned NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg()); MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg); EmittedInsns[InsertIntoBB] = NewMI; - UseMO->setReg(NewDstReg); - Observer.createdInstr(*NewMI); + replaceRegOpWith(MRI, *UseMO, NewDstReg); } for (auto &EraseMI : ScheduleForErase) { + Observer.erasingInstr(*EraseMI); EraseMI->eraseFromParent(); - Observer.erasedInstr(*EraseMI); } MI.getOperand(0).setReg(ChosenDstReg); + Observer.changedInstr(MI); return true; } diff --git a/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp b/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp new file mode 100644 index 0000000000000000000000000000000000000000..993a919826fa6a3eeb54c34ff515d874f56837ef --- /dev/null +++ b/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp @@ -0,0 +1,31 @@ +//===-- lib/CodeGen/GlobalISel/GISelChangeObserver.cpp --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file constains common code to combine machine functions at generic +// level. +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +void GISelChangeObserver::changingAllUsesOfReg( + const MachineRegisterInfo &MRI, unsigned Reg) { + for (auto &ChangingMI : MRI.use_instructions(Reg)) { + changingInstr(ChangingMI); + ChangingAllUsesOfReg.insert(&ChangingMI); + } +} + +void GISelChangeObserver::finishedChangingAllUsesOfReg() { + for (auto *ChangedMI : ChangingAllUsesOfReg) + changedInstr(*ChangedMI); +} + diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index ef090777726c52908e1047cdb9e31d307a18b508..33313756a3899d6f3fbcef46a57b9b325a484437 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -215,7 +215,7 @@ ArrayRef IRTranslator::getOrCreateVRegs(const Value &Val) { unsigned Idx = 0; while (auto Elt = C.getAggregateElement(Idx++)) { auto EltRegs = getOrCreateVRegs(*Elt); - std::copy(EltRegs.begin(), EltRegs.end(), std::back_inserter(*VRegs)); + llvm::copy(EltRegs, std::back_inserter(*VRegs)); } } else { assert(SplitTys.size() == 1 && "unexpectedly split LLT"); @@ -330,6 +330,13 @@ bool IRTranslator::translateFSub(const User &U, MachineIRBuilder &MIRBuilder) { return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder); } +bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) { + MIRBuilder.buildInstr(TargetOpcode::G_FNEG) + .addDef(getOrCreateVReg(U)) + .addUse(getOrCreateVReg(*U.getOperand(1))); + return true; +} + bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { const CmpInst *CI = dyn_cast(&U); @@ -347,8 +354,10 @@ bool IRTranslator::translateCompare(const User &U, else if (Pred == CmpInst::FCMP_TRUE) MIRBuilder.buildCopy( Res, getOrCreateVReg(*Constant::getAllOnesValue(CI->getType()))); - else - MIRBuilder.buildFCmp(Pred, Res, Op0, Op1); + else { + auto FCmp = MIRBuilder.buildFCmp(Pred, Res, Op0, Op1); + FCmp->copyIRFlags(*CI); + } return true; } @@ -581,8 +590,15 @@ bool IRTranslator::translateSelect(const User &U, ArrayRef Op0Regs = getOrCreateVRegs(*U.getOperand(1)); ArrayRef Op1Regs = getOrCreateVRegs(*U.getOperand(2)); - for (unsigned i = 0; i < ResRegs.size(); ++i) - MIRBuilder.buildSelect(ResRegs[i], Tst, Op0Regs[i], Op1Regs[i]); + const SelectInst &SI = cast(U); + const CmpInst *Cmp = dyn_cast(SI.getCondition()); + for (unsigned i = 0; i < ResRegs.size(); ++i) { + auto Select = + MIRBuilder.buildSelect(ResRegs[i], Tst, Op0Regs[i], Op1Regs[i]); + if (Cmp && isa(Cmp)) { + Select->copyIRFlags(*Cmp); + } + } return true; } @@ -862,37 +878,56 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder); case Intrinsic::smul_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder); - case Intrinsic::pow: - MIRBuilder.buildInstr(TargetOpcode::G_FPOW) + case Intrinsic::pow: { + auto Pow = MIRBuilder.buildInstr(TargetOpcode::G_FPOW) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))) .addUse(getOrCreateVReg(*CI.getArgOperand(1))); + Pow->copyIRFlags(CI); + return true; + } + case Intrinsic::exp: { + auto Exp = MIRBuilder.buildInstr(TargetOpcode::G_FEXP) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + Exp->copyIRFlags(CI); return true; - case Intrinsic::exp: - MIRBuilder.buildInstr(TargetOpcode::G_FEXP) + } + case Intrinsic::exp2: { + auto Exp2 = MIRBuilder.buildInstr(TargetOpcode::G_FEXP2) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + Exp2->copyIRFlags(CI); return true; - case Intrinsic::exp2: - MIRBuilder.buildInstr(TargetOpcode::G_FEXP2) + } + case Intrinsic::log: { + auto Log = MIRBuilder.buildInstr(TargetOpcode::G_FLOG) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + Log->copyIRFlags(CI); return true; - case Intrinsic::log: - MIRBuilder.buildInstr(TargetOpcode::G_FLOG) + } + case Intrinsic::log2: { + auto Log2 = MIRBuilder.buildInstr(TargetOpcode::G_FLOG2) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + Log2->copyIRFlags(CI); return true; - case Intrinsic::log2: - MIRBuilder.buildInstr(TargetOpcode::G_FLOG2) + } + case Intrinsic::log10: { + auto Log10 = MIRBuilder.buildInstr(TargetOpcode::G_FLOG10) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + Log10->copyIRFlags(CI); return true; - case Intrinsic::fabs: - MIRBuilder.buildInstr(TargetOpcode::G_FABS) + } + case Intrinsic::fabs: { + auto Fabs = MIRBuilder.buildInstr(TargetOpcode::G_FABS) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + Fabs->copyIRFlags(CI); return true; + } case Intrinsic::trunc: MIRBuilder.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC) .addDef(getOrCreateVReg(CI)) @@ -903,13 +938,15 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))); return true; - case Intrinsic::fma: - MIRBuilder.buildInstr(TargetOpcode::G_FMA) + case Intrinsic::fma: { + auto FMA = MIRBuilder.buildInstr(TargetOpcode::G_FMA) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))) .addUse(getOrCreateVReg(*CI.getArgOperand(1))) .addUse(getOrCreateVReg(*CI.getArgOperand(2))); + FMA->copyIRFlags(CI); return true; + } case Intrinsic::fmuladd: { const TargetMachine &TM = MF->getTarget(); const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); @@ -921,11 +958,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, TLI.isFMAFasterThanFMulAndFAdd(TLI.getValueType(*DL, CI.getType()))) { // TODO: Revisit this to see if we should move this part of the // lowering to the combiner. - MIRBuilder.buildInstr(TargetOpcode::G_FMA, Dst, Op0, Op1, Op2); + auto FMA = MIRBuilder.buildInstr(TargetOpcode::G_FMA, {Dst}, {Op0, Op1, Op2}); + FMA->copyIRFlags(CI); } else { LLT Ty = getLLTForType(*CI.getType(), *DL); - auto FMul = MIRBuilder.buildInstr(TargetOpcode::G_FMUL, Ty, Op0, Op1); - MIRBuilder.buildInstr(TargetOpcode::G_FADD, Dst, FMul, Op2); + auto FMul = MIRBuilder.buildInstr(TargetOpcode::G_FMUL, {Ty}, {Op0, Op1}); + FMul->copyIRFlags(CI); + auto FAdd = MIRBuilder.buildInstr(TargetOpcode::G_FADD, {Dst}, {FMul, Op2}); + FAdd->copyIRFlags(CI); } return true; } @@ -961,13 +1001,15 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, getStackGuard(GuardVal, MIRBuilder); AllocaInst *Slot = cast(CI.getArgOperand(1)); + int FI = getOrCreateFrameIndex(*Slot); + MF->getFrameInfo().setStackProtectorIndex(FI); + MIRBuilder.buildStore( GuardVal, getOrCreateVReg(*Slot), - *MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(*MF, - getOrCreateFrameIndex(*Slot)), - MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, - PtrTy.getSizeInBits() / 8, 8)); + *MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), + MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile, + PtrTy.getSizeInBits() / 8, 8)); return true; } case Intrinsic::cttz: @@ -1400,7 +1442,7 @@ bool IRTranslator::translatePHI(const User &U, MachineIRBuilder &MIRBuilder) { SmallVector Insts; for (auto Reg : getOrCreateVRegs(PI)) { - auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_PHI, Reg); + auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_PHI, {Reg}, {}); Insts.push_back(MIB.getInstr()); } @@ -1584,22 +1626,22 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) { // Return the scalar if it is a <1 x Ty> vector. if (CAZ->getNumElements() == 1) return translate(*CAZ->getElementValue(0u), Reg); - std::vector Ops; + SmallVector Ops; for (unsigned i = 0; i < CAZ->getNumElements(); ++i) { Constant &Elt = *CAZ->getElementValue(i); Ops.push_back(getOrCreateVReg(Elt)); } - EntryBuilder.buildMerge(Reg, Ops); + EntryBuilder.buildBuildVector(Reg, Ops); } else if (auto CV = dyn_cast(&C)) { // Return the scalar if it is a <1 x Ty> vector. if (CV->getNumElements() == 1) return translate(*CV->getElementAsConstant(0), Reg); - std::vector Ops; + SmallVector Ops; for (unsigned i = 0; i < CV->getNumElements(); ++i) { Constant &Elt = *CV->getElementAsConstant(i); Ops.push_back(getOrCreateVReg(Elt)); } - EntryBuilder.buildMerge(Reg, Ops); + EntryBuilder.buildBuildVector(Reg, Ops); } else if (auto CE = dyn_cast(&C)) { switch(CE->getOpcode()) { #define HANDLE_INST(NUM, OPCODE, CLASS) \ @@ -1615,7 +1657,7 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) { for (unsigned i = 0; i < CV->getNumOperands(); ++i) { Ops.push_back(getOrCreateVReg(*CV->getOperand(i))); } - EntryBuilder.buildMerge(Reg, Ops); + EntryBuilder.buildBuildVector(Reg, Ops); } else if (auto *BA = dyn_cast(&C)) { EntryBuilder.buildBlockAddress(Reg, BA); } else diff --git a/lib/CodeGen/GlobalISel/Legalizer.cpp b/lib/CodeGen/GlobalISel/Legalizer.cpp index 9a2aac998a84b42cde5e75df115d25241b5b94dd..df2dcace4212c9033519f385420085780f49f3a0 100644 --- a/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -16,6 +16,7 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/GISelWorkList.h" #include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" @@ -64,9 +65,52 @@ static bool isArtifact(const MachineInstr &MI) { case TargetOpcode::G_SEXT: case TargetOpcode::G_MERGE_VALUES: case TargetOpcode::G_UNMERGE_VALUES: + case TargetOpcode::G_CONCAT_VECTORS: + case TargetOpcode::G_BUILD_VECTOR: return true; } } +using InstListTy = GISelWorkList<256>; +using ArtifactListTy = GISelWorkList<128>; + +class LegalizerWorkListManager : public GISelChangeObserver { + InstListTy &InstList; + ArtifactListTy &ArtifactList; + +public: + LegalizerWorkListManager(InstListTy &Insts, ArtifactListTy &Arts) + : InstList(Insts), ArtifactList(Arts) {} + + void createdInstr(const MachineInstr &MI) override { + // Only legalize pre-isel generic instructions. + // Legalization process could generate Target specific pseudo + // instructions with generic types. Don't record them + if (isPreISelGenericOpcode(MI.getOpcode())) { + if (isArtifact(MI)) + ArtifactList.insert(&MI); + else + InstList.insert(&MI); + } + LLVM_DEBUG(dbgs() << ".. .. New MI: " << MI); + } + + void erasingInstr(const MachineInstr &MI) override { + LLVM_DEBUG(dbgs() << ".. .. Erasing: " << MI); + InstList.remove(&MI); + ArtifactList.remove(&MI); + } + + void changingInstr(const MachineInstr &MI) override { + LLVM_DEBUG(dbgs() << ".. .. Changing MI: " << MI); + } + + void changedInstr(const MachineInstr &MI) override { + // When insts change, we want to revisit them to legalize them again. + // We'll consider them the same as created. + LLVM_DEBUG(dbgs() << ".. .. Changed MI: " << MI); + createdInstr(MI); + } +}; bool Legalizer::runOnMachineFunction(MachineFunction &MF) { // If the ISel pipeline failed, do not bother running that pass. @@ -77,14 +121,13 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { init(MF); const TargetPassConfig &TPC = getAnalysis(); MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr); - LegalizerHelper Helper(MF); const size_t NumBlocks = MF.size(); MachineRegisterInfo &MRI = MF.getRegInfo(); // Populate Insts - GISelWorkList<256> InstList; - GISelWorkList<128> ArtifactList; + InstListTy InstList(&MF); + ArtifactListTy ArtifactList(&MF); ReversePostOrderTraversal RPOT(&MF); // Perform legalization bottom up so we can DCE as we legalize. // Traverse BB in RPOT and within each basic block, add insts top down, @@ -103,24 +146,12 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { InstList.insert(&MI); } } - Helper.MIRBuilder.recordInsertions([&](MachineInstr *MI) { - // Only legalize pre-isel generic instructions. - // Legalization process could generate Target specific pseudo - // instructions with generic types. Don't record them - if (isPreISelGenericOpcode(MI->getOpcode())) { - if (isArtifact(*MI)) - ArtifactList.insert(MI); - else - InstList.insert(MI); - } - LLVM_DEBUG(dbgs() << ".. .. New MI: " << *MI;); - }); + LegalizerWorkListManager WorkListObserver(InstList, ArtifactList); + LegalizerHelper Helper(MF, WorkListObserver); const LegalizerInfo &LInfo(Helper.getLegalizerInfo()); LegalizationArtifactCombiner ArtCombiner(Helper.MIRBuilder, MF.getRegInfo(), LInfo); - auto RemoveDeadInstFromLists = [&InstList, - &ArtifactList](MachineInstr *DeadMI) { - InstList.remove(DeadMI); - ArtifactList.remove(DeadMI); + auto RemoveDeadInstFromLists = [&WorkListObserver](MachineInstr *DeadMI) { + WorkListObserver.erasingInstr(*DeadMI); }; bool Changed = false; do { @@ -138,7 +169,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { // Error out if we couldn't legalize this instruction. We may want to // fall back to DAG ISel instead in the future. if (Res == LegalizerHelper::UnableToLegalize) { - Helper.MIRBuilder.stopRecordingInsertions(); + Helper.MIRBuilder.stopObservingChanges(); reportGISelFailure(MF, TPC, MORE, "gisel-legalize", "unable to legalize instruction", MI); return false; @@ -149,7 +180,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *ArtifactList.pop_back_val(); assert(isPreISelGenericOpcode(MI.getOpcode()) && "Expecting generic opcode"); if (isTriviallyDead(MI, MRI)) { - LLVM_DEBUG(dbgs() << MI << "Is dead; erasing.\n"); + LLVM_DEBUG(dbgs() << MI << "Is dead\n"); RemoveDeadInstFromLists(&MI); MI.eraseFromParentAndMarkDBGValuesForRemoval(); continue; @@ -157,7 +188,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { SmallVector DeadInstructions; if (ArtCombiner.tryCombineInstruction(MI, DeadInstructions)) { for (auto *DeadMI : DeadInstructions) { - LLVM_DEBUG(dbgs() << ".. Erasing Dead Instruction " << *DeadMI); + LLVM_DEBUG(dbgs() << *DeadMI << "Is dead\n"); RemoveDeadInstFromLists(DeadMI); DeadMI->eraseFromParentAndMarkDBGValuesForRemoval(); } diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 516f5ce4343e7c34efb2279e7fdef00491cd0417..266409dd1fabceb3c2d329bc1337e1a4619af098 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -29,14 +30,19 @@ using namespace llvm; using namespace LegalizeActions; -LegalizerHelper::LegalizerHelper(MachineFunction &MF) - : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()) { +LegalizerHelper::LegalizerHelper(MachineFunction &MF, + GISelChangeObserver &Observer) + : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()), + Observer(Observer) { MIRBuilder.setMF(MF); + MIRBuilder.setChangeObserver(Observer); } -LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI) - : MRI(MF.getRegInfo()), LI(LI) { +LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI, + GISelChangeObserver &Observer) + : MRI(MF.getRegInfo()), LI(LI), Observer(Observer) { MIRBuilder.setMF(MF); + MIRBuilder.setChangeObserver(Observer); } LegalizerHelper::LegalizeResult LegalizerHelper::legalizeInstrStep(MachineInstr &MI) { @@ -64,8 +70,8 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI) { return fewerElementsVector(MI, Step.TypeIdx, Step.NewType); case Custom: LLVM_DEBUG(dbgs() << ".. Custom legalization\n"); - return LI.legalizeCustom(MI, MRI, MIRBuilder) ? Legalized - : UnableToLegalize; + return LI.legalizeCustom(MI, MRI, MIRBuilder, Observer) ? Legalized + : UnableToLegalize; default: LLVM_DEBUG(dbgs() << ".. Unable to legalize\n"); return UnableToLegalize; @@ -82,17 +88,20 @@ void LegalizerHelper::extractParts(unsigned Reg, LLT Ty, int NumParts, static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { switch (Opcode) { case TargetOpcode::G_SDIV: - assert(Size == 32 && "Unsupported size"); - return RTLIB::SDIV_I32; + assert((Size == 32 || Size == 64) && "Unsupported size"); + return Size == 64 ? RTLIB::SDIV_I64 : RTLIB::SDIV_I32; case TargetOpcode::G_UDIV: - assert(Size == 32 && "Unsupported size"); - return RTLIB::UDIV_I32; + assert((Size == 32 || Size == 64) && "Unsupported size"); + return Size == 64 ? RTLIB::UDIV_I64 : RTLIB::UDIV_I32; case TargetOpcode::G_SREM: - assert(Size == 32 && "Unsupported size"); - return RTLIB::SREM_I32; + assert((Size == 32 || Size == 64) && "Unsupported size"); + return Size == 64 ? RTLIB::SREM_I64 : RTLIB::SREM_I32; case TargetOpcode::G_UREM: + assert((Size == 32 || Size == 64) && "Unsupported size"); + return Size == 64 ? RTLIB::UREM_I64 : RTLIB::UREM_I32; + case TargetOpcode::G_CTLZ_ZERO_UNDEF: assert(Size == 32 && "Unsupported size"); - return RTLIB::UREM_I32; + return RTLIB::CTLZ_I32; case TargetOpcode::G_FADD: assert((Size == 32 || Size == 64) && "Unsupported size"); return Size == 64 ? RTLIB::ADD_F64 : RTLIB::ADD_F32; @@ -189,8 +198,9 @@ LegalizerHelper::libcall(MachineInstr &MI) { case TargetOpcode::G_SDIV: case TargetOpcode::G_UDIV: case TargetOpcode::G_SREM: - case TargetOpcode::G_UREM: { - Type *HLTy = Type::getInt32Ty(Ctx); + case TargetOpcode::G_UREM: + case TargetOpcode::G_CTLZ_ZERO_UNDEF: { + Type *HLTy = IntegerType::get(Ctx, Size); auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy); if (Status != Legalized) return Status; @@ -294,7 +304,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, for (int i = 0; i < NumParts; ++i) DstRegs.push_back( MIRBuilder.buildUndef(NarrowTy)->getOperand(0).getReg()); - MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + + unsigned DstReg = MI.getOperand(0).getReg(); + if(MRI.getType(DstReg).isVector()) + MIRBuilder.buildBuildVector(DstReg, DstRegs); + else + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } @@ -324,7 +339,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, CarryIn = CarryOut; } unsigned DstReg = MI.getOperand(0).getReg(); - MIRBuilder.buildMerge(DstReg, DstRegs); + if(MRI.getType(DstReg).isVector()) + MIRBuilder.buildBuildVector(DstReg, DstRegs); + else + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } @@ -380,7 +398,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, DstRegs.push_back(SegReg); } - MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + unsigned DstReg = MI.getOperand(0).getReg(); + if(MRI.getType(DstReg).isVector()) + MIRBuilder.buildBuildVector(DstReg, DstRegs); + else + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } @@ -441,7 +463,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, } assert(DstRegs.size() == (unsigned)NumParts && "not all parts covered"); - MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + unsigned DstReg = MI.getOperand(0).getReg(); + if(MRI.getType(DstReg).isVector()) + MIRBuilder.buildBuildVector(DstReg, DstRegs); + else + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } @@ -482,7 +508,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, DstRegs.push_back(DstReg); } unsigned DstReg = MI.getOperand(0).getReg(); - MIRBuilder.buildMerge(DstReg, DstRegs); + if(MRI.getType(DstReg).isVector()) + MIRBuilder.buildBuildVector(DstReg, DstRegs); + else + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } @@ -542,11 +571,16 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, DstRegs.push_back(DstReg); } unsigned DstReg = MI.getOperand(0).getReg(); - MIRBuilder.buildMerge(DstReg, DstRegs); + if(MRI.getType(DstReg).isVector()) + MIRBuilder.buildBuildVector(DstReg, DstRegs); + else + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } - case TargetOpcode::G_OR: { + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: { // Legalize bitwise operation: // A = BinOp B, C // into: @@ -585,11 +619,15 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, // Do the operation on each small part. for (int i = 0; i < NumParts; ++i) - MIRBuilder.buildOr(DstRegs[i], SrcsReg1[i], SrcsReg2[i]); + MIRBuilder.buildInstr(MI.getOpcode(), {DstRegs[i]}, + {SrcsReg1[i], SrcsReg2[i]}); // Gather the destination registers into the final destination. unsigned DstReg = MI.getOperand(0).getReg(); - MIRBuilder.buildMerge(DstReg, DstRegs); + if(MRI.getType(DstReg).isVector()) + MIRBuilder.buildBuildVector(DstReg, DstRegs); + else + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } @@ -599,7 +637,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy, unsigned OpIdx, unsigned ExtOpcode) { MachineOperand &MO = MI.getOperand(OpIdx); - auto ExtB = MIRBuilder.buildInstr(ExtOpcode, WideTy, MO.getReg()); + auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO.getReg()}); MO.setReg(ExtB->getOperand(0).getReg()); } @@ -608,7 +646,7 @@ void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy, MachineOperand &MO = MI.getOperand(OpIdx); unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); - MIRBuilder.buildInstr(TruncOpcode, MO.getReg(), DstExt); + MIRBuilder.buildInstr(TruncOpcode, {MO.getReg()}, {DstExt}); MO.setReg(DstExt); } @@ -623,20 +661,20 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_USUBO: { if (TypeIdx == 1) return UnableToLegalize; // TODO - auto LHSZext = MIRBuilder.buildInstr(TargetOpcode::G_ZEXT, WideTy, - MI.getOperand(2).getReg()); - auto RHSZext = MIRBuilder.buildInstr(TargetOpcode::G_ZEXT, WideTy, - MI.getOperand(3).getReg()); + auto LHSZext = MIRBuilder.buildInstr(TargetOpcode::G_ZEXT, {WideTy}, + {MI.getOperand(2).getReg()}); + auto RHSZext = MIRBuilder.buildInstr(TargetOpcode::G_ZEXT, {WideTy}, + {MI.getOperand(3).getReg()}); unsigned Opcode = MI.getOpcode() == TargetOpcode::G_UADDO ? TargetOpcode::G_ADD : TargetOpcode::G_SUB; // Do the arithmetic in the larger type. - auto NewOp = MIRBuilder.buildInstr(Opcode, WideTy, LHSZext, RHSZext); + auto NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSZext, RHSZext}); LLT OrigTy = MRI.getType(MI.getOperand(0).getReg()); APInt Mask = APInt::getAllOnesValue(OrigTy.getSizeInBits()); auto AndOp = MIRBuilder.buildInstr( - TargetOpcode::G_AND, WideTy, NewOp, - MIRBuilder.buildConstant(WideTy, Mask.getZExtValue())); + TargetOpcode::G_AND, {WideTy}, + {NewOp, MIRBuilder.buildConstant(WideTy, Mask.getZExtValue())}); // There is no overflow if the AndOp is the same as NewOp. MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1).getReg(), NewOp, AndOp); @@ -660,25 +698,26 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { auto TopBit = APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits()); MIBSrc = MIRBuilder.buildInstr( - TargetOpcode::G_OR, WideTy, MIBSrc, - MIRBuilder.buildConstant(WideTy, TopBit.getSExtValue())); + TargetOpcode::G_OR, {WideTy}, + {MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit.getSExtValue())}); } // Perform the operation at the larger size. - auto MIBNewOp = MIRBuilder.buildInstr(MI.getOpcode(), WideTy, MIBSrc); + auto MIBNewOp = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy}, {MIBSrc}); // This is already the correct result for CTPOP and CTTZs if (MI.getOpcode() == TargetOpcode::G_CTLZ || MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) { // The correct result is NewOp - (Difference in widety and current ty). unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits(); - MIBNewOp = - MIRBuilder.buildInstr(TargetOpcode::G_SUB, WideTy, MIBNewOp, - MIRBuilder.buildConstant(WideTy, SizeDiff)); + MIBNewOp = MIRBuilder.buildInstr( + TargetOpcode::G_SUB, {WideTy}, + {MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff)}); } auto &TII = *MI.getMF()->getSubtarget().getInstrInfo(); - // Make the original instruction a trunc now, and update it's source. + // Make the original instruction a trunc now, and update its source. + Observer.changingInstr(MI); MI.setDesc(TII.get(TargetOpcode::G_TRUNC)); MI.getOperand(1).setReg(MIBNewOp->getOperand(0).getReg()); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; } @@ -691,45 +730,50 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { // Perform operation at larger width (any extension is fine here, high bits // don't affect the result) and then truncate the result back to the // original type. + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT); widenScalarDst(MI, WideTy); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_SHL: + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); // The "number of bits to shift" operand must preserve its value as an // unsigned integer: widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); widenScalarDst(MI, WideTy); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_SDIV: case TargetOpcode::G_SREM: + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT); widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); widenScalarDst(MI, WideTy); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_ASHR: + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT); // The "number of bits to shift" operand must preserve its value as an // unsigned integer: widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); widenScalarDst(MI, WideTy); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_UDIV: case TargetOpcode::G_UREM: case TargetOpcode::G_LSHR: + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); widenScalarDst(MI, WideTy); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_SELECT: @@ -738,40 +782,45 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { // Perform operation at larger width (any extension is fine here, high bits // don't affect the result) and then truncate the result back to the // original type. + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT); widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT); widenScalarDst(MI, WideTy); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: if (TypeIdx != 0) return UnableToLegalize; + Observer.changingInstr(MI); widenScalarDst(MI, WideTy); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_SITOFP: if (TypeIdx != 1) return UnableToLegalize; + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_UITOFP: if (TypeIdx != 1) return UnableToLegalize; + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_INSERT: if (TypeIdx != 0) return UnableToLegalize; + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); widenScalarDst(MI, WideTy); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_LOAD: @@ -784,8 +833,9 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { LLVM_FALLTHROUGH; case TargetOpcode::G_SEXTLOAD: case TargetOpcode::G_ZEXTLOAD: + Observer.changingInstr(MI); widenScalarDst(MI, WideTy); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_STORE: { @@ -793,18 +843,20 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { WideTy != LLT::scalar(8)) return UnableToLegalize; + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 0, TargetOpcode::G_ZEXT); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; } case TargetOpcode::G_CONSTANT: { MachineOperand &SrcMO = MI.getOperand(1); LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext(); const APInt &Val = SrcMO.getCImm()->getValue().sext(WideTy.getSizeInBits()); + Observer.changingInstr(MI); SrcMO.setCImm(ConstantInt::get(Ctx, Val)); widenScalarDst(MI, WideTy); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; } case TargetOpcode::G_FCONSTANT: { @@ -822,28 +874,32 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { default: llvm_unreachable("Unhandled fp widen type"); } + Observer.changingInstr(MI); SrcMO.setFPImm(ConstantFP::get(Ctx, Val)); widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; } case TargetOpcode::G_BRCOND: + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 0, TargetOpcode::G_ANYEXT); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_FCMP: + Observer.changingInstr(MI); if (TypeIdx == 0) widenScalarDst(MI, WideTy); else { widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT); widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT); } - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_ICMP: + Observer.changingInstr(MI); if (TypeIdx == 0) widenScalarDst(MI, WideTy); else { @@ -854,18 +910,20 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { widenScalarSrc(MI, WideTy, 2, ExtOpcode); widenScalarSrc(MI, WideTy, 3, ExtOpcode); } - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_GEP: assert(TypeIdx == 1 && "unable to legalize pointer of GEP"); + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_PHI: { assert(TypeIdx == 0 && "Expecting only Idx 0"); + Observer.changingInstr(MI); for (unsigned I = 1; I < MI.getNumOperands(); I += 2) { MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB(); MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator()); @@ -875,14 +933,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { MachineBasicBlock &MBB = *MI.getParent(); MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI()); widenScalarDst(MI, WideTy); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; } case TargetOpcode::G_EXTRACT_VECTOR_ELT: if (TypeIdx != 2) return UnableToLegalize; + Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; } } @@ -1063,6 +1122,24 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { case TargetOpcode::G_CTTZ: case TargetOpcode::G_CTPOP: return lowerBitCount(MI, TypeIdx, Ty); + case G_UADDE: { + unsigned Res = MI.getOperand(0).getReg(); + unsigned CarryOut = MI.getOperand(1).getReg(); + unsigned LHS = MI.getOperand(2).getReg(); + unsigned RHS = MI.getOperand(3).getReg(); + unsigned CarryIn = MI.getOperand(4).getReg(); + + unsigned TmpRes = MRI.createGenericVirtualRegister(Ty); + unsigned ZExtCarryIn = MRI.createGenericVirtualRegister(Ty); + + MIRBuilder.buildAdd(TmpRes, LHS, RHS); + MIRBuilder.buildZExt(ZExtCarryIn, CarryIn); + MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn); + MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS); + + MI.eraseFromParent(); + return Legalized; + } } } @@ -1072,6 +1149,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, // FIXME: Don't know how to handle secondary types yet. if (TypeIdx != 0) return UnableToLegalize; + + MIRBuilder.setInstr(MI); switch (MI.getOpcode()) { default: return UnableToLegalize; @@ -1085,8 +1164,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, if (Size % NarrowSize != 0) return UnableToLegalize; - MIRBuilder.setInstr(MI); - SmallVector Src1Regs, Src2Regs, DstRegs; extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs); extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); @@ -1097,7 +1174,49 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, DstRegs.push_back(DstReg); } - MIRBuilder.buildMerge(DstReg, DstRegs); + MIRBuilder.buildConcatVectors(DstReg, DstRegs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: { + bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD; + unsigned ValReg = MI.getOperand(0).getReg(); + unsigned AddrReg = MI.getOperand(1).getReg(); + unsigned NarrowSize = NarrowTy.getSizeInBits(); + unsigned Size = MRI.getType(ValReg).getSizeInBits(); + unsigned NumParts = Size / NarrowSize; + + SmallVector NarrowRegs; + if (!IsLoad) + extractParts(ValReg, NarrowTy, NumParts, NarrowRegs); + + const LLT OffsetTy = + LLT::scalar(MRI.getType(AddrReg).getScalarSizeInBits()); + MachineFunction &MF = *MI.getMF(); + MachineMemOperand *MMO = *MI.memoperands_begin(); + for (unsigned Idx = 0; Idx < NumParts; ++Idx) { + unsigned Adjustment = Idx * NarrowTy.getSizeInBits() / 8; + unsigned Alignment = MinAlign(MMO->getAlignment(), Adjustment); + unsigned NewAddrReg = 0; + MIRBuilder.materializeGEP(NewAddrReg, AddrReg, OffsetTy, Adjustment); + MachineMemOperand &NewMMO = *MF.getMachineMemOperand( + MMO->getPointerInfo().getWithOffset(Adjustment), MMO->getFlags(), + NarrowTy.getSizeInBits() / 8, Alignment); + if (IsLoad) { + unsigned Dst = MRI.createGenericVirtualRegister(NarrowTy); + NarrowRegs.push_back(Dst); + MIRBuilder.buildLoad(Dst, NewAddrReg, NewMMO); + } else { + MIRBuilder.buildStore(NarrowRegs[Idx], NewAddrReg, NewMMO); + } + } + if (IsLoad) { + if (NarrowTy.isVector()) + MIRBuilder.buildConcatVectors(ValReg, NarrowRegs); + else + MIRBuilder.buildBuildVector(ValReg, NarrowRegs); + } MI.eraseFromParent(); return Legalized; } @@ -1108,27 +1227,27 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { unsigned Opc = MI.getOpcode(); auto &TII = *MI.getMF()->getSubtarget().getInstrInfo(); - auto isLegalOrCustom = [this](const LegalityQuery &Q) { + auto isSupported = [this](const LegalityQuery &Q) { auto QAction = LI.getAction(Q).Action; - return QAction == Legal || QAction == Custom; + return QAction == Legal || QAction == Libcall || QAction == Custom; }; switch (Opc) { default: return UnableToLegalize; case TargetOpcode::G_CTLZ_ZERO_UNDEF: { // This trivially expands to CTLZ. + Observer.changingInstr(MI); MI.setDesc(TII.get(TargetOpcode::G_CTLZ)); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; } case TargetOpcode::G_CTLZ: { unsigned SrcReg = MI.getOperand(1).getReg(); unsigned Len = Ty.getSizeInBits(); - if (isLegalOrCustom({TargetOpcode::G_CTLZ_ZERO_UNDEF, {Ty}})) { - // If CTLZ_ZERO_UNDEF is legal or custom, emit that and a select with - // zero. - auto MIBCtlzZU = - MIRBuilder.buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF, Ty, SrcReg); + if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {Ty}})) { + // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero. + auto MIBCtlzZU = MIRBuilder.buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF, + {Ty}, {SrcReg}); auto MIBZero = MIRBuilder.buildConstant(Ty, 0); auto MIBLen = MIRBuilder.buildConstant(Ty, Len); auto MIBICmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), @@ -1154,30 +1273,32 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) { auto MIBShiftAmt = MIRBuilder.buildConstant(Ty, 1ULL << i); auto MIBOp = MIRBuilder.buildInstr( - TargetOpcode::G_OR, Ty, Op, - MIRBuilder.buildInstr(TargetOpcode::G_LSHR, Ty, Op, MIBShiftAmt)); + TargetOpcode::G_OR, {Ty}, + {Op, MIRBuilder.buildInstr(TargetOpcode::G_LSHR, {Ty}, + {Op, MIBShiftAmt})}); Op = MIBOp->getOperand(0).getReg(); } - auto MIBPop = MIRBuilder.buildInstr(TargetOpcode::G_CTPOP, Ty, Op); - MIRBuilder.buildInstr(TargetOpcode::G_SUB, MI.getOperand(0).getReg(), - MIRBuilder.buildConstant(Ty, Len), MIBPop); + auto MIBPop = MIRBuilder.buildInstr(TargetOpcode::G_CTPOP, {Ty}, {Op}); + MIRBuilder.buildInstr(TargetOpcode::G_SUB, {MI.getOperand(0).getReg()}, + {MIRBuilder.buildConstant(Ty, Len), MIBPop}); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_CTTZ_ZERO_UNDEF: { // This trivially expands to CTTZ. + Observer.changingInstr(MI); MI.setDesc(TII.get(TargetOpcode::G_CTTZ)); - MIRBuilder.recordInsertion(&MI); + Observer.changedInstr(MI); return Legalized; } case TargetOpcode::G_CTTZ: { unsigned SrcReg = MI.getOperand(1).getReg(); unsigned Len = Ty.getSizeInBits(); - if (isLegalOrCustom({TargetOpcode::G_CTTZ_ZERO_UNDEF, {Ty}})) { + if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {Ty}})) { // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with // zero. - auto MIBCttzZU = - MIRBuilder.buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF, Ty, SrcReg); + auto MIBCttzZU = MIRBuilder.buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF, + {Ty}, {SrcReg}); auto MIBZero = MIRBuilder.buildConstant(Ty, 0); auto MIBLen = MIRBuilder.buildConstant(Ty, Len); auto MIBICmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), @@ -1193,17 +1314,18 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { // Ref: "Hacker's Delight" by Henry Warren auto MIBCstNeg1 = MIRBuilder.buildConstant(Ty, -1); auto MIBNot = - MIRBuilder.buildInstr(TargetOpcode::G_XOR, Ty, SrcReg, MIBCstNeg1); + MIRBuilder.buildInstr(TargetOpcode::G_XOR, {Ty}, {SrcReg, MIBCstNeg1}); auto MIBTmp = MIRBuilder.buildInstr( - TargetOpcode::G_AND, Ty, MIBNot, - MIRBuilder.buildInstr(TargetOpcode::G_ADD, Ty, SrcReg, MIBCstNeg1)); - if (!isLegalOrCustom({TargetOpcode::G_CTPOP, {Ty}}) && - isLegalOrCustom({TargetOpcode::G_CTLZ, {Ty}})) { + TargetOpcode::G_AND, {Ty}, + {MIBNot, MIRBuilder.buildInstr(TargetOpcode::G_ADD, {Ty}, + {SrcReg, MIBCstNeg1})}); + if (!isSupported({TargetOpcode::G_CTPOP, {Ty}}) && + isSupported({TargetOpcode::G_CTLZ, {Ty}})) { auto MIBCstLen = MIRBuilder.buildConstant(Ty, Len); MIRBuilder.buildInstr( - TargetOpcode::G_SUB, MI.getOperand(0).getReg(), - MIBCstLen, - MIRBuilder.buildInstr(TargetOpcode::G_CTLZ, Ty, MIBTmp)); + TargetOpcode::G_SUB, {MI.getOperand(0).getReg()}, + {MIBCstLen, + MIRBuilder.buildInstr(TargetOpcode::G_CTLZ, {Ty}, {MIBTmp})}); MI.eraseFromParent(); return Legalized; } diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index ca776de0a0fe0d9e69ec90307fb580cfe60465e7..926af3fd76c6760c13f3483df2577a77c44456ed 100644 --- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/ADT/SmallBitVector.h" +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -375,7 +376,8 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI, } bool LegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { return false; } diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index c7b98c6b85961c85cf7883d45385c2be658bf859..c1109e611777dfe7abd9f261406cce476052f6c7 100644 --- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -10,6 +10,7 @@ /// This file implements the MachineIRBuidler class. //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" @@ -22,73 +23,70 @@ using namespace llvm; -void MachineIRBuilderBase::setMF(MachineFunction &MF) { +void MachineIRBuilder::setMF(MachineFunction &MF) { State.MF = &MF; State.MBB = nullptr; State.MRI = &MF.getRegInfo(); State.TII = MF.getSubtarget().getInstrInfo(); State.DL = DebugLoc(); State.II = MachineBasicBlock::iterator(); - State.InsertedInstr = nullptr; + State.Observer = nullptr; } -void MachineIRBuilderBase::setMBB(MachineBasicBlock &MBB) { +void MachineIRBuilder::setMBB(MachineBasicBlock &MBB) { State.MBB = &MBB; State.II = MBB.end(); assert(&getMF() == MBB.getParent() && "Basic block is in a different function"); } -void MachineIRBuilderBase::setInstr(MachineInstr &MI) { +void MachineIRBuilder::setInstr(MachineInstr &MI) { assert(MI.getParent() && "Instruction is not part of a basic block"); setMBB(*MI.getParent()); State.II = MI.getIterator(); } -void MachineIRBuilderBase::setInsertPt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator II) { +void MachineIRBuilder::setInsertPt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator II) { assert(MBB.getParent() == &getMF() && "Basic block is in a different function"); State.MBB = &MBB; State.II = II; } -void MachineIRBuilderBase::recordInsertion(MachineInstr *InsertedInstr) const { - if (State.InsertedInstr) - State.InsertedInstr(InsertedInstr); +void MachineIRBuilder::recordInsertion(MachineInstr *InsertedInstr) const { + if (State.Observer) + State.Observer->createdInstr(*InsertedInstr); } -void MachineIRBuilderBase::recordInsertions( - std::function Inserted) { - State.InsertedInstr = std::move(Inserted); +void MachineIRBuilder::setChangeObserver(GISelChangeObserver &Observer) { + State.Observer = &Observer; } -void MachineIRBuilderBase::stopRecordingInsertions() { - State.InsertedInstr = nullptr; -} +void MachineIRBuilder::stopObservingChanges() { State.Observer = nullptr; } //------------------------------------------------------------------------------ // Build instruction variants. //------------------------------------------------------------------------------ -MachineInstrBuilder MachineIRBuilderBase::buildInstr(unsigned Opcode) { +MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opcode) { return insertInstr(buildInstrNoInsert(Opcode)); } -MachineInstrBuilder MachineIRBuilderBase::buildInstrNoInsert(unsigned Opcode) { +MachineInstrBuilder MachineIRBuilder::buildInstrNoInsert(unsigned Opcode) { MachineInstrBuilder MIB = BuildMI(getMF(), getDL(), getTII().get(Opcode)); return MIB; } -MachineInstrBuilder MachineIRBuilderBase::insertInstr(MachineInstrBuilder MIB) { +MachineInstrBuilder MachineIRBuilder::insertInstr(MachineInstrBuilder MIB) { getMBB().insert(getInsertPt(), MIB); recordInsertion(MIB); return MIB; } MachineInstrBuilder -MachineIRBuilderBase::buildDirectDbgValue(unsigned Reg, const MDNode *Variable, - const MDNode *Expr) { +MachineIRBuilder::buildDirectDbgValue(unsigned Reg, const MDNode *Variable, + const MDNode *Expr) { assert(isa(Variable) && "not a variable"); assert(cast(Expr)->isValid() && "not an expression"); assert( @@ -99,8 +97,9 @@ MachineIRBuilderBase::buildDirectDbgValue(unsigned Reg, const MDNode *Variable, /*IsIndirect*/ false, Reg, Variable, Expr)); } -MachineInstrBuilder MachineIRBuilderBase::buildIndirectDbgValue( - unsigned Reg, const MDNode *Variable, const MDNode *Expr) { +MachineInstrBuilder +MachineIRBuilder::buildIndirectDbgValue(unsigned Reg, const MDNode *Variable, + const MDNode *Expr) { assert(isa(Variable) && "not a variable"); assert(cast(Expr)->isValid() && "not an expression"); assert( @@ -111,9 +110,9 @@ MachineInstrBuilder MachineIRBuilderBase::buildIndirectDbgValue( /*IsIndirect*/ true, Reg, Variable, Expr)); } -MachineInstrBuilder -MachineIRBuilderBase::buildFIDbgValue(int FI, const MDNode *Variable, - const MDNode *Expr) { +MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI, + const MDNode *Variable, + const MDNode *Expr) { assert(isa(Variable) && "not a variable"); assert(cast(Expr)->isValid() && "not an expression"); assert( @@ -126,8 +125,9 @@ MachineIRBuilderBase::buildFIDbgValue(int FI, const MDNode *Variable, .addMetadata(Expr); } -MachineInstrBuilder MachineIRBuilderBase::buildConstDbgValue( - const Constant &C, const MDNode *Variable, const MDNode *Expr) { +MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, + const MDNode *Variable, + const MDNode *Expr) { assert(isa(Variable) && "not a variable"); assert(cast(Expr)->isValid() && "not an expression"); assert( @@ -149,7 +149,7 @@ MachineInstrBuilder MachineIRBuilderBase::buildConstDbgValue( return MIB.addImm(0).addMetadata(Variable).addMetadata(Expr); } -MachineInstrBuilder MachineIRBuilderBase::buildDbgLabel(const MDNode *Label) { +MachineInstrBuilder MachineIRBuilder::buildDbgLabel(const MDNode *Label) { assert(isa(Label) && "not a label"); assert(cast(Label)->isValidLocationForIntrinsic(State.DL) && "Expected inlined-at fields to agree"); @@ -158,16 +158,15 @@ MachineInstrBuilder MachineIRBuilderBase::buildDbgLabel(const MDNode *Label) { return MIB.addMetadata(Label); } -MachineInstrBuilder MachineIRBuilderBase::buildFrameIndex(unsigned Res, - int Idx) { +MachineInstrBuilder MachineIRBuilder::buildFrameIndex(unsigned Res, int Idx) { assert(getMRI()->getType(Res).isPointer() && "invalid operand type"); return buildInstr(TargetOpcode::G_FRAME_INDEX) .addDef(Res) .addFrameIndex(Idx); } -MachineInstrBuilder -MachineIRBuilderBase::buildGlobalValue(unsigned Res, const GlobalValue *GV) { +MachineInstrBuilder MachineIRBuilder::buildGlobalValue(unsigned Res, + const GlobalValue *GV) { assert(getMRI()->getType(Res).isPointer() && "invalid operand type"); assert(getMRI()->getType(Res).getAddressSpace() == GV->getType()->getAddressSpace() && @@ -178,17 +177,14 @@ MachineIRBuilderBase::buildGlobalValue(unsigned Res, const GlobalValue *GV) { .addGlobalAddress(GV); } -void MachineIRBuilderBase::validateBinaryOp(unsigned Res, unsigned Op0, - unsigned Op1) { - assert((getMRI()->getType(Res).isScalar() || - getMRI()->getType(Res).isVector()) && - "invalid operand type"); - assert(getMRI()->getType(Res) == getMRI()->getType(Op0) && - getMRI()->getType(Res) == getMRI()->getType(Op1) && "type mismatch"); +void MachineIRBuilder::validateBinaryOp(const LLT &Res, const LLT &Op0, + const LLT &Op1) { + assert((Res.isScalar() || Res.isVector()) && "invalid operand type"); + assert((Res == Op0 && Res == Op1) && "type mismatch"); } -MachineInstrBuilder MachineIRBuilderBase::buildGEP(unsigned Res, unsigned Op0, - unsigned Op1) { +MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0, + unsigned Op1) { assert(getMRI()->getType(Res).isPointer() && getMRI()->getType(Res) == getMRI()->getType(Op0) && "type mismatch"); assert(getMRI()->getType(Op1).isScalar() && "invalid offset type"); @@ -200,8 +196,8 @@ MachineInstrBuilder MachineIRBuilderBase::buildGEP(unsigned Res, unsigned Op0, } Optional -MachineIRBuilderBase::materializeGEP(unsigned &Res, unsigned Op0, - const LLT &ValueTy, uint64_t Value) { +MachineIRBuilder::materializeGEP(unsigned &Res, unsigned Op0, + const LLT &ValueTy, uint64_t Value) { assert(Res == 0 && "Res is a result argument"); assert(ValueTy.isScalar() && "invalid offset type"); @@ -217,9 +213,8 @@ MachineIRBuilderBase::materializeGEP(unsigned &Res, unsigned Op0, return buildGEP(Res, Op0, TmpReg); } -MachineInstrBuilder MachineIRBuilderBase::buildPtrMask(unsigned Res, - unsigned Op0, - uint32_t NumBits) { +MachineInstrBuilder MachineIRBuilder::buildPtrMask(unsigned Res, unsigned Op0, + uint32_t NumBits) { assert(getMRI()->getType(Res).isPointer() && getMRI()->getType(Res) == getMRI()->getType(Op0) && "type mismatch"); @@ -229,24 +224,23 @@ MachineInstrBuilder MachineIRBuilderBase::buildPtrMask(unsigned Res, .addImm(NumBits); } -MachineInstrBuilder MachineIRBuilderBase::buildBr(MachineBasicBlock &Dest) { +MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) { return buildInstr(TargetOpcode::G_BR).addMBB(&Dest); } -MachineInstrBuilder MachineIRBuilderBase::buildBrIndirect(unsigned Tgt) { +MachineInstrBuilder MachineIRBuilder::buildBrIndirect(unsigned Tgt) { assert(getMRI()->getType(Tgt).isPointer() && "invalid branch destination"); return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt); } -MachineInstrBuilder MachineIRBuilderBase::buildCopy(unsigned Res, unsigned Op) { - assert(getMRI()->getType(Res) == LLT() || getMRI()->getType(Op) == LLT() || - getMRI()->getType(Res) == getMRI()->getType(Op)); - return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op); +MachineInstrBuilder MachineIRBuilder::buildCopy(const DstOp &Res, + const SrcOp &Op) { + return buildInstr(TargetOpcode::COPY, Res, Op); } -MachineInstrBuilder -MachineIRBuilderBase::buildConstant(unsigned Res, const ConstantInt &Val) { - LLT Ty = getMRI()->getType(Res); +MachineInstrBuilder MachineIRBuilder::buildConstant(const DstOp &Res, + const ConstantInt &Val) { + LLT Ty = Res.getLLTTy(*getMRI()); assert((Ty.isScalar() || Ty.isPointer()) && "invalid operand type"); @@ -255,48 +249,55 @@ MachineIRBuilderBase::buildConstant(unsigned Res, const ConstantInt &Val) { NewVal = ConstantInt::get(getMF().getFunction().getContext(), Val.getValue().sextOrTrunc(Ty.getSizeInBits())); - return buildInstr(TargetOpcode::G_CONSTANT).addDef(Res).addCImm(NewVal); + auto MIB = buildInstr(TargetOpcode::G_CONSTANT); + Res.addDefToMIB(*getMRI(), MIB); + MIB.addCImm(NewVal); + return MIB; } -MachineInstrBuilder MachineIRBuilderBase::buildConstant(unsigned Res, - int64_t Val) { +MachineInstrBuilder MachineIRBuilder::buildConstant(const DstOp &Res, + int64_t Val) { auto IntN = IntegerType::get(getMF().getFunction().getContext(), - getMRI()->getType(Res).getSizeInBits()); + Res.getLLTTy(*getMRI()).getSizeInBits()); ConstantInt *CI = ConstantInt::get(IntN, Val, true); return buildConstant(Res, *CI); } -MachineInstrBuilder -MachineIRBuilderBase::buildFConstant(unsigned Res, const ConstantFP &Val) { - assert(getMRI()->getType(Res).isScalar() && "invalid operand type"); +MachineInstrBuilder MachineIRBuilder::buildFConstant(const DstOp &Res, + const ConstantFP &Val) { + assert(Res.getLLTTy(*getMRI()).isScalar() && "invalid operand type"); - return buildInstr(TargetOpcode::G_FCONSTANT).addDef(Res).addFPImm(&Val); + auto MIB = buildInstr(TargetOpcode::G_FCONSTANT); + Res.addDefToMIB(*getMRI(), MIB); + MIB.addFPImm(&Val); + return MIB; } -MachineInstrBuilder MachineIRBuilderBase::buildFConstant(unsigned Res, - double Val) { - LLT DstTy = getMRI()->getType(Res); +MachineInstrBuilder MachineIRBuilder::buildFConstant(const DstOp &Res, + double Val) { + LLT DstTy = Res.getLLTTy(*getMRI()); auto &Ctx = getMF().getFunction().getContext(); auto *CFP = ConstantFP::get(Ctx, getAPFloatFromSize(Val, DstTy.getSizeInBits())); return buildFConstant(Res, *CFP); } -MachineInstrBuilder MachineIRBuilderBase::buildBrCond(unsigned Tst, - MachineBasicBlock &Dest) { +MachineInstrBuilder MachineIRBuilder::buildBrCond(unsigned Tst, + MachineBasicBlock &Dest) { assert(getMRI()->getType(Tst).isScalar() && "invalid operand type"); return buildInstr(TargetOpcode::G_BRCOND).addUse(Tst).addMBB(&Dest); } -MachineInstrBuilder MachineIRBuilderBase::buildLoad(unsigned Res, unsigned Addr, - MachineMemOperand &MMO) { +MachineInstrBuilder MachineIRBuilder::buildLoad(unsigned Res, unsigned Addr, + MachineMemOperand &MMO) { return buildLoadInstr(TargetOpcode::G_LOAD, Res, Addr, MMO); } -MachineInstrBuilder -MachineIRBuilderBase::buildLoadInstr(unsigned Opcode, unsigned Res, - unsigned Addr, MachineMemOperand &MMO) { +MachineInstrBuilder MachineIRBuilder::buildLoadInstr(unsigned Opcode, + unsigned Res, + unsigned Addr, + MachineMemOperand &MMO) { assert(getMRI()->getType(Res).isValid() && "invalid operand type"); assert(getMRI()->getType(Addr).isPointer() && "invalid operand type"); @@ -306,9 +307,8 @@ MachineIRBuilderBase::buildLoadInstr(unsigned Opcode, unsigned Res, .addMemOperand(&MMO); } -MachineInstrBuilder MachineIRBuilderBase::buildStore(unsigned Val, - unsigned Addr, - MachineMemOperand &MMO) { +MachineInstrBuilder MachineIRBuilder::buildStore(unsigned Val, unsigned Addr, + MachineMemOperand &MMO) { assert(getMRI()->getType(Val).isValid() && "invalid operand type"); assert(getMRI()->getType(Addr).isPointer() && "invalid operand type"); @@ -318,83 +318,73 @@ MachineInstrBuilder MachineIRBuilderBase::buildStore(unsigned Val, .addMemOperand(&MMO); } -MachineInstrBuilder MachineIRBuilderBase::buildUAdde(unsigned Res, - unsigned CarryOut, - unsigned Op0, unsigned Op1, - unsigned CarryIn) { - assert(getMRI()->getType(Res).isScalar() && "invalid operand type"); - assert(getMRI()->getType(Res) == getMRI()->getType(Op0) && - getMRI()->getType(Res) == getMRI()->getType(Op1) && "type mismatch"); - assert(getMRI()->getType(CarryOut).isScalar() && "invalid operand type"); - assert(getMRI()->getType(CarryOut) == getMRI()->getType(CarryIn) && - "type mismatch"); - - return buildInstr(TargetOpcode::G_UADDE) - .addDef(Res) - .addDef(CarryOut) - .addUse(Op0) - .addUse(Op1) - .addUse(CarryIn); +MachineInstrBuilder MachineIRBuilder::buildUAdde(const DstOp &Res, + const DstOp &CarryOut, + const SrcOp &Op0, + const SrcOp &Op1, + const SrcOp &CarryIn) { + return buildInstr(TargetOpcode::G_UADDE, {Res, CarryOut}, + {Op0, Op1, CarryIn}); } -MachineInstrBuilder MachineIRBuilderBase::buildAnyExt(unsigned Res, - unsigned Op) { - validateTruncExt(Res, Op, true); - return buildInstr(TargetOpcode::G_ANYEXT).addDef(Res).addUse(Op); +MachineInstrBuilder MachineIRBuilder::buildAnyExt(const DstOp &Res, + const SrcOp &Op) { + return buildInstr(TargetOpcode::G_ANYEXT, Res, Op); } -MachineInstrBuilder MachineIRBuilderBase::buildSExt(unsigned Res, unsigned Op) { - validateTruncExt(Res, Op, true); - return buildInstr(TargetOpcode::G_SEXT).addDef(Res).addUse(Op); +MachineInstrBuilder MachineIRBuilder::buildSExt(const DstOp &Res, + const SrcOp &Op) { + return buildInstr(TargetOpcode::G_SEXT, Res, Op); } -MachineInstrBuilder MachineIRBuilderBase::buildZExt(unsigned Res, unsigned Op) { - validateTruncExt(Res, Op, true); - return buildInstr(TargetOpcode::G_ZEXT).addDef(Res).addUse(Op); +MachineInstrBuilder MachineIRBuilder::buildZExt(const DstOp &Res, + const SrcOp &Op) { + return buildInstr(TargetOpcode::G_ZEXT, Res, Op); } -MachineInstrBuilder MachineIRBuilderBase::buildExtOrTrunc(unsigned ExtOpc, - unsigned Res, - unsigned Op) { +MachineInstrBuilder MachineIRBuilder::buildExtOrTrunc(unsigned ExtOpc, + const DstOp &Res, + const SrcOp &Op) { assert((TargetOpcode::G_ANYEXT == ExtOpc || TargetOpcode::G_ZEXT == ExtOpc || TargetOpcode::G_SEXT == ExtOpc) && "Expecting Extending Opc"); - assert(getMRI()->getType(Res).isScalar() || - getMRI()->getType(Res).isVector()); - assert(getMRI()->getType(Res).isScalar() == getMRI()->getType(Op).isScalar()); + assert(Res.getLLTTy(*getMRI()).isScalar() || + Res.getLLTTy(*getMRI()).isVector()); + assert(Res.getLLTTy(*getMRI()).isScalar() == + Op.getLLTTy(*getMRI()).isScalar()); unsigned Opcode = TargetOpcode::COPY; - if (getMRI()->getType(Res).getSizeInBits() > - getMRI()->getType(Op).getSizeInBits()) + if (Res.getLLTTy(*getMRI()).getSizeInBits() > + Op.getLLTTy(*getMRI()).getSizeInBits()) Opcode = ExtOpc; - else if (getMRI()->getType(Res).getSizeInBits() < - getMRI()->getType(Op).getSizeInBits()) + else if (Res.getLLTTy(*getMRI()).getSizeInBits() < + Op.getLLTTy(*getMRI()).getSizeInBits()) Opcode = TargetOpcode::G_TRUNC; else - assert(getMRI()->getType(Res) == getMRI()->getType(Op)); + assert(Res.getLLTTy(*getMRI()) == Op.getLLTTy(*getMRI())); - return buildInstr(Opcode).addDef(Res).addUse(Op); + return buildInstr(Opcode, Res, Op); } -MachineInstrBuilder MachineIRBuilderBase::buildSExtOrTrunc(unsigned Res, - unsigned Op) { +MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(const DstOp &Res, + const SrcOp &Op) { return buildExtOrTrunc(TargetOpcode::G_SEXT, Res, Op); } -MachineInstrBuilder MachineIRBuilderBase::buildZExtOrTrunc(unsigned Res, - unsigned Op) { +MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(const DstOp &Res, + const SrcOp &Op) { return buildExtOrTrunc(TargetOpcode::G_ZEXT, Res, Op); } -MachineInstrBuilder MachineIRBuilderBase::buildAnyExtOrTrunc(unsigned Res, - unsigned Op) { +MachineInstrBuilder MachineIRBuilder::buildAnyExtOrTrunc(const DstOp &Res, + const SrcOp &Op) { return buildExtOrTrunc(TargetOpcode::G_ANYEXT, Res, Op); } -MachineInstrBuilder MachineIRBuilderBase::buildCast(unsigned Dst, - unsigned Src) { - LLT SrcTy = getMRI()->getType(Src); - LLT DstTy = getMRI()->getType(Dst); +MachineInstrBuilder MachineIRBuilder::buildCast(const DstOp &Dst, + const SrcOp &Src) { + LLT SrcTy = Src.getLLTTy(*getMRI()); + LLT DstTy = Dst.getLLTTy(*getMRI()); if (SrcTy == DstTy) return buildCopy(Dst, Src); @@ -408,11 +398,11 @@ MachineInstrBuilder MachineIRBuilderBase::buildCast(unsigned Dst, Opcode = TargetOpcode::G_BITCAST; } - return buildInstr(Opcode).addDef(Dst).addUse(Src); + return buildInstr(Opcode, Dst, Src); } -MachineInstrBuilder -MachineIRBuilderBase::buildExtract(unsigned Res, unsigned Src, uint64_t Index) { +MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src, + uint64_t Index) { #ifndef NDEBUG assert(getMRI()->getType(Src).isValid() && "invalid operand type"); assert(getMRI()->getType(Res).isValid() && "invalid operand type"); @@ -433,8 +423,8 @@ MachineIRBuilderBase::buildExtract(unsigned Res, unsigned Src, uint64_t Index) { .addImm(Index); } -void MachineIRBuilderBase::buildSequence(unsigned Res, ArrayRef Ops, - ArrayRef Indices) { +void MachineIRBuilder::buildSequence(unsigned Res, ArrayRef Ops, + ArrayRef Indices) { #ifndef NDEBUG assert(Ops.size() == Indices.size() && "incompatible args"); assert(!Ops.empty() && "invalid trivial sequence"); @@ -474,56 +464,67 @@ void MachineIRBuilderBase::buildSequence(unsigned Res, ArrayRef Ops, } } -MachineInstrBuilder MachineIRBuilderBase::buildUndef(unsigned Res) { - return buildInstr(TargetOpcode::G_IMPLICIT_DEF).addDef(Res); +MachineInstrBuilder MachineIRBuilder::buildUndef(const DstOp &Res) { + return buildInstr(TargetOpcode::G_IMPLICIT_DEF, {Res}, {}); } -MachineInstrBuilder MachineIRBuilderBase::buildMerge(unsigned Res, - ArrayRef Ops) { - -#ifndef NDEBUG - assert(!Ops.empty() && "invalid trivial sequence"); - LLT Ty = getMRI()->getType(Ops[0]); - for (auto Reg : Ops) - assert(getMRI()->getType(Reg) == Ty && "type mismatch in input list"); - assert(Ops.size() * getMRI()->getType(Ops[0]).getSizeInBits() == - getMRI()->getType(Res).getSizeInBits() && - "input operands do not cover output register"); -#endif +MachineInstrBuilder MachineIRBuilder::buildMerge(const DstOp &Res, + ArrayRef Ops) { + // Unfortunately to convert from ArrayRef to ArrayRef, + // we need some temporary storage for the DstOp objects. Here we use a + // sufficiently large SmallVector to not go through the heap. + SmallVector TmpVec(Ops.begin(), Ops.end()); + return buildInstr(TargetOpcode::G_MERGE_VALUES, Res, TmpVec); +} - if (Ops.size() == 1) - return buildCast(Res, Ops[0]); +MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef Res, + const SrcOp &Op) { + // Unfortunately to convert from ArrayRef to ArrayRef, + // we need some temporary storage for the DstOp objects. Here we use a + // sufficiently large SmallVector to not go through the heap. + SmallVector TmpVec(Res.begin(), Res.end()); + return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op); +} - MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_MERGE_VALUES); - MIB.addDef(Res); - for (unsigned i = 0; i < Ops.size(); ++i) - MIB.addUse(Ops[i]); - return MIB; +MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef Res, + const SrcOp &Op) { + // Unfortunately to convert from ArrayRef to ArrayRef, + // we need some temporary storage for the DstOp objects. Here we use a + // sufficiently large SmallVector to not go through the heap. + SmallVector TmpVec(Res.begin(), Res.end()); + return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op); } -MachineInstrBuilder MachineIRBuilderBase::buildUnmerge(ArrayRef Res, - unsigned Op) { +MachineInstrBuilder MachineIRBuilder::buildBuildVector(const DstOp &Res, + ArrayRef Ops) { + // Unfortunately to convert from ArrayRef to ArrayRef, + // we need some temporary storage for the DstOp objects. Here we use a + // sufficiently large SmallVector to not go through the heap. + SmallVector TmpVec(Ops.begin(), Ops.end()); + return buildInstr(TargetOpcode::G_BUILD_VECTOR, Res, TmpVec); +} -#ifndef NDEBUG - assert(!Res.empty() && "invalid trivial sequence"); - LLT Ty = getMRI()->getType(Res[0]); - for (auto Reg : Res) - assert(getMRI()->getType(Reg) == Ty && "type mismatch in input list"); - assert(Res.size() * getMRI()->getType(Res[0]).getSizeInBits() == - getMRI()->getType(Op).getSizeInBits() && - "input operands do not cover output register"); -#endif +MachineInstrBuilder +MachineIRBuilder::buildBuildVectorTrunc(const DstOp &Res, + ArrayRef Ops) { + // Unfortunately to convert from ArrayRef to ArrayRef, + // we need some temporary storage for the DstOp objects. Here we use a + // sufficiently large SmallVector to not go through the heap. + SmallVector TmpVec(Ops.begin(), Ops.end()); + return buildInstr(TargetOpcode::G_BUILD_VECTOR_TRUNC, Res, TmpVec); +} - MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_UNMERGE_VALUES); - for (unsigned i = 0; i < Res.size(); ++i) - MIB.addDef(Res[i]); - MIB.addUse(Op); - return MIB; +MachineInstrBuilder +MachineIRBuilder::buildConcatVectors(const DstOp &Res, ArrayRef Ops) { + // Unfortunately to convert from ArrayRef to ArrayRef, + // we need some temporary storage for the DstOp objects. Here we use a + // sufficiently large SmallVector to not go through the heap. + SmallVector TmpVec(Ops.begin(), Ops.end()); + return buildInstr(TargetOpcode::G_CONCAT_VECTORS, Res, TmpVec); } -MachineInstrBuilder MachineIRBuilderBase::buildInsert(unsigned Res, - unsigned Src, unsigned Op, - unsigned Index) { +MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src, + unsigned Op, unsigned Index) { assert(Index + getMRI()->getType(Op).getSizeInBits() <= getMRI()->getType(Res).getSizeInBits() && "insertion past the end of a register"); @@ -540,9 +541,9 @@ MachineInstrBuilder MachineIRBuilderBase::buildInsert(unsigned Res, .addImm(Index); } -MachineInstrBuilder MachineIRBuilderBase::buildIntrinsic(Intrinsic::ID ID, - unsigned Res, - bool HasSideEffects) { +MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID, + unsigned Res, + bool HasSideEffects) { auto MIB = buildInstr(HasSideEffects ? TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS : TargetOpcode::G_INTRINSIC); @@ -552,133 +553,52 @@ MachineInstrBuilder MachineIRBuilderBase::buildIntrinsic(Intrinsic::ID ID, return MIB; } -MachineInstrBuilder MachineIRBuilderBase::buildTrunc(unsigned Res, - unsigned Op) { - validateTruncExt(Res, Op, false); - return buildInstr(TargetOpcode::G_TRUNC).addDef(Res).addUse(Op); +MachineInstrBuilder MachineIRBuilder::buildTrunc(const DstOp &Res, + const SrcOp &Op) { + return buildInstr(TargetOpcode::G_TRUNC, Res, Op); } -MachineInstrBuilder MachineIRBuilderBase::buildFPTrunc(unsigned Res, - unsigned Op) { - validateTruncExt(Res, Op, false); - return buildInstr(TargetOpcode::G_FPTRUNC).addDef(Res).addUse(Op); +MachineInstrBuilder MachineIRBuilder::buildFPTrunc(const DstOp &Res, + const SrcOp &Op) { + return buildInstr(TargetOpcode::G_FPTRUNC, Res, Op); } -MachineInstrBuilder MachineIRBuilderBase::buildICmp(CmpInst::Predicate Pred, - unsigned Res, unsigned Op0, - unsigned Op1) { -#ifndef NDEBUG - assert(getMRI()->getType(Op0) == getMRI()->getType(Op0) && "type mismatch"); - assert(CmpInst::isIntPredicate(Pred) && "invalid predicate"); - if (getMRI()->getType(Op0).isScalar() || getMRI()->getType(Op0).isPointer()) - assert(getMRI()->getType(Res).isScalar() && "type mismatch"); - else - assert(getMRI()->getType(Res).isVector() && - getMRI()->getType(Res).getNumElements() == - getMRI()->getType(Op0).getNumElements() && - "type mismatch"); -#endif - - return buildInstr(TargetOpcode::G_ICMP) - .addDef(Res) - .addPredicate(Pred) - .addUse(Op0) - .addUse(Op1); +MachineInstrBuilder MachineIRBuilder::buildICmp(CmpInst::Predicate Pred, + const DstOp &Res, + const SrcOp &Op0, + const SrcOp &Op1) { + return buildInstr(TargetOpcode::G_ICMP, Res, {Pred, Op0, Op1}); } -MachineInstrBuilder MachineIRBuilderBase::buildFCmp(CmpInst::Predicate Pred, - unsigned Res, unsigned Op0, - unsigned Op1) { -#ifndef NDEBUG - assert((getMRI()->getType(Op0).isScalar() || - getMRI()->getType(Op0).isVector()) && - "invalid operand type"); - assert(getMRI()->getType(Op0) == getMRI()->getType(Op1) && "type mismatch"); - assert(CmpInst::isFPPredicate(Pred) && "invalid predicate"); - if (getMRI()->getType(Op0).isScalar()) - assert(getMRI()->getType(Res).isScalar() && "type mismatch"); - else - assert(getMRI()->getType(Res).isVector() && - getMRI()->getType(Res).getNumElements() == - getMRI()->getType(Op0).getNumElements() && - "type mismatch"); -#endif +MachineInstrBuilder MachineIRBuilder::buildFCmp(CmpInst::Predicate Pred, + const DstOp &Res, + const SrcOp &Op0, + const SrcOp &Op1) { - return buildInstr(TargetOpcode::G_FCMP) - .addDef(Res) - .addPredicate(Pred) - .addUse(Op0) - .addUse(Op1); + return buildInstr(TargetOpcode::G_FCMP, Res, {Pred, Op0, Op1}); } -MachineInstrBuilder MachineIRBuilderBase::buildSelect(unsigned Res, - unsigned Tst, - unsigned Op0, - unsigned Op1) { -#ifndef NDEBUG - LLT ResTy = getMRI()->getType(Res); - assert((ResTy.isScalar() || ResTy.isVector() || ResTy.isPointer()) && - "invalid operand type"); - assert(ResTy == getMRI()->getType(Op0) && ResTy == getMRI()->getType(Op1) && - "type mismatch"); - if (ResTy.isScalar() || ResTy.isPointer()) - assert(getMRI()->getType(Tst).isScalar() && "type mismatch"); - else - assert((getMRI()->getType(Tst).isScalar() || - (getMRI()->getType(Tst).isVector() && - getMRI()->getType(Tst).getNumElements() == - getMRI()->getType(Op0).getNumElements())) && - "type mismatch"); -#endif +MachineInstrBuilder MachineIRBuilder::buildSelect(const DstOp &Res, + const SrcOp &Tst, + const SrcOp &Op0, + const SrcOp &Op1) { - return buildInstr(TargetOpcode::G_SELECT) - .addDef(Res) - .addUse(Tst) - .addUse(Op0) - .addUse(Op1); + return buildInstr(TargetOpcode::G_SELECT, {Res}, {Tst, Op0, Op1}); } MachineInstrBuilder -MachineIRBuilderBase::buildInsertVectorElement(unsigned Res, unsigned Val, - unsigned Elt, unsigned Idx) { -#ifndef NDEBUG - LLT ResTy = getMRI()->getType(Res); - LLT ValTy = getMRI()->getType(Val); - LLT EltTy = getMRI()->getType(Elt); - LLT IdxTy = getMRI()->getType(Idx); - assert(ResTy.isVector() && ValTy.isVector() && "invalid operand type"); - assert(IdxTy.isScalar() && "invalid operand type"); - assert(ResTy.getNumElements() == ValTy.getNumElements() && "type mismatch"); - assert(ResTy.getElementType() == EltTy && "type mismatch"); -#endif - - return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT) - .addDef(Res) - .addUse(Val) - .addUse(Elt) - .addUse(Idx); +MachineIRBuilder::buildInsertVectorElement(const DstOp &Res, const SrcOp &Val, + const SrcOp &Elt, const SrcOp &Idx) { + return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT, Res, {Val, Elt, Idx}); } MachineInstrBuilder -MachineIRBuilderBase::buildExtractVectorElement(unsigned Res, unsigned Val, - unsigned Idx) { -#ifndef NDEBUG - LLT ResTy = getMRI()->getType(Res); - LLT ValTy = getMRI()->getType(Val); - LLT IdxTy = getMRI()->getType(Idx); - assert(ValTy.isVector() && "invalid operand type"); - assert((ResTy.isScalar() || ResTy.isPointer()) && "invalid operand type"); - assert(IdxTy.isScalar() && "invalid operand type"); - assert(ValTy.getElementType() == ResTy && "type mismatch"); -#endif - - return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT) - .addDef(Res) - .addUse(Val) - .addUse(Idx); +MachineIRBuilder::buildExtractVectorElement(const DstOp &Res, const SrcOp &Val, + const SrcOp &Idx) { + return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT, Res, {Val, Idx}); } -MachineInstrBuilder MachineIRBuilderBase::buildAtomicCmpXchgWithSuccess( +MachineInstrBuilder MachineIRBuilder::buildAtomicCmpXchgWithSuccess( unsigned OldValRes, unsigned SuccessRes, unsigned Addr, unsigned CmpVal, unsigned NewVal, MachineMemOperand &MMO) { #ifndef NDEBUG @@ -706,9 +626,9 @@ MachineInstrBuilder MachineIRBuilderBase::buildAtomicCmpXchgWithSuccess( } MachineInstrBuilder -MachineIRBuilderBase::buildAtomicCmpXchg(unsigned OldValRes, unsigned Addr, - unsigned CmpVal, unsigned NewVal, - MachineMemOperand &MMO) { +MachineIRBuilder::buildAtomicCmpXchg(unsigned OldValRes, unsigned Addr, + unsigned CmpVal, unsigned NewVal, + MachineMemOperand &MMO) { #ifndef NDEBUG LLT OldValResTy = getMRI()->getType(OldValRes); LLT AddrTy = getMRI()->getType(Addr); @@ -730,10 +650,11 @@ MachineIRBuilderBase::buildAtomicCmpXchg(unsigned OldValRes, unsigned Addr, .addMemOperand(&MMO); } -MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMW(unsigned Opcode, unsigned OldValRes, - unsigned Addr, unsigned Val, - MachineMemOperand &MMO) { +MachineInstrBuilder MachineIRBuilder::buildAtomicRMW(unsigned Opcode, + unsigned OldValRes, + unsigned Addr, + unsigned Val, + MachineMemOperand &MMO) { #ifndef NDEBUG LLT OldValResTy = getMRI()->getType(OldValRes); LLT AddrTy = getMRI()->getType(Addr); @@ -752,74 +673,75 @@ MachineIRBuilderBase::buildAtomicRMW(unsigned Opcode, unsigned OldValRes, } MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMWXchg(unsigned OldValRes, unsigned Addr, - unsigned Val, MachineMemOperand &MMO) { +MachineIRBuilder::buildAtomicRMWXchg(unsigned OldValRes, unsigned Addr, + unsigned Val, MachineMemOperand &MMO) { return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_XCHG, OldValRes, Addr, Val, MMO); } MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMWAdd(unsigned OldValRes, unsigned Addr, - unsigned Val, MachineMemOperand &MMO) { +MachineIRBuilder::buildAtomicRMWAdd(unsigned OldValRes, unsigned Addr, + unsigned Val, MachineMemOperand &MMO) { return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_ADD, OldValRes, Addr, Val, MMO); } MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMWSub(unsigned OldValRes, unsigned Addr, - unsigned Val, MachineMemOperand &MMO) { +MachineIRBuilder::buildAtomicRMWSub(unsigned OldValRes, unsigned Addr, + unsigned Val, MachineMemOperand &MMO) { return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_SUB, OldValRes, Addr, Val, MMO); } MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMWAnd(unsigned OldValRes, unsigned Addr, - unsigned Val, MachineMemOperand &MMO) { +MachineIRBuilder::buildAtomicRMWAnd(unsigned OldValRes, unsigned Addr, + unsigned Val, MachineMemOperand &MMO) { return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_AND, OldValRes, Addr, Val, MMO); } MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMWNand(unsigned OldValRes, unsigned Addr, - unsigned Val, MachineMemOperand &MMO) { +MachineIRBuilder::buildAtomicRMWNand(unsigned OldValRes, unsigned Addr, + unsigned Val, MachineMemOperand &MMO) { return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_NAND, OldValRes, Addr, Val, MMO); } -MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMWOr(unsigned OldValRes, unsigned Addr, - unsigned Val, MachineMemOperand &MMO) { +MachineInstrBuilder MachineIRBuilder::buildAtomicRMWOr(unsigned OldValRes, + unsigned Addr, + unsigned Val, + MachineMemOperand &MMO) { return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_OR, OldValRes, Addr, Val, MMO); } MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMWXor(unsigned OldValRes, unsigned Addr, - unsigned Val, MachineMemOperand &MMO) { +MachineIRBuilder::buildAtomicRMWXor(unsigned OldValRes, unsigned Addr, + unsigned Val, MachineMemOperand &MMO) { return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_XOR, OldValRes, Addr, Val, MMO); } MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMWMax(unsigned OldValRes, unsigned Addr, - unsigned Val, MachineMemOperand &MMO) { +MachineIRBuilder::buildAtomicRMWMax(unsigned OldValRes, unsigned Addr, + unsigned Val, MachineMemOperand &MMO) { return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_MAX, OldValRes, Addr, Val, MMO); } MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMWMin(unsigned OldValRes, unsigned Addr, - unsigned Val, MachineMemOperand &MMO) { +MachineIRBuilder::buildAtomicRMWMin(unsigned OldValRes, unsigned Addr, + unsigned Val, MachineMemOperand &MMO) { return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_MIN, OldValRes, Addr, Val, MMO); } MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMWUmax(unsigned OldValRes, unsigned Addr, - unsigned Val, MachineMemOperand &MMO) { +MachineIRBuilder::buildAtomicRMWUmax(unsigned OldValRes, unsigned Addr, + unsigned Val, MachineMemOperand &MMO) { return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_UMAX, OldValRes, Addr, Val, MMO); } MachineInstrBuilder -MachineIRBuilderBase::buildAtomicRMWUmin(unsigned OldValRes, unsigned Addr, - unsigned Val, MachineMemOperand &MMO) { +MachineIRBuilder::buildAtomicRMWUmin(unsigned OldValRes, unsigned Addr, + unsigned Val, MachineMemOperand &MMO) { return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_UMIN, OldValRes, Addr, Val, MMO); } MachineInstrBuilder -MachineIRBuilderBase::buildBlockAddress(unsigned Res, const BlockAddress *BA) { +MachineIRBuilder::buildBlockAddress(unsigned Res, const BlockAddress *BA) { #ifndef NDEBUG assert(getMRI()->getType(Res).isPointer() && "invalid res type"); #endif @@ -827,12 +749,9 @@ MachineIRBuilderBase::buildBlockAddress(unsigned Res, const BlockAddress *BA) { return buildInstr(TargetOpcode::G_BLOCK_ADDR).addDef(Res).addBlockAddress(BA); } -void MachineIRBuilderBase::validateTruncExt(unsigned Dst, unsigned Src, - bool IsExtend) { +void MachineIRBuilder::validateTruncExt(const LLT &DstTy, const LLT &SrcTy, + bool IsExtend) { #ifndef NDEBUG - LLT SrcTy = getMRI()->getType(Src); - LLT DstTy = getMRI()->getType(Dst); - if (DstTy.isVector()) { assert(SrcTy.isVector() && "mismatched cast between vector and non-vector"); assert(SrcTy.getNumElements() == DstTy.getNumElements() && @@ -848,3 +767,236 @@ void MachineIRBuilderBase::validateTruncExt(unsigned Dst, unsigned Src, "invalid widening trunc"); #endif } + +void MachineIRBuilder::validateSelectOp(const LLT &ResTy, const LLT &TstTy, + const LLT &Op0Ty, const LLT &Op1Ty) { +#ifndef NDEBUG + assert((ResTy.isScalar() || ResTy.isVector() || ResTy.isPointer()) && + "invalid operand type"); + assert((ResTy == Op0Ty && ResTy == Op1Ty) && "type mismatch"); + if (ResTy.isScalar() || ResTy.isPointer()) + assert(TstTy.isScalar() && "type mismatch"); + else + assert((TstTy.isScalar() || + (TstTy.isVector() && + TstTy.getNumElements() == Op0Ty.getNumElements())) && + "type mismatch"); +#endif +} + +MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opc, + ArrayRef DstOps, + ArrayRef SrcOps, + Optional Flags) { + switch (Opc) { + default: + break; + case TargetOpcode::G_SELECT: { + assert(DstOps.size() == 1 && "Invalid select"); + assert(SrcOps.size() == 3 && "Invalid select"); + validateSelectOp( + DstOps[0].getLLTTy(*getMRI()), SrcOps[0].getLLTTy(*getMRI()), + SrcOps[1].getLLTTy(*getMRI()), SrcOps[2].getLLTTy(*getMRI())); + break; + } + case TargetOpcode::G_ADD: + case TargetOpcode::G_AND: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_MUL: + case TargetOpcode::G_OR: + case TargetOpcode::G_SHL: + case TargetOpcode::G_SUB: + case TargetOpcode::G_XOR: + case TargetOpcode::G_UDIV: + case TargetOpcode::G_SDIV: + case TargetOpcode::G_UREM: + case TargetOpcode::G_SREM: { + // All these are binary ops. + assert(DstOps.size() == 1 && "Invalid Dst"); + assert(SrcOps.size() == 2 && "Invalid Srcs"); + validateBinaryOp(DstOps[0].getLLTTy(*getMRI()), + SrcOps[0].getLLTTy(*getMRI()), + SrcOps[1].getLLTTy(*getMRI())); + break; + case TargetOpcode::G_SEXT: + case TargetOpcode::G_ZEXT: + case TargetOpcode::G_ANYEXT: + assert(DstOps.size() == 1 && "Invalid Dst"); + assert(SrcOps.size() == 1 && "Invalid Srcs"); + validateTruncExt(DstOps[0].getLLTTy(*getMRI()), + SrcOps[0].getLLTTy(*getMRI()), true); + break; + case TargetOpcode::G_TRUNC: + case TargetOpcode::G_FPTRUNC: + assert(DstOps.size() == 1 && "Invalid Dst"); + assert(SrcOps.size() == 1 && "Invalid Srcs"); + validateTruncExt(DstOps[0].getLLTTy(*getMRI()), + SrcOps[0].getLLTTy(*getMRI()), false); + break; + } + case TargetOpcode::COPY: + assert(DstOps.size() == 1 && "Invalid Dst"); + assert(SrcOps.size() == 1 && "Invalid Srcs"); + assert(DstOps[0].getLLTTy(*getMRI()) == LLT() || + SrcOps[0].getLLTTy(*getMRI()) == LLT() || + DstOps[0].getLLTTy(*getMRI()) == SrcOps[0].getLLTTy(*getMRI())); + break; + case TargetOpcode::G_FCMP: + case TargetOpcode::G_ICMP: { + assert(DstOps.size() == 1 && "Invalid Dst Operands"); + assert(SrcOps.size() == 3 && "Invalid Src Operands"); + // For F/ICMP, the first src operand is the predicate, followed by + // the two comparands. + assert(SrcOps[0].getSrcOpKind() == SrcOp::SrcType::Ty_Predicate && + "Expecting predicate"); + assert([&]() -> bool { + CmpInst::Predicate Pred = SrcOps[0].getPredicate(); + return Opc == TargetOpcode::G_ICMP ? CmpInst::isIntPredicate(Pred) + : CmpInst::isFPPredicate(Pred); + }() && "Invalid predicate"); + assert(SrcOps[1].getLLTTy(*getMRI()) == SrcOps[2].getLLTTy(*getMRI()) && + "Type mismatch"); + assert([&]() -> bool { + LLT Op0Ty = SrcOps[1].getLLTTy(*getMRI()); + LLT DstTy = DstOps[0].getLLTTy(*getMRI()); + if (Op0Ty.isScalar() || Op0Ty.isPointer()) + return DstTy.isScalar(); + else + return DstTy.isVector() && + DstTy.getNumElements() == Op0Ty.getNumElements(); + }() && "Type Mismatch"); + break; + } + case TargetOpcode::G_UNMERGE_VALUES: { + assert(!DstOps.empty() && "Invalid trivial sequence"); + assert(SrcOps.size() == 1 && "Invalid src for Unmerge"); + assert(std::all_of(DstOps.begin(), DstOps.end(), + [&, this](const DstOp &Op) { + return Op.getLLTTy(*getMRI()) == + DstOps[0].getLLTTy(*getMRI()); + }) && + "type mismatch in output list"); + assert(DstOps.size() * DstOps[0].getLLTTy(*getMRI()).getSizeInBits() == + SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() && + "input operands do not cover output register"); + break; + } + case TargetOpcode::G_MERGE_VALUES: { + assert(!SrcOps.empty() && "invalid trivial sequence"); + assert(DstOps.size() == 1 && "Invalid Dst"); + assert(std::all_of(SrcOps.begin(), SrcOps.end(), + [&, this](const SrcOp &Op) { + return Op.getLLTTy(*getMRI()) == + SrcOps[0].getLLTTy(*getMRI()); + }) && + "type mismatch in input list"); + assert(SrcOps.size() * SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() == + DstOps[0].getLLTTy(*getMRI()).getSizeInBits() && + "input operands do not cover output register"); + if (SrcOps.size() == 1) + return buildCast(DstOps[0], SrcOps[0]); + if (DstOps[0].getLLTTy(*getMRI()).isVector()) + return buildInstr(TargetOpcode::G_CONCAT_VECTORS, DstOps, SrcOps); + break; + } + case TargetOpcode::G_EXTRACT_VECTOR_ELT: { + assert(DstOps.size() == 1 && "Invalid Dst size"); + assert(SrcOps.size() == 2 && "Invalid Src size"); + assert(SrcOps[0].getLLTTy(*getMRI()).isVector() && "Invalid operand type"); + assert((DstOps[0].getLLTTy(*getMRI()).isScalar() || + DstOps[0].getLLTTy(*getMRI()).isPointer()) && + "Invalid operand type"); + assert(SrcOps[1].getLLTTy(*getMRI()).isScalar() && "Invalid operand type"); + assert(SrcOps[0].getLLTTy(*getMRI()).getElementType() == + DstOps[0].getLLTTy(*getMRI()) && + "Type mismatch"); + break; + } + case TargetOpcode::G_INSERT_VECTOR_ELT: { + assert(DstOps.size() == 1 && "Invalid dst size"); + assert(SrcOps.size() == 3 && "Invalid src size"); + assert(DstOps[0].getLLTTy(*getMRI()).isVector() && + SrcOps[0].getLLTTy(*getMRI()).isVector() && "Invalid operand type"); + assert(DstOps[0].getLLTTy(*getMRI()).getElementType() == + SrcOps[1].getLLTTy(*getMRI()) && + "Type mismatch"); + assert(SrcOps[2].getLLTTy(*getMRI()).isScalar() && "Invalid index"); + assert(DstOps[0].getLLTTy(*getMRI()).getNumElements() == + SrcOps[0].getLLTTy(*getMRI()).getNumElements() && + "Type mismatch"); + break; + } + case TargetOpcode::G_BUILD_VECTOR: { + assert((!SrcOps.empty() || SrcOps.size() < 2) && + "Must have at least 2 operands"); + assert(DstOps.size() == 1 && "Invalid DstOps"); + assert(DstOps[0].getLLTTy(*getMRI()).isVector() && + "Res type must be a vector"); + assert(std::all_of(SrcOps.begin(), SrcOps.end(), + [&, this](const SrcOp &Op) { + return Op.getLLTTy(*getMRI()) == + SrcOps[0].getLLTTy(*getMRI()); + }) && + "type mismatch in input list"); + assert(SrcOps.size() * SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() == + DstOps[0].getLLTTy(*getMRI()).getSizeInBits() && + "input scalars do not exactly cover the outpur vector register"); + break; + } + case TargetOpcode::G_BUILD_VECTOR_TRUNC: { + assert((!SrcOps.empty() || SrcOps.size() < 2) && + "Must have at least 2 operands"); + assert(DstOps.size() == 1 && "Invalid DstOps"); + assert(DstOps[0].getLLTTy(*getMRI()).isVector() && + "Res type must be a vector"); + assert(std::all_of(SrcOps.begin(), SrcOps.end(), + [&, this](const SrcOp &Op) { + return Op.getLLTTy(*getMRI()) == + SrcOps[0].getLLTTy(*getMRI()); + }) && + "type mismatch in input list"); + if (SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() == + DstOps[0].getLLTTy(*getMRI()).getElementType().getSizeInBits()) + return buildInstr(TargetOpcode::G_BUILD_VECTOR, DstOps, SrcOps); + break; + } + case TargetOpcode::G_CONCAT_VECTORS: { + assert(DstOps.size() == 1 && "Invalid DstOps"); + assert((!SrcOps.empty() || SrcOps.size() < 2) && + "Must have at least 2 operands"); + assert(std::all_of(SrcOps.begin(), SrcOps.end(), + [&, this](const SrcOp &Op) { + return (Op.getLLTTy(*getMRI()).isVector() && + Op.getLLTTy(*getMRI()) == + SrcOps[0].getLLTTy(*getMRI())); + }) && + "type mismatch in input list"); + assert(SrcOps.size() * SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() == + DstOps[0].getLLTTy(*getMRI()).getSizeInBits() && + "input vectors do not exactly cover the outpur vector register"); + break; + } + case TargetOpcode::G_UADDE: { + assert(DstOps.size() == 2 && "Invalid no of dst operands"); + assert(SrcOps.size() == 3 && "Invalid no of src operands"); + assert(DstOps[0].getLLTTy(*getMRI()).isScalar() && "Invalid operand"); + assert((DstOps[0].getLLTTy(*getMRI()) == SrcOps[0].getLLTTy(*getMRI())) && + (DstOps[0].getLLTTy(*getMRI()) == SrcOps[1].getLLTTy(*getMRI())) && + "Invalid operand"); + assert(DstOps[1].getLLTTy(*getMRI()).isScalar() && "Invalid operand"); + assert(DstOps[1].getLLTTy(*getMRI()) == SrcOps[2].getLLTTy(*getMRI()) && + "type mismatch"); + break; + } + } + + auto MIB = buildInstr(Opc); + for (const DstOp &Op : DstOps) + Op.addDefToMIB(*getMRI(), MIB); + for (const SrcOp &Op : SrcOps) + Op.addSrcToMIB(MIB); + if (Flags) + MIB->setFlags(*Flags); + return MIB; +} diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 6bb48dc2e8aac8cef90628f54a512bd99bf7fdb1..98df7d34857e3bbb978a2484f62a36925b5c0226 100644 --- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -116,7 +116,7 @@ bool RegBankSelect::assignmentMatch( OnlyAssign = false; // Each part of a break down needs to end up in a different register. // In other word, Reg assignement does not match. - if (ValMapping.NumBreakDowns > 1) + if (ValMapping.NumBreakDowns != 1) return false; const RegisterBank *CurRegBank = RBI->getRegBank(Reg, *MRI, *TRI); diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp index deb49a1ea4826acaacf55a12f05c56003d787258..f411ee6745d0ed6f23326ea902a0d595de5fe6f4 100644 --- a/lib/CodeGen/ImplicitNullChecks.cpp +++ b/lib/CodeGen/ImplicitNullChecks.cpp @@ -360,10 +360,10 @@ ImplicitNullChecks::SuitabilityResult ImplicitNullChecks::isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg, ArrayRef PrevInsts) { int64_t Offset; - unsigned BaseReg; + MachineOperand *BaseOp; - if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI) || - BaseReg != PointerReg) + if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) || + !BaseOp->isReg() || BaseOp->getReg() != PointerReg) return SR_Unsuitable; // We want the mem access to be issued at a sane offset from PointerReg, diff --git a/lib/CodeGen/InterleavedLoadCombinePass.cpp b/lib/CodeGen/InterleavedLoadCombinePass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..989fa164ad2dd2ef3887e1353f8647019b909aac --- /dev/null +++ b/lib/CodeGen/InterleavedLoadCombinePass.cpp @@ -0,0 +1,1359 @@ +//===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// +// This file defines the interleaved-load-combine pass. The pass searches for +// ShuffleVectorInstruction that execute interleaving loads. If a matching +// pattern is found, it adds a combined load and further instructions in a +// pattern that is detectable by InterleavedAccesPass. The old instructions are +// left dead to be removed later. The pass is specifically designed to be +// executed just before InterleavedAccesPass to find any left-over instances +// that are not detected within former passes. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "interleaved-load-combine" + +namespace { + +/// Statistic counter +STATISTIC(NumInterleavedLoadCombine, "Number of combined loads"); + +/// Option to disable the pass +static cl::opt DisableInterleavedLoadCombine( + "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden, + cl::desc("Disable combining of interleaved loads")); + +struct VectorInfo; + +struct InterleavedLoadCombineImpl { +public: + InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA, + TargetMachine &TM) + : F(F), DT(DT), MSSA(MSSA), + TLI(*TM.getSubtargetImpl(F)->getTargetLowering()), + TTI(TM.getTargetTransformInfo(F)) {} + + /// Scan the function for interleaved load candidates and execute the + /// replacement if applicable. + bool run(); + +private: + /// Function this pass is working on + Function &F; + + /// Dominator Tree Analysis + DominatorTree &DT; + + /// Memory Alias Analyses + MemorySSA &MSSA; + + /// Target Lowering Information + const TargetLowering &TLI; + + /// Target Transform Information + const TargetTransformInfo TTI; + + /// Find the instruction in sets LIs that dominates all others, return nullptr + /// if there is none. + LoadInst *findFirstLoad(const std::set &LIs); + + /// Replace interleaved load candidates. It does additional + /// analyses if this makes sense. Returns true on success and false + /// of nothing has been changed. + bool combine(std::list &InterleavedLoad, + OptimizationRemarkEmitter &ORE); + + /// Given a set of VectorInfo containing candidates for a given interleave + /// factor, find a set that represents a 'factor' interleaved load. + bool findPattern(std::list &Candidates, + std::list &InterleavedLoad, unsigned Factor, + const DataLayout &DL); +}; // InterleavedLoadCombine + +/// First Order Polynomial on an n-Bit Integer Value +/// +/// Polynomial(Value) = Value * B + A + E*2^(n-e) +/// +/// A and B are the coefficients. E*2^(n-e) is an error within 'e' most +/// significant bits. It is introduced if an exact computation cannot be proven +/// (e.q. division by 2). +/// +/// As part of this optimization multiple loads will be combined. It necessary +/// to prove that loads are within some relative offset to each other. This +/// class is used to prove relative offsets of values loaded from memory. +/// +/// Representing an integer in this form is sound since addition in two's +/// complement is associative (trivial) and multiplication distributes over the +/// addition (see Proof(1) in Polynomial::mul). Further, both operations +/// commute. +// +// Example: +// declare @fn(i64 %IDX, <4 x float>* %PTR) { +// %Pa1 = add i64 %IDX, 2 +// %Pa2 = lshr i64 %Pa1, 1 +// %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2 +// %Va = load <4 x float>, <4 x float>* %Pa3 +// +// %Pb1 = add i64 %IDX, 4 +// %Pb2 = lshr i64 %Pb1, 1 +// %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2 +// %Vb = load <4 x float>, <4 x float>* %Pb3 +// ... } +// +// The goal is to prove that two loads load consecutive addresses. +// +// In this case the polynomials are constructed by the following +// steps. +// +// The number tag #e specifies the error bits. +// +// Pa_0 = %IDX #0 +// Pa_1 = %IDX + 2 #0 | add 2 +// Pa_2 = %IDX/2 + 1 #1 | lshr 1 +// Pa_3 = %IDX/2 + 1 #1 | GEP, step signext to i64 +// Pa_4 = (%IDX/2)*16 + 16 #0 | GEP, multiply index by sizeof(4) for floats +// Pa_5 = (%IDX/2)*16 + 16 #0 | GEP, add offset of leading components +// +// Pb_0 = %IDX #0 +// Pb_1 = %IDX + 4 #0 | add 2 +// Pb_2 = %IDX/2 + 2 #1 | lshr 1 +// Pb_3 = %IDX/2 + 2 #1 | GEP, step signext to i64 +// Pb_4 = (%IDX/2)*16 + 32 #0 | GEP, multiply index by sizeof(4) for floats +// Pb_5 = (%IDX/2)*16 + 16 #0 | GEP, add offset of leading components +// +// Pb_5 - Pa_5 = 16 #0 | subtract to get the offset +// +// Remark: %PTR is not maintained within this class. So in this instance the +// offset of 16 can only be assumed if the pointers are equal. +// +class Polynomial { + /// Operations on B + enum BOps { + LShr, + Mul, + SExt, + Trunc, + }; + + /// Number of Error Bits e + unsigned ErrorMSBs; + + /// Value + Value *V; + + /// Coefficient B + SmallVector, 4> B; + + /// Coefficient A + APInt A; + +public: + Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V), B(), A() { + IntegerType *Ty = dyn_cast(V->getType()); + if (Ty) { + ErrorMSBs = 0; + this->V = V; + A = APInt(Ty->getBitWidth(), 0); + } + } + + Polynomial(const APInt &A, unsigned ErrorMSBs = 0) + : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(A) {} + + Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0) + : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(BitWidth, A) {} + + Polynomial() : ErrorMSBs((unsigned)-1), V(NULL), B(), A() {} + + /// Increment and clamp the number of undefined bits. + void incErrorMSBs(unsigned amt) { + if (ErrorMSBs == (unsigned)-1) + return; + + ErrorMSBs += amt; + if (ErrorMSBs > A.getBitWidth()) + ErrorMSBs = A.getBitWidth(); + } + + /// Decrement and clamp the number of undefined bits. + void decErrorMSBs(unsigned amt) { + if (ErrorMSBs == (unsigned)-1) + return; + + if (ErrorMSBs > amt) + ErrorMSBs -= amt; + else + ErrorMSBs = 0; + } + + /// Apply an add on the polynomial + Polynomial &add(const APInt &C) { + // Note: Addition is associative in two's complement even when in case of + // signed overflow. + // + // Error bits can only propagate into higher significant bits. As these are + // already regarded as undefined, there is no change. + // + // Theorem: Adding a constant to a polynomial does not change the error + // term. + // + // Proof: + // + // Since the addition is associative and commutes: + // + // (B + A + E*2^(n-e)) + C = B + (A + C) + E*2^(n-e) + // [qed] + + if (C.getBitWidth() != A.getBitWidth()) { + ErrorMSBs = (unsigned)-1; + return *this; + } + + A += C; + return *this; + } + + /// Apply a multiplication onto the polynomial. + Polynomial &mul(const APInt &C) { + // Note: Multiplication distributes over the addition + // + // Theorem: Multiplication distributes over the addition + // + // Proof(1): + // + // (B+A)*C =- + // = (B + A) + (B + A) + .. {C Times} + // addition is associative and commutes, hence + // = B + B + .. {C Times} .. + A + A + .. {C times} + // = B*C + A*C + // (see (function add) for signed values and overflows) + // [qed] + // + // Theorem: If C has c trailing zeros, errors bits in A or B are shifted out + // to the left. + // + // Proof(2): + // + // Let B' and A' be the n-Bit inputs with some unknown errors EA, + // EB at e leading bits. B' and A' can be written down as: + // + // B' = B + 2^(n-e)*EB + // A' = A + 2^(n-e)*EA + // + // Let C' be an input with c trailing zero bits. C' can be written as + // + // C' = C*2^c + // + // Therefore we can compute the result by using distributivity and + // commutativity. + // + // (B'*C' + A'*C') = [B + 2^(n-e)*EB] * C' + [A + 2^(n-e)*EA] * C' = + // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' = + // = (B'+A') * C' = + // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' = + // = [B + A + 2^(n-e)*EB + 2^(n-e)*EA] * C' = + // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C' = + // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C*2^c = + // = (B + A) * C' + C*(EB + EA)*2^(n-e)*2^c = + // + // Let EC be the final error with EC = C*(EB + EA) + // + // = (B + A)*C' + EC*2^(n-e)*2^c = + // = (B + A)*C' + EC*2^(n-(e-c)) + // + // Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c + // less error bits than the input. c bits are shifted out to the left. + // [qed] + + if (C.getBitWidth() != A.getBitWidth()) { + ErrorMSBs = (unsigned)-1; + return *this; + } + + // Multiplying by one is a no-op. + if (C.isOneValue()) { + return *this; + } + + // Multiplying by zero removes the coefficient B and defines all bits. + if (C.isNullValue()) { + ErrorMSBs = 0; + deleteB(); + } + + // See Proof(2): Trailing zero bits indicate a left shift. This removes + // leading bits from the result even if they are undefined. + decErrorMSBs(C.countTrailingZeros()); + + A *= C; + pushBOperation(Mul, C); + return *this; + } + + /// Apply a logical shift right on the polynomial + Polynomial &lshr(const APInt &C) { + // Theorem(1): (B + A + E*2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'*2^(n-e') + // where + // e' = e + 1, + // E is a e-bit number, + // E' is a e'-bit number, + // holds under the following precondition: + // pre(1): A % 2 = 0 + // pre(2): e < n, (see Theorem(2) for the trivial case with e=n) + // where >> expresses a logical shift to the right, with adding zeros. + // + // We need to show that for every, E there is a E' + // + // B = b_h * 2^(n-1) + b_m * 2 + b_l + // A = a_h * 2^(n-1) + a_m * 2 (pre(1)) + // + // where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers + // + // Let X = (B + A + E*2^(n-e)) >> 1 + // Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1 + // + // X = [B + A + E*2^(n-e)] >> 1 = + // = [ b_h * 2^(n-1) + b_m * 2 + b_l + + // + a_h * 2^(n-1) + a_m * 2 + + // + E * 2^(n-e) ] >> 1 = + // + // The sum is built by putting the overflow of [a_m + b+n] into the term + // 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within + // this bit is discarded. This is expressed by % 2. + // + // The bit in position 0 cannot overflow into the term (b_m + a_m). + // + // = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) + + // + ((b_m + a_m) % 2^(n-2)) * 2 + + // + b_l + E * 2^(n-e) ] >> 1 = + // + // The shift is computed by dividing the terms by 2 and by cutting off + // b_l. + // + // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + + // + ((b_m + a_m) % 2^(n-2)) + + // + E * 2^(n-(e+1)) = + // + // by the definition in the Theorem e+1 = e' + // + // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + + // + ((b_m + a_m) % 2^(n-2)) + + // + E * 2^(n-e') = + // + // Compute Y by applying distributivity first + // + // Y = (B >> 1) + (A >> 1) + E*2^(n-e') = + // = (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 + + // + (a_h * 2^(n-1) + a_m * 2) >> 1 + + // + E * 2^(n-e) >> 1 = + // + // Again, the shift is computed by dividing the terms by 2 and by cutting + // off b_l. + // + // = b_h * 2^(n-2) + b_m + + // + a_h * 2^(n-2) + a_m + + // + E * 2^(n-(e+1)) = + // + // Again, the sum is built by putting the overflow of [a_m + b+n] into + // the term 2^(n-1). But this time there is room for a second bit in the + // term 2^(n-2) we add this bit to a new term and denote it o_h in a + // second step. + // + // = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) + + // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + + // + ((b_m + a_m) % 2^(n-2)) + + // + E * 2^(n-(e+1)) = + // + // Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1 + // Further replace e+1 by e'. + // + // = o_h * 2^(n-1) + + // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + + // + ((b_m + a_m) % 2^(n-2)) + + // + E * 2^(n-e') = + // + // Move o_h into the error term and construct E'. To ensure that there is + // no 2^x with negative x, this step requires pre(2) (e < n). + // + // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + + // + ((b_m + a_m) % 2^(n-2)) + + // + o_h * 2^(e'-1) * 2^(n-e') + | pre(2), move 2^(e'-1) + // | out of the old exponent + // + E * 2^(n-e') = + // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + + // + ((b_m + a_m) % 2^(n-2)) + + // + [o_h * 2^(e'-1) + E] * 2^(n-e') + | move 2^(e'-1) out of + // | the old exponent + // + // Let E' = o_h * 2^(e'-1) + E + // + // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + + // + ((b_m + a_m) % 2^(n-2)) + + // + E' * 2^(n-e') + // + // Because X and Y are distinct only in there error terms and E' can be + // constructed as shown the theorem holds. + // [qed] + // + // For completeness in case of the case e=n it is also required to show that + // distributivity can be applied. + // + // In this case Theorem(1) transforms to (the pre-condition on A can also be + // dropped) + // + // Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E' + // where + // A, B, E, E' are two's complement numbers with the same bit + // width + // + // Let A + B + E = X + // Let (B >> 1) + (A >> 1) = Y + // + // Therefore we need to show that for every X and Y there is an E' which + // makes the equation + // + // X = Y + E' + // + // hold. This is trivially the case for E' = X - Y. + // + // [qed] + // + // Remark: Distributing lshr with and arbitrary number n can be expressed as + // ((((B + A) lshr 1) lshr 1) ... ) {n times}. + // This construction induces n additional error bits at the left. + + if (C.getBitWidth() != A.getBitWidth()) { + ErrorMSBs = (unsigned)-1; + return *this; + } + + if (C.isNullValue()) + return *this; + + // Test if the result will be zero + unsigned shiftAmt = C.getZExtValue(); + if (shiftAmt >= C.getBitWidth()) + return mul(APInt(C.getBitWidth(), 0)); + + // The proof that shiftAmt LSBs are zero for at least one summand is only + // possible for the constant number. + // + // If this can be proven add shiftAmt to the error counter + // `ErrorMSBs`. Otherwise set all bits as undefined. + if (A.countTrailingZeros() < shiftAmt) + ErrorMSBs = A.getBitWidth(); + else + incErrorMSBs(shiftAmt); + + // Apply the operation. + pushBOperation(LShr, C); + A = A.lshr(shiftAmt); + + return *this; + } + + /// Apply a sign-extend or truncate operation on the polynomial. + Polynomial &sextOrTrunc(unsigned n) { + if (n < A.getBitWidth()) { + // Truncate: Clearly undefined Bits on the MSB side are removed + // if there are any. + decErrorMSBs(A.getBitWidth() - n); + A = A.trunc(n); + pushBOperation(Trunc, APInt(sizeof(n) * 8, n)); + } + if (n > A.getBitWidth()) { + // Extend: Clearly extending first and adding later is different + // to adding first and extending later in all extended bits. + incErrorMSBs(n - A.getBitWidth()); + A = A.sext(n); + pushBOperation(SExt, APInt(sizeof(n) * 8, n)); + } + + return *this; + } + + /// Test if there is a coefficient B. + bool isFirstOrder() const { return V != nullptr; } + + /// Test coefficient B of two Polynomials are equal. + bool isCompatibleTo(const Polynomial &o) const { + // The polynomial use different bit width. + if (A.getBitWidth() != o.A.getBitWidth()) + return false; + + // If neither Polynomial has the Coefficient B. + if (!isFirstOrder() && !o.isFirstOrder()) + return true; + + // The index variable is different. + if (V != o.V) + return false; + + // Check the operations. + if (B.size() != o.B.size()) + return false; + + auto ob = o.B.begin(); + for (auto &b : B) { + if (b != *ob) + return false; + ob++; + } + + return true; + } + + /// Subtract two polynomials, return an undefined polynomial if + /// subtraction is not possible. + Polynomial operator-(const Polynomial &o) const { + // Return an undefined polynomial if incompatible. + if (!isCompatibleTo(o)) + return Polynomial(); + + // If the polynomials are compatible (meaning they have the same + // coefficient on B), B is eliminated. Thus a polynomial solely + // containing A is returned + return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs)); + } + + /// Subtract a constant from a polynomial, + Polynomial operator-(uint64_t C) const { + Polynomial Result(*this); + Result.A -= C; + return Result; + } + + /// Add a constant to a polynomial, + Polynomial operator+(uint64_t C) const { + Polynomial Result(*this); + Result.A += C; + return Result; + } + + /// Returns true if it can be proven that two Polynomials are equal. + bool isProvenEqualTo(const Polynomial &o) { + // Subtract both polynomials and test if it is fully defined and zero. + Polynomial r = *this - o; + return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isNullValue()); + } + + /// Print the polynomial into a stream. + void print(raw_ostream &OS) const { + OS << "[{#ErrBits:" << ErrorMSBs << "} "; + + if (V) { + for (auto b : B) + OS << "("; + OS << "(" << *V << ") "; + + for (auto b : B) { + switch (b.first) { + case LShr: + OS << "LShr "; + break; + case Mul: + OS << "Mul "; + break; + case SExt: + OS << "SExt "; + break; + case Trunc: + OS << "Trunc "; + break; + } + + OS << b.second << ") "; + } + } + + OS << "+ " << A << "]"; + } + +private: + void deleteB() { + V = nullptr; + B.clear(); + } + + void pushBOperation(const BOps Op, const APInt &C) { + if (isFirstOrder()) { + B.push_back(std::make_pair(Op, C)); + return; + } + } +}; + +#ifndef NDEBUG +static raw_ostream &operator<<(raw_ostream &OS, const Polynomial &S) { + S.print(OS); + return OS; +} +#endif + +/// VectorInfo stores abstract the following information for each vector +/// element: +/// +/// 1) The the memory address loaded into the element as Polynomial +/// 2) a set of load instruction necessary to construct the vector, +/// 3) a set of all other instructions that are necessary to create the vector and +/// 4) a pointer value that can be used as relative base for all elements. +struct VectorInfo { +private: + VectorInfo(const VectorInfo &c) : VTy(c.VTy) { + llvm_unreachable( + "Copying VectorInfo is neither implemented nor necessary,"); + } + +public: + /// Information of a Vector Element + struct ElementInfo { + /// Offset Polynomial. + Polynomial Ofs; + + /// The Load Instruction used to Load the entry. LI is null if the pointer + /// of the load instruction does not point on to the entry + LoadInst *LI; + + ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr) + : Ofs(Offset), LI(LI) {} + }; + + /// Basic-block the load instructions are within + BasicBlock *BB; + + /// Pointer value of all participation load instructions + Value *PV; + + /// Participating load instructions + std::set LIs; + + /// Participating instructions + std::set Is; + + /// Final shuffle-vector instruction + ShuffleVectorInst *SVI; + + /// Information of the offset for each vector element + ElementInfo *EI; + + /// Vector Type + VectorType *const VTy; + + VectorInfo(VectorType *VTy) + : BB(nullptr), PV(nullptr), LIs(), Is(), SVI(nullptr), VTy(VTy) { + EI = new ElementInfo[VTy->getNumElements()]; + } + + virtual ~VectorInfo() { delete[] EI; } + + unsigned getDimension() const { return VTy->getNumElements(); } + + /// Test if the VectorInfo can be part of an interleaved load with the + /// specified factor. + /// + /// \param Factor of the interleave + /// \param DL Targets Datalayout + /// + /// \returns true if this is possible and false if not + bool isInterleaved(unsigned Factor, const DataLayout &DL) const { + unsigned Size = DL.getTypeAllocSize(VTy->getElementType()); + for (unsigned i = 1; i < getDimension(); i++) { + if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) { + return false; + } + } + return true; + } + + /// Recursively computes the vector information stored in V. + /// + /// This function delegates the work to specialized implementations + /// + /// \param V Value to operate on + /// \param Result Result of the computation + /// + /// \returns false if no sensible information can be gathered. + static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) { + ShuffleVectorInst *SVI = dyn_cast(V); + if (SVI) + return computeFromSVI(SVI, Result, DL); + LoadInst *LI = dyn_cast(V); + if (LI) + return computeFromLI(LI, Result, DL); + BitCastInst *BCI = dyn_cast(V); + if (BCI) + return computeFromBCI(BCI, Result, DL); + return false; + } + + /// BitCastInst specialization to compute the vector information. + /// + /// \param BCI BitCastInst to operate on + /// \param Result Result of the computation + /// + /// \returns false if no sensible information can be gathered. + static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result, + const DataLayout &DL) { + Instruction *Op = dyn_cast(BCI->getOperand(0)); + + if (!Op) + return false; + + VectorType *VTy = dyn_cast(Op->getType()); + if (!VTy) + return false; + + // We can only cast from large to smaller vectors + if (Result.VTy->getNumElements() % VTy->getNumElements()) + return false; + + unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements(); + unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType()); + unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType()); + + if (NewSize * Factor != OldSize) + return false; + + VectorInfo Old(VTy); + if (!compute(Op, Old, DL)) + return false; + + for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) { + for (unsigned j = 0; j < Factor; j++) { + Result.EI[i + j] = + ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize, + j == 0 ? Old.EI[i / Factor].LI : nullptr); + } + } + + Result.BB = Old.BB; + Result.PV = Old.PV; + Result.LIs.insert(Old.LIs.begin(), Old.LIs.end()); + Result.Is.insert(Old.Is.begin(), Old.Is.end()); + Result.Is.insert(BCI); + Result.SVI = nullptr; + + return true; + } + + /// ShuffleVectorInst specialization to compute vector information. + /// + /// \param SVI ShuffleVectorInst to operate on + /// \param Result Result of the computation + /// + /// Compute the left and the right side vector information and merge them by + /// applying the shuffle operation. This function also ensures that the left + /// and right side have compatible loads. This means that all loads are with + /// in the same basic block and are based on the same pointer. + /// + /// \returns false if no sensible information can be gathered. + static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result, + const DataLayout &DL) { + VectorType *ArgTy = dyn_cast(SVI->getOperand(0)->getType()); + assert(ArgTy && "ShuffleVector Operand is not a VectorType"); + + // Compute the left hand vector information. + VectorInfo LHS(ArgTy); + if (!compute(SVI->getOperand(0), LHS, DL)) + LHS.BB = nullptr; + + // Compute the right hand vector information. + VectorInfo RHS(ArgTy); + if (!compute(SVI->getOperand(1), RHS, DL)) + RHS.BB = nullptr; + + // Neither operand produced sensible results? + if (!LHS.BB && !RHS.BB) + return false; + // Only RHS produced sensible results? + else if (!LHS.BB) { + Result.BB = RHS.BB; + Result.PV = RHS.PV; + } + // Only LHS produced sensible results? + else if (!RHS.BB) { + Result.BB = LHS.BB; + Result.PV = LHS.PV; + } + // Both operands produced sensible results? + else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) { + Result.BB = LHS.BB; + Result.PV = LHS.PV; + } + // Both operands produced sensible results but they are incompatible. + else { + return false; + } + + // Merge and apply the operation on the offset information. + if (LHS.BB) { + Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end()); + Result.Is.insert(LHS.Is.begin(), LHS.Is.end()); + } + if (RHS.BB) { + Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end()); + Result.Is.insert(RHS.Is.begin(), RHS.Is.end()); + } + Result.Is.insert(SVI); + Result.SVI = SVI; + + int j = 0; + for (int i : SVI->getShuffleMask()) { + assert((i < 2 * (signed)ArgTy->getNumElements()) && + "Invalid ShuffleVectorInst (index out of bounds)"); + + if (i < 0) + Result.EI[j] = ElementInfo(); + else if (i < (signed)ArgTy->getNumElements()) { + if (LHS.BB) + Result.EI[j] = LHS.EI[i]; + else + Result.EI[j] = ElementInfo(); + } else { + if (RHS.BB) + Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()]; + else + Result.EI[j] = ElementInfo(); + } + j++; + } + + return true; + } + + /// LoadInst specialization to compute vector information. + /// + /// This function also acts as abort condition to the recursion. + /// + /// \param LI LoadInst to operate on + /// \param Result Result of the computation + /// + /// \returns false if no sensible information can be gathered. + static bool computeFromLI(LoadInst *LI, VectorInfo &Result, + const DataLayout &DL) { + Value *BasePtr; + Polynomial Offset; + + if (LI->isVolatile()) + return false; + + if (LI->isAtomic()) + return false; + + // Get the base polynomial + computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL); + + Result.BB = LI->getParent(); + Result.PV = BasePtr; + Result.LIs.insert(LI); + Result.Is.insert(LI); + + for (unsigned i = 0; i < Result.getDimension(); i++) { + Value *Idx[2] = { + ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0), + ConstantInt::get(Type::getInt32Ty(LI->getContext()), i), + }; + int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, makeArrayRef(Idx, 2)); + Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr); + } + + return true; + } + + /// Recursively compute polynomial of a value. + /// + /// \param BO Input binary operation + /// \param Result Result polynomial + static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) { + Value *LHS = BO.getOperand(0); + Value *RHS = BO.getOperand(1); + + // Find the RHS Constant if any + ConstantInt *C = dyn_cast(RHS); + if ((!C) && BO.isCommutative()) { + C = dyn_cast(LHS); + if (C) + std::swap(LHS, RHS); + } + + switch (BO.getOpcode()) { + case Instruction::Add: + if (!C) + break; + + computePolynomial(*LHS, Result); + Result.add(C->getValue()); + return; + + case Instruction::LShr: + if (!C) + break; + + computePolynomial(*LHS, Result); + Result.lshr(C->getValue()); + return; + + default: + break; + } + + Result = Polynomial(&BO); + } + + /// Recursively compute polynomial of a value + /// + /// \param V input value + /// \param Result result polynomial + static void computePolynomial(Value &V, Polynomial &Result) { + if (isa(&V)) + computePolynomialBinOp(*dyn_cast(&V), Result); + else + Result = Polynomial(&V); + } + + /// Compute the Polynomial representation of a Pointer type. + /// + /// \param Ptr input pointer value + /// \param Result result polynomial + /// \param BasePtr pointer the polynomial is based on + /// \param DL Datalayout of the target machine + static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result, + Value *&BasePtr, + const DataLayout &DL) { + // Not a pointer type? Return an undefined polynomial + PointerType *PtrTy = dyn_cast(Ptr.getType()); + if (!PtrTy) { + Result = Polynomial(); + BasePtr = nullptr; + } + unsigned PointerBits = + DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()); + + /// Skip pointer casts. Return Zero polynomial otherwise + if (isa(&Ptr)) { + CastInst &CI = *cast(&Ptr); + switch (CI.getOpcode()) { + case Instruction::BitCast: + computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL); + break; + default: + BasePtr = &Ptr; + Polynomial(PointerBits, 0); + break; + } + } + /// Resolve GetElementPtrInst. + else if (isa(&Ptr)) { + GetElementPtrInst &GEP = *cast(&Ptr); + + APInt BaseOffset(PointerBits, 0); + + // Check if we can compute the Offset with accumulateConstantOffset + if (GEP.accumulateConstantOffset(DL, BaseOffset)) { + Result = Polynomial(BaseOffset); + BasePtr = GEP.getPointerOperand(); + return; + } else { + // Otherwise we allow that the last index operand of the GEP is + // non-constant. + unsigned idxOperand, e; + SmallVector Indices; + for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e; + idxOperand++) { + ConstantInt *IDX = dyn_cast(GEP.getOperand(idxOperand)); + if (!IDX) + break; + Indices.push_back(IDX); + } + + // It must also be the last operand. + if (idxOperand + 1 != e) { + Result = Polynomial(); + BasePtr = nullptr; + return; + } + + // Compute the polynomial of the index operand. + computePolynomial(*GEP.getOperand(idxOperand), Result); + + // Compute base offset from zero based index, excluding the last + // variable operand. + BaseOffset = + DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices); + + // Apply the operations of GEP to the polynomial. + unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType()); + Result.sextOrTrunc(PointerBits); + Result.mul(APInt(PointerBits, ResultSize)); + Result.add(BaseOffset); + BasePtr = GEP.getPointerOperand(); + } + } + // All other instructions are handled by using the value as base pointer and + // a zero polynomial. + else { + BasePtr = &Ptr; + Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0); + } + } + +#ifndef NDEBUG + void print(raw_ostream &OS) const { + if (PV) + OS << *PV; + else + OS << "(none)"; + OS << " + "; + for (unsigned i = 0; i < getDimension(); i++) + OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs; + OS << "]"; + } +#endif +}; + +} // anonymous namespace + +bool InterleavedLoadCombineImpl::findPattern( + std::list &Candidates, std::list &InterleavedLoad, + unsigned Factor, const DataLayout &DL) { + for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) { + unsigned i; + // Try to find an interleaved load using the front of Worklist as first line + unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType()); + + // List containing iterators pointing to the VectorInfos of the candidates + std::vector::iterator> Res(Factor, Candidates.end()); + + for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) { + if (C->VTy != C0->VTy) + continue; + if (C->BB != C0->BB) + continue; + if (C->PV != C0->PV) + continue; + + // Check the current value matches any of factor - 1 remaining lines + for (i = 1; i < Factor; i++) { + if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) { + Res[i] = C; + } + } + + for (i = 1; i < Factor; i++) { + if (Res[i] == Candidates.end()) + break; + } + if (i == Factor) { + Res[0] = C0; + break; + } + } + + if (Res[0] != Candidates.end()) { + // Move the result into the output + for (unsigned i = 0; i < Factor; i++) { + InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]); + } + + return true; + } + } + return false; +} + +LoadInst * +InterleavedLoadCombineImpl::findFirstLoad(const std::set &LIs) { + assert(!LIs.empty() && "No load instructions given."); + + // All LIs are within the same BB. Select the first for a reference. + BasicBlock *BB = (*LIs.begin())->getParent(); + BasicBlock::iterator FLI = + std::find_if(BB->begin(), BB->end(), [&LIs](Instruction &I) -> bool { + return is_contained(LIs, &I); + }); + assert(FLI != BB->end()); + + return cast(FLI); +} + +bool InterleavedLoadCombineImpl::combine(std::list &InterleavedLoad, + OptimizationRemarkEmitter &ORE) { + LLVM_DEBUG(dbgs() << "Checking interleaved load\n"); + + // The insertion point is the LoadInst which loads the first values. The + // following tests are used to proof that the combined load can be inserted + // just before InsertionPoint. + LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI; + + // Test if the offset is computed + if (!InsertionPoint) + return false; + + std::set LIs; + std::set Is; + std::set SVIs; + + unsigned InterleavedCost; + unsigned InstructionCost = 0; + + // Get the interleave factor + unsigned Factor = InterleavedLoad.size(); + + // Merge all input sets used in analysis + for (auto &VI : InterleavedLoad) { + // Generate a set of all load instructions to be combined + LIs.insert(VI.LIs.begin(), VI.LIs.end()); + + // Generate a set of all instructions taking part in load + // interleaved. This list excludes the instructions necessary for the + // polynomial construction. + Is.insert(VI.Is.begin(), VI.Is.end()); + + // Generate the set of the final ShuffleVectorInst. + SVIs.insert(VI.SVI); + } + + // There is nothing to combine. + if (LIs.size() < 2) + return false; + + // Test if all participating instruction will be dead after the + // transformation. If intermediate results are used, no performance gain can + // be expected. Also sum the cost of the Instructions beeing left dead. + for (auto &I : Is) { + // Compute the old cost + InstructionCost += + TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency); + + // The final SVIs are allowed not to be dead, all uses will be replaced + if (SVIs.find(I) != SVIs.end()) + continue; + + // If there are users outside the set to be eliminated, we abort the + // transformation. No gain can be expected. + for (const auto &U : I->users()) { + if (Is.find(dyn_cast(U)) == Is.end()) + return false; + } + } + + // We know that all LoadInst are within the same BB. This guarantees that + // either everything or nothing is loaded. + LoadInst *First = findFirstLoad(LIs); + + // To be safe that the loads can be combined, iterate over all loads and test + // that the corresponding defining access dominates first LI. This guarantees + // that there are no aliasing stores in between the loads. + auto FMA = MSSA.getMemoryAccess(First); + for (auto LI : LIs) { + auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess(); + if (!MSSA.dominates(MADef, FMA)) + return false; + } + assert(!LIs.empty() && "There are no LoadInst to combine"); + + // It is necessary that insertion point dominates all final ShuffleVectorInst. + for (auto &VI : InterleavedLoad) { + if (!DT.dominates(InsertionPoint, VI.SVI)) + return false; + } + + // All checks are done. Add instructions detectable by InterleavedAccessPass + // The old instruction will are left dead. + IRBuilder<> Builder(InsertionPoint); + Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType(); + unsigned ElementsPerSVI = + InterleavedLoad.front().SVI->getType()->getNumElements(); + VectorType *ILTy = VectorType::get(ETy, Factor * ElementsPerSVI); + + SmallVector Indices; + for (unsigned i = 0; i < Factor; i++) + Indices.push_back(i); + InterleavedCost = TTI.getInterleavedMemoryOpCost( + Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(), + InsertionPoint->getPointerAddressSpace()); + + if (InterleavedCost >= InstructionCost) { + return false; + } + + // Create a pointer cast for the wide load. + auto CI = Builder.CreatePointerCast(InsertionPoint->getOperand(0), + ILTy->getPointerTo(), + "interleaved.wide.ptrcast"); + + // Create the wide load and update the MemorySSA. + auto LI = Builder.CreateAlignedLoad(CI, InsertionPoint->getAlignment(), + "interleaved.wide.load"); + auto MSSAU = MemorySSAUpdater(&MSSA); + MemoryUse *MSSALoad = cast(MSSAU.createMemoryAccessBefore( + LI, nullptr, MSSA.getMemoryAccess(InsertionPoint))); + MSSAU.insertUse(MSSALoad); + + // Create the final SVIs and replace all uses. + int i = 0; + for (auto &VI : InterleavedLoad) { + SmallVector Mask; + for (unsigned j = 0; j < ElementsPerSVI; j++) + Mask.push_back(i + j * Factor); + + Builder.SetInsertPoint(VI.SVI); + auto SVI = Builder.CreateShuffleVector(LI, UndefValue::get(LI->getType()), + Mask, "interleaved.shuffle"); + VI.SVI->replaceAllUsesWith(SVI); + i++; + } + + NumInterleavedLoadCombine++; + ORE.emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI) + << "Load interleaved combined with factor " + << ore::NV("Factor", Factor); + }); + + return true; +} + +bool InterleavedLoadCombineImpl::run() { + OptimizationRemarkEmitter ORE(&F); + bool changed = false; + unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor(); + + auto &DL = F.getParent()->getDataLayout(); + + // Start with the highest factor to avoid combining and recombining. + for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) { + std::list Candidates; + + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (auto SVI = dyn_cast(&I)) { + + Candidates.emplace_back(SVI->getType()); + + if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) { + Candidates.pop_back(); + continue; + } + + if (!Candidates.back().isInterleaved(Factor, DL)) { + Candidates.pop_back(); + } + } + } + } + + std::list InterleavedLoad; + while (findPattern(Candidates, InterleavedLoad, Factor, DL)) { + if (combine(InterleavedLoad, ORE)) { + changed = true; + } else { + // Remove the first element of the Interleaved Load but put the others + // back on the list and continue searching + Candidates.splice(Candidates.begin(), InterleavedLoad, + std::next(InterleavedLoad.begin()), + InterleavedLoad.end()); + } + InterleavedLoad.clear(); + } + } + + return changed; +} + +namespace { +/// This pass combines interleaved loads into a pattern detectable by +/// InterleavedAccessPass. +struct InterleavedLoadCombine : public FunctionPass { + static char ID; + + InterleavedLoadCombine() : FunctionPass(ID) { + initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Interleaved Load Combine Pass"; + } + + bool runOnFunction(Function &F) override { + if (DisableInterleavedLoadCombine) + return false; + + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + return false; + + LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() + << "\n"); + + return InterleavedLoadCombineImpl( + F, getAnalysis().getDomTree(), + getAnalysis().getMSSA(), + TPC->getTM()) + .run(); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + FunctionPass::getAnalysisUsage(AU); + } + +private: +}; +} // anonymous namespace + +char InterleavedLoadCombine::ID = 0; + +INITIALIZE_PASS_BEGIN( + InterleavedLoadCombine, DEBUG_TYPE, + "Combine interleaved loads into wide loads and shufflevector instructions", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_END( + InterleavedLoadCombine, DEBUG_TYPE, + "Combine interleaved loads into wide loads and shufflevector instructions", + false, false) + +FunctionPass * +llvm::createInterleavedLoadCombinePass() { + auto P = new InterleavedLoadCombine(); + return P; +} diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 01779b1eb9b015e59a73abd61d2f1b98006ff68e..d0d889782a358e9c6e4c37eddd313ffabb238559 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -132,10 +132,10 @@ private: unsigned WasIndirect : 1; }; -/// LocMap - Map of where a user value is live, and its location. +/// Map of where a user value is live, and its location. using LocMap = IntervalMap; -/// SpillOffsetMap - Map of stack slot offsets for spilled locations. +/// Map of stack slot offsets for spilled locations. /// Non-spilled locations are not added to the map. using SpillOffsetMap = DenseMap; @@ -143,7 +143,7 @@ namespace { class LDVImpl; -/// UserValue - A user value is a part of a debug info user variable. +/// A user value is a part of a debug info user variable. /// /// A DBG_VALUE instruction notes that (a sub-register of) a virtual register /// holds part of a user variable. The part is identified by a byte offset. @@ -170,26 +170,26 @@ class UserValue { /// lexical scope. SmallSet trimmedDefs; - /// insertDebugValue - Insert a DBG_VALUE into MBB at Idx for LocNo. + /// Insert a DBG_VALUE into MBB at Idx for LocNo. void insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx, SlotIndex StopIdx, DbgValueLocation Loc, bool Spilled, unsigned SpillOffset, LiveIntervals &LIS, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI); - /// splitLocation - Replace OldLocNo ranges with NewRegs ranges where NewRegs + /// Replace OldLocNo ranges with NewRegs ranges where NewRegs /// is live. Returns true if any changes were made. bool splitLocation(unsigned OldLocNo, ArrayRef NewRegs, LiveIntervals &LIS); public: - /// UserValue - Create a new UserValue. + /// Create a new UserValue. UserValue(const DILocalVariable *var, const DIExpression *expr, DebugLoc L, LocMap::Allocator &alloc) : Variable(var), Expression(expr), dl(std::move(L)), leader(this), locInts(alloc) {} - /// getLeader - Get the leader of this value's equivalence class. + /// Get the leader of this value's equivalence class. UserValue *getLeader() { UserValue *l = leader; while (l != l->leader) @@ -197,10 +197,10 @@ public: return leader = l; } - /// getNext - Return the next UserValue in the equivalence class. + /// Return the next UserValue in the equivalence class. UserValue *getNext() const { return next; } - /// match - Does this UserValue match the parameters? + /// Does this UserValue match the parameters? bool match(const DILocalVariable *Var, const DIExpression *Expr, const DILocation *IA) const { // FIXME: The fragment should be part of the equivalence class, but not @@ -208,7 +208,7 @@ public: return Var == Variable && Expr == Expression && dl->getInlinedAt() == IA; } - /// merge - Merge equivalence classes. + /// Merge equivalence classes. static UserValue *merge(UserValue *L1, UserValue *L2) { L2 = L2->getLeader(); if (!L1) @@ -260,10 +260,10 @@ public: return locations.size() - 1; } - /// mapVirtRegs - Ensure that all virtual register locations are mapped. + /// Ensure that all virtual register locations are mapped. void mapVirtRegs(LDVImpl *LDV); - /// addDef - Add a definition point to this value. + /// Add a definition point to this value. void addDef(SlotIndex Idx, const MachineOperand &LocMO, bool IsIndirect) { DbgValueLocation Loc(getLocationNo(LocMO), IsIndirect); // Add a singular (Idx,Idx) -> Loc mapping. @@ -275,66 +275,71 @@ public: I.setValue(Loc); } - /// extendDef - Extend the current definition as far as possible down. + /// Extend the current definition as far as possible down. + /// /// Stop when meeting an existing def or when leaving the live - /// range of VNI. - /// End points where VNI is no longer live are added to Kills. - /// @param Idx Starting point for the definition. - /// @param Loc Location number to propagate. - /// @param LR Restrict liveness to where LR has the value VNI. May be null. - /// @param VNI When LR is not null, this is the value to restrict to. - /// @param Kills Append end points of VNI's live range to Kills. - /// @param LIS Live intervals analysis. + /// range of VNI. End points where VNI is no longer live are added to Kills. + /// + /// We only propagate DBG_VALUES locally here. LiveDebugValues performs a + /// data-flow analysis to propagate them beyond basic block boundaries. + /// + /// \param Idx Starting point for the definition. + /// \param Loc Location number to propagate. + /// \param LR Restrict liveness to where LR has the value VNI. May be null. + /// \param VNI When LR is not null, this is the value to restrict to. + /// \param [out] Kills Append end points of VNI's live range to Kills. + /// \param LIS Live intervals analysis. void extendDef(SlotIndex Idx, DbgValueLocation Loc, LiveRange *LR, const VNInfo *VNI, SmallVectorImpl *Kills, LiveIntervals &LIS); - /// addDefsFromCopies - The value in LI/LocNo may be copies to other - /// registers. Determine if any of the copies are available at the kill - /// points, and add defs if possible. - /// @param LI Scan for copies of the value in LI->reg. - /// @param LocNo Location number of LI->reg. - /// @param WasIndirect Indicates if the original use of LI->reg was indirect - /// @param Kills Points where the range of LocNo could be extended. - /// @param NewDefs Append (Idx, LocNo) of inserted defs here. + /// The value in LI/LocNo may be copies to other registers. Determine if + /// any of the copies are available at the kill points, and add defs if + /// possible. + /// + /// \param LI Scan for copies of the value in LI->reg. + /// \param LocNo Location number of LI->reg. + /// \param WasIndirect Indicates if the original use of LI->reg was indirect + /// \param Kills Points where the range of LocNo could be extended. + /// \param [in,out] NewDefs Append (Idx, LocNo) of inserted defs here. void addDefsFromCopies( LiveInterval *LI, unsigned LocNo, bool WasIndirect, const SmallVectorImpl &Kills, SmallVectorImpl> &NewDefs, MachineRegisterInfo &MRI, LiveIntervals &LIS); - /// computeIntervals - Compute the live intervals of all locations after - /// collecting all their def points. + /// Compute the live intervals of all locations after collecting all their + /// def points. void computeIntervals(MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, LiveIntervals &LIS, LexicalScopes &LS); - /// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is + /// Replace OldReg ranges with NewRegs ranges where NewRegs is /// live. Returns true if any changes were made. bool splitRegister(unsigned OldReg, ArrayRef NewRegs, LiveIntervals &LIS); - /// rewriteLocations - Rewrite virtual register locations according to the - /// provided virtual register map. Record the stack slot offsets for the - /// locations that were spilled. + /// Rewrite virtual register locations according to the provided virtual + /// register map. Record the stack slot offsets for the locations that + /// were spilled. void rewriteLocations(VirtRegMap &VRM, const MachineFunction &MF, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, SpillOffsetMap &SpillOffsets); - /// emitDebugValues - Recreate DBG_VALUE instruction from data structures. + /// Recreate DBG_VALUE instruction from data structures. void emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const SpillOffsetMap &SpillOffsets); - /// getDebugLoc - Return DebugLoc of this UserValue. + /// Return DebugLoc of this UserValue. DebugLoc getDebugLoc() { return dl;} void print(raw_ostream &, const TargetRegisterInfo *); }; -/// LDVImpl - Implementation of the LiveDebugVariables pass. +/// Implementation of the LiveDebugVariables pass. class LDVImpl { LiveDebugVariables &pass; LocMap::Allocator allocator; @@ -348,7 +353,7 @@ class LDVImpl { /// Whether the machine function is modified during the pass. bool ModifiedMF = false; - /// userValues - All allocated UserValue instances. + /// All allocated UserValue instances. SmallVector, 8> userValues; /// Map virtual register to eq class leader. @@ -359,27 +364,31 @@ class LDVImpl { using UVMap = DenseMap; UVMap userVarMap; - /// getUserValue - Find or create a UserValue. + /// Find or create a UserValue. UserValue *getUserValue(const DILocalVariable *Var, const DIExpression *Expr, const DebugLoc &DL); - /// lookupVirtReg - Find the EC leader for VirtReg or null. + /// Find the EC leader for VirtReg or null. UserValue *lookupVirtReg(unsigned VirtReg); - /// handleDebugValue - Add DBG_VALUE instruction to our maps. - /// @param MI DBG_VALUE instruction - /// @param Idx Last valid SLotIndex before instruction. - /// @return True if the DBG_VALUE instruction should be deleted. + /// Add DBG_VALUE instruction to our maps. + /// + /// \param MI DBG_VALUE instruction + /// \param Idx Last valid SLotIndex before instruction. + /// + /// \returns True if the DBG_VALUE instruction should be deleted. bool handleDebugValue(MachineInstr &MI, SlotIndex Idx); - /// collectDebugValues - Collect and erase all DBG_VALUE instructions, adding - /// a UserValue def for each instruction. - /// @param mf MachineFunction to be scanned. - /// @return True if any debug values were found. + /// Collect and erase all DBG_VALUE instructions, adding a UserValue def + /// for each instruction. + /// + /// \param mf MachineFunction to be scanned. + /// + /// \returns True if any debug values were found. bool collectDebugValues(MachineFunction &mf); - /// computeIntervals - Compute the live intervals of all user values after - /// collecting all their def points. + /// Compute the live intervals of all user values after collecting all + /// their def points. void computeIntervals(); public: @@ -387,7 +396,7 @@ public: bool runOnMachineFunction(MachineFunction &mf); - /// clear - Release all memory. + /// Release all memory. void clear() { MF = nullptr; userValues.clear(); @@ -400,13 +409,13 @@ public: ModifiedMF = false; } - /// mapVirtReg - Map virtual register to an equivalence class. + /// Map virtual register to an equivalence class. void mapVirtReg(unsigned VirtReg, UserValue *EC); - /// splitRegister - Replace all references to OldReg with NewRegs. + /// Replace all references to OldReg with NewRegs. void splitRegister(unsigned OldReg, ArrayRef NewRegs); - /// emitDebugValues - Recreate DBG_VALUE instruction from data structures. + /// Recreate DBG_VALUE instruction from data structures. void emitDebugValues(VirtRegMap *VRM); void print(raw_ostream&); @@ -612,8 +621,6 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) { return Changed; } -/// We only propagate DBG_VALUES locally here. LiveDebugValues performs a -/// data-flow analysis to propagate them beyond basic block boundaries. void UserValue::extendDef(SlotIndex Idx, DbgValueLocation Loc, LiveRange *LR, const VNInfo *VNI, SmallVectorImpl *Kills, LiveIntervals &LIS) { diff --git a/lib/CodeGen/MIRCanonicalizerPass.cpp b/lib/CodeGen/MIRCanonicalizerPass.cpp index bbd1dcdb744bce844e6e1755c91a73ff2a49504b..f17c23619ed58b7c7a5184c4e0bcbf2e8abe8e29 100644 --- a/lib/CodeGen/MIRCanonicalizerPass.cpp +++ b/lib/CodeGen/MIRCanonicalizerPass.cpp @@ -677,8 +677,7 @@ static bool runOnBasicBlock(MachineBasicBlock *MBB, std::vector Candidates = populateCandidates(MBB); std::vector VisitedMIs; - std::copy(Candidates.begin(), Candidates.end(), - std::back_inserter(VisitedMIs)); + llvm::copy(Candidates, std::back_inserter(VisitedMIs)); std::vector VRegs; for (auto candidate : Candidates) { diff --git a/lib/CodeGen/MIRParser/MILexer.cpp b/lib/CodeGen/MIRParser/MILexer.cpp index f7cc94e34a59e7e151b136a856ea9c931bd98300..265877c2f5b42a073e36498714bb0a0f8c31cdab 100644 --- a/lib/CodeGen/MIRParser/MILexer.cpp +++ b/lib/CodeGen/MIRParser/MILexer.cpp @@ -220,6 +220,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("undefined", MIToken::kw_cfi_undefined) .Case("register", MIToken::kw_cfi_register) .Case("window_save", MIToken::kw_cfi_window_save) + .Case("negate_ra_sign_state", MIToken::kw_cfi_aarch64_negate_ra_sign_state) .Case("blockaddress", MIToken::kw_blockaddress) .Case("intrinsic", MIToken::kw_intrinsic) .Case("target-index", MIToken::kw_target_index) @@ -576,6 +577,7 @@ static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) { .Case("!noalias", MIToken::md_noalias) .Case("!range", MIToken::md_range) .Case("!DIExpression", MIToken::md_diexpr) + .Case("!DILocation", MIToken::md_dilocation) .Default(MIToken::Error); } diff --git a/lib/CodeGen/MIRParser/MILexer.h b/lib/CodeGen/MIRParser/MILexer.h index dffa4f745444e03f4b4ca8f86fae7a0222dd5979..ceff79087d814c1b7691054eefff22a30fdccddb 100644 --- a/lib/CodeGen/MIRParser/MILexer.h +++ b/lib/CodeGen/MIRParser/MILexer.h @@ -89,6 +89,7 @@ struct MIToken { kw_cfi_restore_state, kw_cfi_undefined, kw_cfi_window_save, + kw_cfi_aarch64_negate_ra_sign_state, kw_blockaddress, kw_intrinsic, kw_target_index, @@ -126,6 +127,7 @@ struct MIToken { md_noalias, md_range, md_diexpr, + md_dilocation, // Identifier tokens Identifier, diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index 1a6174bf9ee23a316ed4bd3670df27b6d806fa5c..6f2d8bb53ac82b848809447b49c8ef7d270ff6e6 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -226,6 +226,7 @@ public: bool parseMCSymbolOperand(MachineOperand &Dest); bool parseMDNode(MDNode *&Node); bool parseDIExpression(MDNode *&Expr); + bool parseDILocation(MDNode *&Expr); bool parseMetadataOperand(MachineOperand &Dest); bool parseCFIOffset(int &Offset); bool parseCFIRegister(unsigned &Reg); @@ -776,11 +777,15 @@ bool MIParser::parse(MachineInstr *&MI) { DebugLoc DebugLocation; if (Token.is(MIToken::kw_debug_location)) { lex(); - if (Token.isNot(MIToken::exclaim)) - return error("expected a metadata node after 'debug-location'"); MDNode *Node = nullptr; - if (parseMDNode(Node)) - return true; + if (Token.is(MIToken::exclaim)) { + if (parseMDNode(Node)) + return true; + } else if (Token.is(MIToken::md_dilocation)) { + if (parseDILocation(Node)) + return true; + } else + return error("expected a metadata node after 'debug-location'"); if (!isa(Node)) return error("referenced metadata is not a DILocation"); DebugLocation = DebugLoc(Node); @@ -898,6 +903,9 @@ bool MIParser::parseStandaloneMDNode(MDNode *&Node) { } else if (Token.is(MIToken::md_diexpr)) { if (parseDIExpression(Node)) return true; + } else if (Token.is(MIToken::md_dilocation)) { + if (parseDILocation(Node)) + return true; } else return error("expected a metadata node"); if (Token.isNot(MIToken::Eof)) @@ -1684,6 +1692,109 @@ bool MIParser::parseDIExpression(MDNode *&Expr) { return false; } +bool MIParser::parseDILocation(MDNode *&Loc) { + assert(Token.is(MIToken::md_dilocation)); + lex(); + + bool HaveLine = false; + unsigned Line = 0; + unsigned Column = 0; + MDNode *Scope = nullptr; + MDNode *InlinedAt = nullptr; + bool ImplicitCode = false; + + if (expectAndConsume(MIToken::lparen)) + return true; + + if (Token.isNot(MIToken::rparen)) { + do { + if (Token.is(MIToken::Identifier)) { + if (Token.stringValue() == "line") { + lex(); + if (expectAndConsume(MIToken::colon)) + return true; + if (Token.isNot(MIToken::IntegerLiteral) || + Token.integerValue().isSigned()) + return error("expected unsigned integer"); + Line = Token.integerValue().getZExtValue(); + HaveLine = true; + lex(); + continue; + } + if (Token.stringValue() == "column") { + lex(); + if (expectAndConsume(MIToken::colon)) + return true; + if (Token.isNot(MIToken::IntegerLiteral) || + Token.integerValue().isSigned()) + return error("expected unsigned integer"); + Column = Token.integerValue().getZExtValue(); + lex(); + continue; + } + if (Token.stringValue() == "scope") { + lex(); + if (expectAndConsume(MIToken::colon)) + return true; + if (parseMDNode(Scope)) + return error("expected metadata node"); + if (!isa(Scope)) + return error("expected DIScope node"); + continue; + } + if (Token.stringValue() == "inlinedAt") { + lex(); + if (expectAndConsume(MIToken::colon)) + return true; + if (Token.is(MIToken::exclaim)) { + if (parseMDNode(InlinedAt)) + return true; + } else if (Token.is(MIToken::md_dilocation)) { + if (parseDILocation(InlinedAt)) + return true; + } else + return error("expected metadata node"); + if (!isa(InlinedAt)) + return error("expected DILocation node"); + continue; + } + if (Token.stringValue() == "isImplicitCode") { + lex(); + if (expectAndConsume(MIToken::colon)) + return true; + if (!Token.is(MIToken::Identifier)) + return error("expected true/false"); + // As far as I can see, we don't have any existing need for parsing + // true/false in MIR yet. Do it ad-hoc until there's something else + // that needs it. + if (Token.stringValue() == "true") + ImplicitCode = true; + else if (Token.stringValue() == "false") + ImplicitCode = false; + else + return error("expected true/false"); + lex(); + continue; + } + } + return error(Twine("invalid DILocation argument '") + + Token.stringValue() + "'"); + } while (consumeIfPresent(MIToken::comma)); + } + + if (expectAndConsume(MIToken::rparen)) + return true; + + if (!HaveLine) + return error("DILocation requires line number"); + if (!Scope) + return error("DILocation requires a scope"); + + Loc = DILocation::get(MF.getFunction().getContext(), Line, Column, Scope, + InlinedAt, ImplicitCode); + return false; +} + bool MIParser::parseMetadataOperand(MachineOperand &Dest) { MDNode *Node = nullptr; if (Token.is(MIToken::exclaim)) { @@ -1820,6 +1931,9 @@ bool MIParser::parseCFIOperand(MachineOperand &Dest) { case MIToken::kw_cfi_window_save: CFIIndex = MF.addFrameInst(MCCFIInstruction::createWindowSave(nullptr)); break; + case MIToken::kw_cfi_aarch64_negate_ra_sign_state: + CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + break; case MIToken::kw_cfi_escape: { std::string Values; if (parseCFIEscapeValues(Values)) @@ -2112,6 +2226,7 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, case MIToken::kw_cfi_restore_state: case MIToken::kw_cfi_undefined: case MIToken::kw_cfi_window_save: + case MIToken::kw_cfi_aarch64_negate_ra_sign_state: return parseCFIOperand(Dest); case MIToken::kw_blockaddress: return parseBlockAddressOperand(Dest); diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 8012946371531d5446b0e3b95259b2042a0385c9..d9dcc428943f19cb1a93cce21fd3979cf59543fe 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -401,18 +401,20 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF, for (const auto &CSInfo : MFI.getCalleeSavedInfo()) { yaml::StringValue Reg; printRegMIR(CSInfo.getReg(), Reg, TRI); - auto StackObjectInfo = StackObjectOperandMapping.find(CSInfo.getFrameIdx()); - assert(StackObjectInfo != StackObjectOperandMapping.end() && - "Invalid stack object index"); - const FrameIndexOperand &StackObject = StackObjectInfo->second; - if (StackObject.IsFixed) { - YMF.FixedStackObjects[StackObject.ID].CalleeSavedRegister = Reg; - YMF.FixedStackObjects[StackObject.ID].CalleeSavedRestored = - CSInfo.isRestored(); - } else { - YMF.StackObjects[StackObject.ID].CalleeSavedRegister = Reg; - YMF.StackObjects[StackObject.ID].CalleeSavedRestored = - CSInfo.isRestored(); + if (!CSInfo.isSpilledToReg()) { + auto StackObjectInfo = StackObjectOperandMapping.find(CSInfo.getFrameIdx()); + assert(StackObjectInfo != StackObjectOperandMapping.end() && + "Invalid stack object index"); + const FrameIndexOperand &StackObject = StackObjectInfo->second; + if (StackObject.IsFixed) { + YMF.FixedStackObjects[StackObject.ID].CalleeSavedRegister = Reg; + YMF.FixedStackObjects[StackObject.ID].CalleeSavedRestored = + CSInfo.isRestored(); + } else { + YMF.StackObjects[StackObject.ID].CalleeSavedRegister = Reg; + YMF.StackObjects[StackObject.ID].CalleeSavedRestored = + CSInfo.isRestored(); + } } } for (unsigned I = 0, E = MFI.getLocalFrameObjectCount(); I < E; ++I) { diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 0b9ee462d1ae7af7283d07e281b60c262a8e39be..03771bc5dae10471e0154bd0a7b461b8e865e83b 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -1380,24 +1380,21 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI, // Try searching forwards from Before, looking for reads or defs. const_iterator I(Before); - // If this is the last insn in the block, don't search forwards. - if (I != end()) { - for (++I; I != end() && N > 0; ++I) { - if (I->isDebugInstr()) - continue; + for (; I != end() && N > 0; ++I) { + if (I->isDebugInstr()) + continue; - --N; + --N; - MachineOperandIteratorBase::PhysRegInfo Info = - ConstMIOperands(*I).analyzePhysReg(Reg, TRI); + MachineOperandIteratorBase::PhysRegInfo Info = + ConstMIOperands(*I).analyzePhysReg(Reg, TRI); - // Register is live when we read it here. - if (Info.Read) - return LQR_Live; - // Register is dead if we can fully overwrite or clobber it here. - if (Info.FullyDefined || Info.Clobbered) - return LQR_Dead; - } + // Register is live when we read it here. + if (Info.Read) + return LQR_Live; + // Register is dead if we can fully overwrite or clobber it here. + if (Info.FullyDefined || Info.Clobbered) + return LQR_Dead; } // If we reached the end, it is safe to clobber Reg at the end of a block of diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 488481cec37c4cd16b3b59b4daff2ad4b0c1b164..3ded00b70fd43f3ac0405ea47c5c18d01f9d345b 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -435,7 +435,7 @@ MachineFunction::createMIExtraInfo(ArrayRef MMOs, const char *MachineFunction::createExternalSymbolName(StringRef Name) { char *Dest = Allocator.Allocate(Name.size() + 1); - std::copy(Name.begin(), Name.end(), Dest); + llvm::copy(Name, Dest); Dest[Name.size()] = 0; return Dest; } diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index f30290109b795b326f3c21ccb8843896974f9200..764a84c7e1327ed25c0b77d1b5f9da8e1339e9d4 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -933,9 +933,7 @@ int MachineInstr::findRegisterUseOperandIdx( unsigned MOReg = MO.getReg(); if (!MOReg) continue; - if (MOReg == Reg || (TRI && TargetRegisterInfo::isPhysicalRegister(MOReg) && - TargetRegisterInfo::isPhysicalRegister(Reg) && - TRI->isSubRegister(MOReg, Reg))) + if (MOReg == Reg || (TRI && Reg && MOReg && TRI->regsOverlap(MOReg, Reg))) if (!isKill || MO.isKill()) return i; } diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 2c9b5963537891fee054a40fed626ed0de2aa6aa..58fd1f23842060aebc302c18bc22af7c923b3176 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -463,8 +463,12 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) { if (PhysRegDefs.test(*AS)) PhysRegClobbers.set(*AS); - PhysRegDefs.set(*AS); } + // Need a second loop because MCRegAliasIterator can visit the same + // register twice. + for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) + PhysRegDefs.set(*AS); + if (PhysRegClobbers.test(Reg)) // MI defined register is seen defined by another instruction in // the loop, it cannot be a LICM candidate. diff --git a/lib/CodeGen/MachineOperand.cpp b/lib/CodeGen/MachineOperand.cpp index 4fe51f6624816bd24443d4884de662f8d891b2ee..05e51e1873cf90a65f1ef6b5da9706340f51e4bc 100644 --- a/lib/CodeGen/MachineOperand.cpp +++ b/lib/CodeGen/MachineOperand.cpp @@ -697,6 +697,11 @@ static void printCFI(raw_ostream &OS, const MCCFIInstruction &CFI, if (MCSymbol *Label = CFI.getLabel()) MachineOperand::printSymbol(OS, *Label); break; + case MCCFIInstruction::OpNegateRAState: + OS << "negate_ra_sign_state "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + break; default: // TODO: Print the other CFI Operations. OS << ""; diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp index c69bfecd8dcd414db1115147eb630f52eef640a0..ad96c0e579e43665c2b301009b8c0b6aad305665 100644 --- a/lib/CodeGen/MachineOutliner.cpp +++ b/lib/CodeGen/MachineOutliner.cpp @@ -519,18 +519,19 @@ public: RS = RepeatedSubstring(); N = nullptr; + // Each leaf node represents a repeat of a string. + std::vector LeafChildren; + // Continue visiting nodes until we find one which repeats more than once. while (!ToVisit.empty()) { SuffixTreeNode *Curr = ToVisit.back(); ToVisit.pop_back(); + LeafChildren.clear(); // Keep track of the length of the string associated with the node. If // it's too short, we'll quit. unsigned Length = Curr->ConcatLen; - // Each leaf node represents a repeat of a string. - std::vector LeafChildren; - // Iterate over each child, saving internal nodes for visiting, and // leaf nodes in LeafChildren. Internal nodes represent individual // strings, which may repeat. @@ -621,9 +622,8 @@ struct InstructionMapper { DenseMap InstructionIntegerMap; - /// Corresponcence from unsigned integers to \p MachineInstrs. - /// Inverse of \p InstructionIntegerMap. - DenseMap IntegerInstructionMap; + /// Correspondence between \p MachineBasicBlocks and target-defined flags. + DenseMap MBBFlagsMap; /// The vector of unsigned integers that the module is mapped to. std::vector UnsignedVec; @@ -641,8 +641,7 @@ struct InstructionMapper { /// Maps \p *It to a legal integer. /// /// Updates \p CanOutlineWithPrevInstr, \p HaveLegalRange, \p InstrListForMBB, - /// \p UnsignedVecForMBB, \p InstructionIntegerMap, \p IntegerInstructionMap, - /// and \p LegalInstrNumber. + /// \p UnsignedVecForMBB, \p InstructionIntegerMap, and \p LegalInstrNumber. /// /// \returns The integer that \p *It was mapped to. unsigned mapToLegalUnsigned( @@ -675,10 +674,8 @@ struct InstructionMapper { unsigned MINumber = ResultIt->second; // There was an insertion. - if (WasInserted) { + if (WasInserted) LegalInstrNumber++; - IntegerInstructionMap.insert(std::make_pair(MINumber, &MI)); - } UnsignedVecForMBB.push_back(MINumber); @@ -742,7 +739,15 @@ struct InstructionMapper { /// \param TII \p TargetInstrInfo for the function. void convertToUnsignedVec(MachineBasicBlock &MBB, const TargetInstrInfo &TII) { - unsigned Flags = TII.getMachineOutlinerMBBFlags(MBB); + unsigned Flags = 0; + + // Don't even map in this case. + if (!TII.isMBBSafeToOutlineFrom(MBB, Flags)) + return; + + // Store info for the MBB for later outlining. + MBBFlagsMap[&MBB] = Flags; + MachineBasicBlock::iterator It = MBB.begin(); // The number of instructions in this block that will be considered for @@ -863,7 +868,8 @@ struct MachineOutliner : public ModulePass { /// Remark output explaining that a function was outlined. void emitOutlinedFunctionRemark(OutlinedFunction &OF); - /// Find all repeated substrings that satisfy the outlining cost model. + /// Find all repeated substrings that satisfy the outlining cost model by + /// constructing a suffix tree. /// /// If a substring appears at least twice, then it must be represented by /// an internal node which appears in at least two suffixes. Each suffix @@ -872,75 +878,26 @@ struct MachineOutliner : public ModulePass { /// internal node represents a beneficial substring, then we use each of /// its leaf children to find the locations of its substring. /// - /// \param ST A suffix tree to query. /// \param Mapper Contains outlining mapping information. - /// \param[out] CandidateList Filled with candidates representing each - /// beneficial substring. /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions /// each type of candidate. - /// - /// \returns The length of the longest candidate found. - unsigned - findCandidates(SuffixTree &ST, - InstructionMapper &Mapper, - std::vector> &CandidateList, - std::vector &FunctionList); - - /// Replace the sequences of instructions represented by the - /// \p Candidates in \p CandidateList with calls to \p MachineFunctions - /// described in \p FunctionList. + void findCandidates(InstructionMapper &Mapper, + std::vector &FunctionList); + + /// Replace the sequences of instructions represented by \p OutlinedFunctions + /// with calls to functions. /// /// \param M The module we are outlining from. - /// \param CandidateList A list of candidates to be outlined. /// \param FunctionList A list of functions to be inserted into the module. /// \param Mapper Contains the instruction mappings for the module. - bool outline(Module &M, - const ArrayRef> &CandidateList, - std::vector &FunctionList, + bool outline(Module &M, std::vector &FunctionList, InstructionMapper &Mapper); /// Creates a function for \p OF and inserts it into the module. - MachineFunction *createOutlinedFunction(Module &M, const OutlinedFunction &OF, + MachineFunction *createOutlinedFunction(Module &M, OutlinedFunction &OF, InstructionMapper &Mapper, unsigned Name); - /// Find potential outlining candidates and store them in \p CandidateList. - /// - /// For each type of potential candidate, also build an \p OutlinedFunction - /// struct containing the information to build the function for that - /// candidate. - /// - /// \param[out] CandidateList Filled with outlining candidates for the module. - /// \param[out] FunctionList Filled with functions corresponding to each type - /// of \p Candidate. - /// \param ST The suffix tree for the module. - /// - /// \returns The length of the longest candidate found. 0 if there are none. - unsigned - buildCandidateList(std::vector> &CandidateList, - std::vector &FunctionList, - SuffixTree &ST, InstructionMapper &Mapper); - - /// Helper function for pruneOverlaps. - /// Removes \p C from the candidate list, and updates its \p OutlinedFunction. - void prune(Candidate &C, std::vector &FunctionList); - - /// Remove any overlapping candidates that weren't handled by the - /// suffix tree's pruning method. - /// - /// Pruning from the suffix tree doesn't necessarily remove all overlaps. - /// If a short candidate is chosen for outlining, then a longer candidate - /// which has that short candidate as a suffix is chosen, the tree's pruning - /// method will not find it. Thus, we need to prune before outlining as well. - /// - /// \param[in,out] CandidateList A list of outlining candidates. - /// \param[in,out] FunctionList A list of functions to be outlined. - /// \param Mapper Contains instruction mapping info for outlining. - /// \param MaxCandidateLen The length of the longest candidate. - void pruneOverlaps(std::vector> &CandidateList, - std::vector &FunctionList, - InstructionMapper &Mapper, unsigned MaxCandidateLen); - /// Construct a suffix tree on the instructions in \p M and outline repeated /// strings from that tree. bool runOnModule(Module &M) override; @@ -949,8 +906,8 @@ struct MachineOutliner : public ModulePass { /// function for remark emission. DISubprogram *getSubprogramOrNull(const OutlinedFunction &OF) { DISubprogram *SP; - for (const std::shared_ptr &C : OF.Candidates) - if (C && C->getMF() && (SP = C->getMF()->getFunction().getSubprogram())) + for (const Candidate &C : OF.Candidates) + if (C.getMF() && (SP = C.getMF()->getFunction().getSubprogram())) return SP; return nullptr; } @@ -1030,7 +987,7 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) { MachineOptimizationRemark R(DEBUG_TYPE, "OutlinedFunction", MBB->findDebugLoc(MBB->begin()), MBB); R << "Saved " << NV("OutliningBenefit", OF.getBenefit()) << " bytes by " - << "outlining " << NV("Length", OF.Sequence.size()) << " instructions " + << "outlining " << NV("Length", OF.getNumInstrs()) << " instructions " << "from " << NV("NumOccurrences", OF.getOccurrenceCount()) << " locations. " << "(Found at: "; @@ -1038,12 +995,8 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) { // Tell the user the other places the candidate was found. for (size_t i = 0, e = OF.Candidates.size(); i < e; i++) { - // Skip over things that were pruned. - if (!OF.Candidates[i]->InCandidateList) - continue; - R << NV((Twine("StartLoc") + Twine(i)).str(), - OF.Candidates[i]->front()->getDebugLoc()); + OF.Candidates[i].front()->getDebugLoc()); if (i != e - 1) R << ", "; } @@ -1053,19 +1006,18 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) { MORE.emit(R); } -unsigned MachineOutliner::findCandidates( - SuffixTree &ST, InstructionMapper &Mapper, - std::vector> &CandidateList, - std::vector &FunctionList) { - CandidateList.clear(); +void +MachineOutliner::findCandidates(InstructionMapper &Mapper, + std::vector &FunctionList) { FunctionList.clear(); - unsigned MaxLen = 0; + SuffixTree ST(Mapper.UnsignedVec); // First, find dall of the repeated substrings in the tree of minimum length // 2. + std::vector CandidatesForRepeatedSeq; for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) { + CandidatesForRepeatedSeq.clear(); SuffixTree::RepeatedSubstring RS = *It; - std::vector CandidatesForRepeatedSeq; unsigned StringLen = RS.Length; for (const unsigned &StartIdx : RS.StartIndices) { unsigned EndIdx = StartIdx + StringLen - 1; @@ -1101,17 +1053,18 @@ unsigned MachineOutliner::findCandidates( MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx]; MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx]; + MachineBasicBlock *MBB = StartIt->getParent(); CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt, - EndIt, StartIt->getParent(), - FunctionList.size()); + EndIt, MBB, FunctionList.size(), + Mapper.MBBFlagsMap[MBB]); } } // We've found something we might want to outline. // Create an OutlinedFunction to store it and check if it'd be beneficial // to outline. - if (CandidatesForRepeatedSeq.empty()) + if (CandidatesForRepeatedSeq.size() < 2) continue; // Arbitrarily choose a TII from the first candidate. @@ -1122,169 +1075,23 @@ unsigned MachineOutliner::findCandidates( OutlinedFunction OF = TII->getOutliningCandidateInfo(CandidatesForRepeatedSeq); - // If we deleted every candidate, then there's nothing to outline. - if (OF.Candidates.empty()) + // If we deleted too many candidates, then there's nothing worth outlining. + // FIXME: This should take target-specified instruction sizes into account. + if (OF.Candidates.size() < 2) continue; - std::vector Seq; - unsigned StartIdx = RS.StartIndices[0]; // Grab any start index. - for (unsigned i = StartIdx; i < StartIdx + StringLen; i++) - Seq.push_back(ST.Str[i]); - OF.Sequence = Seq; - // Is it better to outline this candidate than not? if (OF.getBenefit() < 1) { emitNotOutliningCheaperRemark(StringLen, CandidatesForRepeatedSeq, OF); continue; } - if (StringLen > MaxLen) - MaxLen = StringLen; - - // The function is beneficial. Save its candidates to the candidate list - // for pruning. - for (std::shared_ptr &C : OF.Candidates) - CandidateList.push_back(C); FunctionList.push_back(OF); } - - return MaxLen; -} - -// Remove C from the candidate space, and update its OutlinedFunction. -void MachineOutliner::prune(Candidate &C, - std::vector &FunctionList) { - // Get the OutlinedFunction associated with this Candidate. - OutlinedFunction &F = FunctionList[C.FunctionIdx]; - - // Update C's associated function's occurrence count. - F.decrement(); - - // Remove C from the CandidateList. - C.InCandidateList = false; - - LLVM_DEBUG(dbgs() << "- Removed a Candidate \n"; - dbgs() << "--- Num fns left for candidate: " - << F.getOccurrenceCount() << "\n"; - dbgs() << "--- Candidate's functions's benefit: " << F.getBenefit() - << "\n";); -} - -void MachineOutliner::pruneOverlaps( - std::vector> &CandidateList, - std::vector &FunctionList, InstructionMapper &Mapper, - unsigned MaxCandidateLen) { - - // Return true if this candidate became unbeneficial for outlining in a - // previous step. - auto ShouldSkipCandidate = [&FunctionList, this](Candidate &C) { - - // Check if the candidate was removed in a previous step. - if (!C.InCandidateList) - return true; - - // C must be alive. Check if we should remove it. - if (FunctionList[C.FunctionIdx].getBenefit() < 1) { - prune(C, FunctionList); - return true; - } - - // C is in the list, and F is still beneficial. - return false; - }; - - // TODO: Experiment with interval trees or other interval-checking structures - // to lower the time complexity of this function. - // TODO: Can we do better than the simple greedy choice? - // Check for overlaps in the range. - // This is O(MaxCandidateLen * CandidateList.size()). - for (auto It = CandidateList.begin(), Et = CandidateList.end(); It != Et; - It++) { - Candidate &C1 = **It; - - // If C1 was already pruned, or its function is no longer beneficial for - // outlining, move to the next candidate. - if (ShouldSkipCandidate(C1)) - continue; - - // The minimum start index of any candidate that could overlap with this - // one. - unsigned FarthestPossibleIdx = 0; - - // Either the index is 0, or it's at most MaxCandidateLen indices away. - if (C1.getStartIdx() > MaxCandidateLen) - FarthestPossibleIdx = C1.getStartIdx() - MaxCandidateLen; - - // Compare against the candidates in the list that start at most - // FarthestPossibleIdx indices away from C1. There are at most - // MaxCandidateLen of these. - for (auto Sit = It + 1; Sit != Et; Sit++) { - Candidate &C2 = **Sit; - - // Is this candidate too far away to overlap? - if (C2.getStartIdx() < FarthestPossibleIdx) - break; - - // If C2 was already pruned, or its function is no longer beneficial for - // outlining, move to the next candidate. - if (ShouldSkipCandidate(C2)) - continue; - - // Do C1 and C2 overlap? - // - // Not overlapping: - // High indices... [C1End ... C1Start][C2End ... C2Start] ...Low indices - // - // We sorted our candidate list so C2Start <= C1Start. We know that - // C2End > C2Start since each candidate has length >= 2. Therefore, all we - // have to check is C2End < C2Start to see if we overlap. - if (C2.getEndIdx() < C1.getStartIdx()) - continue; - - // C1 and C2 overlap. - // We need to choose the better of the two. - // - // Approximate this by picking the one which would have saved us the - // most instructions before any pruning. - - // Is C2 a better candidate? - if (C2.Benefit > C1.Benefit) { - // Yes, so prune C1. Since C1 is dead, we don't have to compare it - // against anything anymore, so break. - prune(C1, FunctionList); - break; - } - - // Prune C2 and move on to the next candidate. - prune(C2, FunctionList); - } - } -} - -unsigned MachineOutliner::buildCandidateList( - std::vector> &CandidateList, - std::vector &FunctionList, SuffixTree &ST, - InstructionMapper &Mapper) { - - std::vector CandidateSequence; // Current outlining candidate. - unsigned MaxCandidateLen = 0; // Length of the longest candidate. - - MaxCandidateLen = - findCandidates(ST, Mapper, CandidateList, FunctionList); - - // Sort the candidates in decending order. This will simplify the outlining - // process when we have to remove the candidates from the mapping by - // allowing us to cut them out without keeping track of an offset. - std::stable_sort( - CandidateList.begin(), CandidateList.end(), - [](const std::shared_ptr &LHS, - const std::shared_ptr &RHS) { return *LHS < *RHS; }); - - return MaxCandidateLen; } MachineFunction * -MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF, +MachineOutliner::createOutlinedFunction(Module &M, OutlinedFunction &OF, InstructionMapper &Mapper, unsigned Name) { @@ -1319,7 +1126,8 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF, // function. This makes sure the outlined function knows what kinds of // instructions are going into it. This is fine, since all parent functions // must necessarily support the instructions that are in the outlined region. - const Function &ParentFn = OF.Candidates.front()->getMF()->getFunction(); + Candidate &FirstCand = OF.Candidates.front(); + const Function &ParentFn = FirstCand.getMF()->getFunction(); if (ParentFn.hasFnAttribute("target-features")) F->addFnAttr(ParentFn.getFnAttribute("target-features")); @@ -1336,11 +1144,9 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF, // Insert the new function into the module. MF.insert(MF.begin(), &MBB); - // Copy over the instructions for the function using the integer mappings in - // its sequence. - for (unsigned Str : OF.Sequence) { - MachineInstr *NewMI = - MF.CloneMachineInstr(Mapper.IntegerInstructionMap.find(Str)->second); + for (auto I = FirstCand.front(), E = std::next(FirstCand.back()); I != E; + ++I) { + MachineInstr *NewMI = MF.CloneMachineInstr(&*I); NewMI->dropMemRefs(MF); // Don't keep debug information for outlined instructions. @@ -1372,9 +1178,10 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF, Unit /* File */, 0 /* Line 0 is reserved for compiler-generated code. */, DB.createSubroutineType(DB.getOrCreateTypeArray(None)), /* void type */ - false, true, 0, /* Line 0 is reserved for compiler-generated code. */ + 0, /* Line 0 is reserved for compiler-generated code. */ DINode::DIFlags::FlagArtificial /* Compiler-generated code. */, - true /* Outlined code is optimized code by definition. */); + /* Outlined code is optimized code by definition. */ + DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized); // Don't add any new variables to the subprogram. DB.finalizeSubprogram(OutlinedSP); @@ -1388,87 +1195,100 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF, return &MF; } -bool MachineOutliner::outline( - Module &M, const ArrayRef> &CandidateList, - std::vector &FunctionList, InstructionMapper &Mapper) { +bool MachineOutliner::outline(Module &M, + std::vector &FunctionList, + InstructionMapper &Mapper) { bool OutlinedSomething = false; // Number to append to the current outlined function. unsigned OutlinedFunctionNum = 0; - // Replace the candidates with calls to their respective outlined functions. - for (const std::shared_ptr &Cptr : CandidateList) { - Candidate &C = *Cptr; - // Was the candidate removed during pruneOverlaps? - if (!C.InCandidateList) - continue; - - // If not, then look at its OutlinedFunction. - OutlinedFunction &OF = FunctionList[C.FunctionIdx]; + // Sort by benefit. The most beneficial functions should be outlined first. + std::stable_sort( + FunctionList.begin(), FunctionList.end(), + [](const OutlinedFunction &LHS, const OutlinedFunction &RHS) { + return LHS.getBenefit() > RHS.getBenefit(); + }); + + // Walk over each function, outlining them as we go along. Functions are + // outlined greedily, based off the sort above. + for (OutlinedFunction &OF : FunctionList) { + // If we outlined something that overlapped with a candidate in a previous + // step, then we can't outline from it. + erase_if(OF.Candidates, [&Mapper](Candidate &C) { + return std::any_of( + Mapper.UnsignedVec.begin() + C.getStartIdx(), + Mapper.UnsignedVec.begin() + C.getEndIdx() + 1, + [](unsigned I) { return (I == static_cast(-1)); }); + }); - // Was its OutlinedFunction made unbeneficial during pruneOverlaps? + // If we made it unbeneficial to outline this function, skip it. if (OF.getBenefit() < 1) continue; - // Does this candidate have a function yet? - if (!OF.MF) { - OF.MF = createOutlinedFunction(M, OF, Mapper, OutlinedFunctionNum); - emitOutlinedFunctionRemark(OF); - FunctionsCreated++; - OutlinedFunctionNum++; // Created a function, move to the next name. - } - + // It's beneficial. Create the function and outline its sequence's + // occurrences. + OF.MF = createOutlinedFunction(M, OF, Mapper, OutlinedFunctionNum); + emitOutlinedFunctionRemark(OF); + FunctionsCreated++; + OutlinedFunctionNum++; // Created a function, move to the next name. MachineFunction *MF = OF.MF; - MachineBasicBlock &MBB = *C.getMBB(); - MachineBasicBlock::iterator StartIt = C.front(); - MachineBasicBlock::iterator EndIt = C.back(); - assert(StartIt != C.getMBB()->end() && "StartIt out of bounds!"); - assert(EndIt != C.getMBB()->end() && "EndIt out of bounds!"); - const TargetSubtargetInfo &STI = MF->getSubtarget(); const TargetInstrInfo &TII = *STI.getInstrInfo(); - // Insert a call to the new function and erase the old sequence. - auto CallInst = TII.insertOutlinedCall(M, MBB, StartIt, *OF.MF, C); - - // If the caller tracks liveness, then we need to make sure that anything - // we outline doesn't break liveness assumptions. - // The outlined functions themselves currently don't track liveness, but - // we should make sure that the ranges we yank things out of aren't - // wrong. - if (MBB.getParent()->getProperties().hasProperty( - MachineFunctionProperties::Property::TracksLiveness)) { - // Helper lambda for adding implicit def operands to the call instruction. - auto CopyDefs = [&CallInst](MachineInstr &MI) { - for (MachineOperand &MOP : MI.operands()) { - // Skip over anything that isn't a register. - if (!MOP.isReg()) - continue; - - // If it's a def, add it to the call instruction. - if (MOP.isDef()) - CallInst->addOperand( - MachineOperand::CreateReg(MOP.getReg(), true, /* isDef = true */ - true /* isImp = true */)); - } - }; + // Replace occurrences of the sequence with calls to the new function. + for (Candidate &C : OF.Candidates) { + MachineBasicBlock &MBB = *C.getMBB(); + MachineBasicBlock::iterator StartIt = C.front(); + MachineBasicBlock::iterator EndIt = C.back(); + + // Insert the call. + auto CallInst = TII.insertOutlinedCall(M, MBB, StartIt, *MF, C); + + // If the caller tracks liveness, then we need to make sure that + // anything we outline doesn't break liveness assumptions. The outlined + // functions themselves currently don't track liveness, but we should + // make sure that the ranges we yank things out of aren't wrong. + if (MBB.getParent()->getProperties().hasProperty( + MachineFunctionProperties::Property::TracksLiveness)) { + // Helper lambda for adding implicit def operands to the call + // instruction. + auto CopyDefs = [&CallInst](MachineInstr &MI) { + for (MachineOperand &MOP : MI.operands()) { + // Skip over anything that isn't a register. + if (!MOP.isReg()) + continue; + + // If it's a def, add it to the call instruction. + if (MOP.isDef()) + CallInst->addOperand(MachineOperand::CreateReg( + MOP.getReg(), true, /* isDef = true */ + true /* isImp = true */)); + } + }; + // Copy over the defs in the outlined range. + // First inst in outlined range <-- Anything that's defined in this + // ... .. range has to be added as an + // implicit Last inst in outlined range <-- def to the call + // instruction. + std::for_each(CallInst, std::next(EndIt), CopyDefs); + } - // Copy over the defs in the outlined range. - // First inst in outlined range <-- Anything that's defined in this - // ... .. range has to be added as an implicit - // Last inst in outlined range <-- def to the call instruction. - std::for_each(CallInst, std::next(EndIt), CopyDefs); - } + // Erase from the point after where the call was inserted up to, and + // including, the final instruction in the sequence. + // Erase needs one past the end, so we need std::next there too. + MBB.erase(std::next(StartIt), std::next(EndIt)); - // Erase from the point after where the call was inserted up to, and - // including, the final instruction in the sequence. - // Erase needs one past the end, so we need std::next there too. - MBB.erase(std::next(StartIt), std::next(EndIt)); - OutlinedSomething = true; + // Keep track of what we removed by marking them all as -1. + std::for_each(Mapper.UnsignedVec.begin() + C.getStartIdx(), + Mapper.UnsignedVec.begin() + C.getEndIdx() + 1, + [](unsigned &I) { I = static_cast(-1); }); + OutlinedSomething = true; - // Statistics. - NumOutlined++; + // Statistics. + NumOutlined++; + } } LLVM_DEBUG(dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";); @@ -1628,18 +1448,10 @@ bool MachineOutliner::runOnModule(Module &M) { // Prepare instruction mappings for the suffix tree. populateMapper(Mapper, M, MMI); - - // Construct a suffix tree, use it to find candidates, and then outline them. - SuffixTree ST(Mapper.UnsignedVec); - std::vector> CandidateList; std::vector FunctionList; // Find all of the outlining candidates. - unsigned MaxCandidateLen = - buildCandidateList(CandidateList, FunctionList, ST, Mapper); - - // Remove candidates that overlap with other candidates. - pruneOverlaps(CandidateList, FunctionList, Mapper, MaxCandidateLen); + findCandidates(Mapper, FunctionList); // If we've requested size remarks, then collect the MI counts of every // function before outlining, and the MI counts after outlining. @@ -1656,7 +1468,7 @@ bool MachineOutliner::runOnModule(Module &M) { initSizeRemarkInfo(M, MMI, FunctionToInstrCount); // Outline each of the candidates and return true if something was outlined. - bool OutlinedSomething = outline(M, CandidateList, FunctionList, Mapper); + bool OutlinedSomething = outline(M, FunctionList, Mapper); // If we outlined something, we definitely changed the MI count of the // module. If we've asked for size remarks, then output them. diff --git a/lib/CodeGen/MachinePassRegistry.cpp b/lib/CodeGen/MachinePassRegistry.cpp deleted file mode 100644 index 3ee3e40b27e236d5d984c7dc3ed4618555f7834e..0000000000000000000000000000000000000000 --- a/lib/CodeGen/MachinePassRegistry.cpp +++ /dev/null @@ -1,55 +0,0 @@ -//===-- CodeGen/MachineInstr.cpp ------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the machine function pass registry for register allocators -// and instruction schedulers. -// -//===----------------------------------------------------------------------===// - -#include "llvm/CodeGen/MachinePassRegistry.h" - -using namespace llvm; - -void MachinePassRegistryListener::anchor() { } - -/// setDefault - Set the default constructor by name. -void MachinePassRegistry::setDefault(StringRef Name) { - MachinePassCtor Ctor = nullptr; - for(MachinePassRegistryNode *R = getList(); R; R = R->getNext()) { - if (R->getName() == Name) { - Ctor = R->getCtor(); - break; - } - } - assert(Ctor && "Unregistered pass name"); - setDefault(Ctor); -} - -/// Add - Adds a function pass to the registration list. -/// -void MachinePassRegistry::Add(MachinePassRegistryNode *Node) { - Node->setNext(List); - List = Node; - if (Listener) Listener->NotifyAdd(Node->getName(), - Node->getCtor(), - Node->getDescription()); -} - - -/// Remove - Removes a function pass from the registration list. -/// -void MachinePassRegistry::Remove(MachinePassRegistryNode *Node) { - for (MachinePassRegistryNode **I = &List; *I; I = (*I)->getNextAddress()) { - if (*I == Node) { - if (Listener) Listener->NotifyRemove(Node->getName()); - *I = (*I)->getNext(); - break; - } - } -} diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index bb5fc664c5f593e9469459bfcbb4346633901cf1..de8c55585d3fdfdd37593c3b9c0918e9f4e1982d 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -1121,11 +1121,12 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { // First, perform the cheaper check that compares the base register. // If they are the same and the load offset is less than the store // offset, then mark the dependence as loop carried potentially. - unsigned BaseReg1, BaseReg2; + MachineOperand *BaseOp1, *BaseOp2; int64_t Offset1, Offset2; - if (TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) && - TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) { - if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) { + if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, TRI) && + TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) { + if (BaseOp1->isIdenticalTo(*BaseOp2) && + (int)Offset1 < (int)Offset2) { assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) && "What happened to the chain edge?"); SDep Dep(Load, SDep::Barrier); @@ -3246,11 +3247,16 @@ void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs, /// during each iteration. Set Delta to the amount of the change. bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - unsigned BaseReg; + MachineOperand *BaseOp; int64_t Offset; - if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) + if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI)) return false; + if (!BaseOp->isReg()) + return false; + + unsigned BaseReg = BaseOp->getReg(); + MachineRegisterInfo &MRI = MF.getRegInfo(); // Check if there is a Phi. If so, get the definition in the loop. MachineInstr *BaseDef = MRI.getVRegDef(BaseReg); @@ -3653,19 +3659,19 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep, if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD)) return true; - unsigned BaseRegS, BaseRegD; + MachineOperand *BaseOpS, *BaseOpD; int64_t OffsetS, OffsetD; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (!TII->getMemOpBaseRegImmOfs(*SI, BaseRegS, OffsetS, TRI) || - !TII->getMemOpBaseRegImmOfs(*DI, BaseRegD, OffsetD, TRI)) + if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, TRI) || + !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, TRI)) return true; - if (BaseRegS != BaseRegD) + if (!BaseOpS->isIdenticalTo(*BaseOpD)) return true; // Check that the base register is incremented by a constant value for each // iteration. - MachineInstr *Def = MRI.getVRegDef(BaseRegS); + MachineInstr *Def = MRI.getVRegDef(BaseOpS->getReg()); if (!Def || !Def->isPHI()) return true; unsigned InitVal = 0; diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 9ab405f38d4d239716a8ea371311cf2fc6bb22a4..ec11586b08ead6fb3326cbcbe2449e03afa07865 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -41,6 +41,7 @@ #include "llvm/CodeGen/ScheduleDFS.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -240,7 +241,8 @@ void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -MachinePassRegistry MachineSchedRegistry::Registry; +MachinePassRegistry + MachineSchedRegistry::Registry; /// A dummy default scheduler factory indicates whether the scheduler /// is overridden on the command line. @@ -1482,15 +1484,40 @@ namespace { class BaseMemOpClusterMutation : public ScheduleDAGMutation { struct MemOpInfo { SUnit *SU; - unsigned BaseReg; + MachineOperand *BaseOp; int64_t Offset; - MemOpInfo(SUnit *su, unsigned reg, int64_t ofs) - : SU(su), BaseReg(reg), Offset(ofs) {} + MemOpInfo(SUnit *su, MachineOperand *Op, int64_t ofs) + : SU(su), BaseOp(Op), Offset(ofs) {} + + bool operator<(const MemOpInfo &RHS) const { + if (BaseOp->getType() != RHS.BaseOp->getType()) + return BaseOp->getType() < RHS.BaseOp->getType(); + + if (BaseOp->isReg()) + return std::make_tuple(BaseOp->getReg(), Offset, SU->NodeNum) < + std::make_tuple(RHS.BaseOp->getReg(), RHS.Offset, + RHS.SU->NodeNum); + if (BaseOp->isFI()) { + const MachineFunction &MF = + *BaseOp->getParent()->getParent()->getParent(); + const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); + bool StackGrowsDown = TFI.getStackGrowthDirection() == + TargetFrameLowering::StackGrowsDown; + // Can't use tuple comparison here since we might need to use a + // different order when the stack grows down. + if (BaseOp->getIndex() != RHS.BaseOp->getIndex()) + return StackGrowsDown ? BaseOp->getIndex() > RHS.BaseOp->getIndex() + : BaseOp->getIndex() < RHS.BaseOp->getIndex(); + + if (Offset != RHS.Offset) + return StackGrowsDown ? Offset > RHS.Offset : Offset < RHS.Offset; + + return SU->NodeNum < RHS.SU->NodeNum; + } - bool operator<(const MemOpInfo&RHS) const { - return std::tie(BaseReg, Offset, SU->NodeNum) < - std::tie(RHS.BaseReg, RHS.Offset, RHS.SU->NodeNum); + llvm_unreachable("MemOpClusterMutation only supports register or frame " + "index bases."); } }; @@ -1546,10 +1573,10 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps( ArrayRef MemOps, ScheduleDAGMI *DAG) { SmallVector MemOpRecords; for (SUnit *SU : MemOps) { - unsigned BaseReg; + MachineOperand *BaseOp; int64_t Offset; - if (TII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseReg, Offset, TRI)) - MemOpRecords.push_back(MemOpInfo(SU, BaseReg, Offset)); + if (TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, TRI)) + MemOpRecords.push_back(MemOpInfo(SU, BaseOp, Offset)); } if (MemOpRecords.size() < 2) return; @@ -1559,8 +1586,8 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps( for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { SUnit *SUa = MemOpRecords[Idx].SU; SUnit *SUb = MemOpRecords[Idx+1].SU; - if (TII->shouldClusterMemOps(*SUa->getInstr(), MemOpRecords[Idx].BaseReg, - *SUb->getInstr(), MemOpRecords[Idx+1].BaseReg, + if (TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp, + *MemOpRecords[Idx + 1].BaseOp, ClusterLength) && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU(" @@ -2515,7 +2542,7 @@ const char *GenericSchedulerBase::getReasonStr( switch (Reason) { case NoCand: return "NOCAND "; case Only1: return "ONLY1 "; - case PhysRegCopy: return "PREG-COPY "; + case PhysReg: return "PHYS-REG "; case RegExcess: return "REG-EXCESS"; case RegCritical: return "REG-CRIT "; case Stall: return "STALL "; @@ -2851,24 +2878,41 @@ unsigned getWeakLeft(const SUnit *SU, bool isTop) { /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled /// with the operation that produces or consumes the physreg. We'll do this when /// regalloc has support for parallel copies. -int biasPhysRegCopy(const SUnit *SU, bool isTop) { +int biasPhysReg(const SUnit *SU, bool isTop) { const MachineInstr *MI = SU->getInstr(); - if (!MI->isCopy()) - return 0; - unsigned ScheduledOper = isTop ? 1 : 0; - unsigned UnscheduledOper = isTop ? 0 : 1; - // If we have already scheduled the physreg produce/consumer, immediately - // schedule the copy. - if (TargetRegisterInfo::isPhysicalRegister( - MI->getOperand(ScheduledOper).getReg())) - return 1; - // If the physreg is at the boundary, defer it. Otherwise schedule it - // immediately to free the dependent. We can hoist the copy later. - bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft; - if (TargetRegisterInfo::isPhysicalRegister( - MI->getOperand(UnscheduledOper).getReg())) - return AtBoundary ? -1 : 1; + if (MI->isCopy()) { + unsigned ScheduledOper = isTop ? 1 : 0; + unsigned UnscheduledOper = isTop ? 0 : 1; + // If we have already scheduled the physreg produce/consumer, immediately + // schedule the copy. + if (TargetRegisterInfo::isPhysicalRegister( + MI->getOperand(ScheduledOper).getReg())) + return 1; + // If the physreg is at the boundary, defer it. Otherwise schedule it + // immediately to free the dependent. We can hoist the copy later. + bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft; + if (TargetRegisterInfo::isPhysicalRegister( + MI->getOperand(UnscheduledOper).getReg())) + return AtBoundary ? -1 : 1; + } + + if (MI->isMoveImmediate()) { + // If we have a move immediate and all successors have been assigned, bias + // towards scheduling this later. Make sure all register defs are to + // physical registers. + bool DoBias = true; + for (const MachineOperand &Op : MI->defs()) { + if (Op.isReg() && !TargetRegisterInfo::isPhysicalRegister(Op.getReg())) { + DoBias = false; + break; + } + } + + if (DoBias) + return isTop ? -1 : 1; + } + return 0; } } // end namespace llvm @@ -2929,9 +2973,9 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, return; } - if (tryGreater(biasPhysRegCopy(TryCand.SU, TryCand.AtTop), - biasPhysRegCopy(Cand.SU, Cand.AtTop), - TryCand, Cand, PhysRegCopy)) + // Bias PhysReg Defs and copies to their uses and defined respectively. + if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) return; // Avoid exceeding the target's limit. @@ -3178,7 +3222,7 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) { return SU; } -void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { +void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) { MachineBasicBlock::iterator InsertPos = SU->getInstr(); if (!isTop) ++InsertPos; @@ -3193,7 +3237,7 @@ void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1) continue; MachineInstr *Copy = DepSU->getInstr(); - if (!Copy->isCopy()) + if (!Copy->isCopy() && !Copy->isMoveImmediate()) continue; LLVM_DEBUG(dbgs() << " Rescheduling physreg copy "; DAG->dumpNode(*Dep.getSUnit())); @@ -3207,18 +3251,18 @@ void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { /// does. /// /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling -/// them here. See comments in biasPhysRegCopy. +/// them here. See comments in biasPhysReg. void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { if (IsTopNode) { SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle()); Top.bumpNode(SU); if (SU->hasPhysRegUses) - reschedulePhysRegCopies(SU, true); + reschedulePhysReg(SU, true); } else { SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle()); Bot.bumpNode(SU); if (SU->hasPhysRegDefs) - reschedulePhysRegCopies(SU, false); + reschedulePhysReg(SU, false); } } diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index d45855407f28f26dc75357657c2c2d9499a3ec1e..cdc597db640166ed5973436a9520e5240aef470b 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -716,9 +716,12 @@ static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI, !PredBB->getTerminator()->getMetadata(LLVMContext::MD_make_implicit)) return false; - unsigned BaseReg; + MachineOperand *BaseOp; int64_t Offset; - if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) + if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI)) + return false; + + if (!BaseOp->isReg()) return false; if (!(MI.mayLoad() && !MI.isPredicable())) @@ -731,7 +734,7 @@ static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI, return MBP.LHS.isReg() && MBP.RHS.isImm() && MBP.RHS.getImm() == 0 && (MBP.Predicate == MachineBranchPredicate::PRED_NE || MBP.Predicate == MachineBranchPredicate::PRED_EQ) && - MBP.LHS.getReg() == BaseReg; + MBP.LHS.getReg() == BaseOp->getReg(); } /// Sink an instruction and its associated debug instructions. If the debug diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index 79ca6adf95c4604b08feb6644bd5bca7558759e3..e62ed3094651e9113271b2ca10c661c00c54c515 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -218,8 +218,7 @@ computeHeightResources(const MachineBasicBlock *MBB) { // The trace tail is done. if (!TBI->Succ) { TBI->Tail = MBB->getNumber(); - std::copy(PRCycles.begin(), PRCycles.end(), - ProcResourceHeights.begin() + PROffset); + llvm::copy(PRCycles, ProcResourceHeights.begin() + PROffset); return; } diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index b37c421596b757f8f78aa84aff8bd7c8cd6274c4..f4804e734f612e99de4920f21cb947e65467ed73 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -1055,6 +1055,89 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { } break; } + case TargetOpcode::G_MERGE_VALUES: { + // G_MERGE_VALUES should only be used to merge scalars into a larger scalar, + // e.g. s2N = MERGE sN, sN + // Merging multiple scalars into a vector is not allowed, should use + // G_BUILD_VECTOR for that. + LLT DstTy = MRI->getType(MI->getOperand(0).getReg()); + LLT SrcTy = MRI->getType(MI->getOperand(1).getReg()); + if (DstTy.isVector() || SrcTy.isVector()) + report("G_MERGE_VALUES cannot operate on vectors", MI); + break; + } + case TargetOpcode::G_UNMERGE_VALUES: { + LLT DstTy = MRI->getType(MI->getOperand(0).getReg()); + LLT SrcTy = MRI->getType(MI->getOperand(MI->getNumOperands()-1).getReg()); + // For now G_UNMERGE can split vectors. + for (unsigned i = 0; i < MI->getNumOperands()-1; ++i) { + if (MRI->getType(MI->getOperand(i).getReg()) != DstTy) + report("G_UNMERGE_VALUES destination types do not match", MI); + } + if (SrcTy.getSizeInBits() != + (DstTy.getSizeInBits() * (MI->getNumOperands() - 1))) { + report("G_UNMERGE_VALUES source operand does not cover dest operands", + MI); + } + break; + } + case TargetOpcode::G_BUILD_VECTOR: { + // Source types must be scalars, dest type a vector. Total size of scalars + // must match the dest vector size. + LLT DstTy = MRI->getType(MI->getOperand(0).getReg()); + LLT SrcEltTy = MRI->getType(MI->getOperand(1).getReg()); + if (!DstTy.isVector() || SrcEltTy.isVector()) + report("G_BUILD_VECTOR must produce a vector from scalar operands", MI); + for (unsigned i = 2; i < MI->getNumOperands(); ++i) { + if (MRI->getType(MI->getOperand(1).getReg()) != + MRI->getType(MI->getOperand(i).getReg())) + report("G_BUILD_VECTOR source operand types are not homogeneous", MI); + } + if (DstTy.getSizeInBits() != + SrcEltTy.getSizeInBits() * (MI->getNumOperands() - 1)) + report("G_BUILD_VECTOR src operands total size don't match dest " + "size.", + MI); + break; + } + case TargetOpcode::G_BUILD_VECTOR_TRUNC: { + // Source types must be scalars, dest type a vector. Scalar types must be + // larger than the dest vector elt type, as this is a truncating operation. + LLT DstTy = MRI->getType(MI->getOperand(0).getReg()); + LLT SrcEltTy = MRI->getType(MI->getOperand(1).getReg()); + if (!DstTy.isVector() || SrcEltTy.isVector()) + report("G_BUILD_VECTOR_TRUNC must produce a vector from scalar operands", + MI); + for (unsigned i = 2; i < MI->getNumOperands(); ++i) { + if (MRI->getType(MI->getOperand(1).getReg()) != + MRI->getType(MI->getOperand(i).getReg())) + report("G_BUILD_VECTOR_TRUNC source operand types are not homogeneous", + MI); + } + if (SrcEltTy.getSizeInBits() <= DstTy.getElementType().getSizeInBits()) + report("G_BUILD_VECTOR_TRUNC source operand types are not larger than " + "dest elt type", + MI); + break; + } + case TargetOpcode::G_CONCAT_VECTORS: { + // Source types should be vectors, and total size should match the dest + // vector size. + LLT DstTy = MRI->getType(MI->getOperand(0).getReg()); + LLT SrcTy = MRI->getType(MI->getOperand(1).getReg()); + if (!DstTy.isVector() || !SrcTy.isVector()) + report("G_CONCAT_VECTOR requires vector source and destination operands", + MI); + for (unsigned i = 2; i < MI->getNumOperands(); ++i) { + if (MRI->getType(MI->getOperand(1).getReg()) != + MRI->getType(MI->getOperand(i).getReg())) + report("G_CONCAT_VECTOR source operand types are not homogeneous", MI); + } + if (DstTy.getNumElements() != + SrcTy.getNumElements() * (MI->getNumOperands() - 1)) + report("G_CONCAT_VECTOR num dest and source elements should match", MI); + break; + } case TargetOpcode::COPY: { if (foundErrors) break; diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp index befa8422d3993ecebe571b0dc4010c6c0f45ea04..770f6c5b0403020590152ae11162e080a5bf3687 100644 --- a/lib/CodeGen/OptimizePHIs.cpp +++ b/lib/CodeGen/OptimizePHIs.cpp @@ -90,10 +90,10 @@ bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) { } /// IsSingleValuePHICycle - Check if MI is a PHI where all the source operands -/// are copies of SingleValReg, possibly via copies through other PHIs. If +/// are copies of SingleValReg, possibly via copies through other PHIs. If /// SingleValReg is zero on entry, it is set to the register with the single -/// non-copy value. PHIsInCycle is a set used to keep track of the PHIs that -/// have been scanned. +/// non-copy value. PHIsInCycle is a set used to keep track of the PHIs that +/// have been scanned. PHIs may be grouped by cycle, several cycles or chains. bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI, unsigned &SingleValReg, InstrSet &PHIsInCycle) { @@ -119,8 +119,10 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI, if (SrcMI && SrcMI->isCopy() && !SrcMI->getOperand(0).getSubReg() && !SrcMI->getOperand(1).getSubReg() && - TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg())) - SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg()); + TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg())) { + SrcReg = SrcMI->getOperand(1).getReg(); + SrcMI = MRI->getVRegDef(SrcReg); + } if (!SrcMI) return false; @@ -129,7 +131,7 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI, return false; } else { // Fail if there is more than one non-phi/non-move register. - if (SingleValReg != 0) + if (SingleValReg != 0 && SingleValReg != SrcReg) return false; SingleValReg = SrcReg; } @@ -180,6 +182,9 @@ bool OptimizePHIs::OptimizeBB(MachineBasicBlock &MBB) { if (!MRI->constrainRegClass(SingleValReg, MRI->getRegClass(OldReg))) continue; + // for the case SingleValReg taken from copy instr + MRI->clearKillFlags(SingleValReg); + MRI->replaceRegWith(OldReg, SingleValReg); MI->eraseFromParent(); ++NumPHICycles; diff --git a/lib/CodeGen/PreISelIntrinsicLowering.cpp b/lib/CodeGen/PreISelIntrinsicLowering.cpp index 8f88ef78828aed09d88c85f828034c47a16f387f..b0e9ac03612d10c45cc6e51ff9e7f5cf7e206183 100644 --- a/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -7,13 +7,15 @@ // //===----------------------------------------------------------------------===// // -// This pass implements IR lowering for the llvm.load.relative intrinsic. +// This pass implements IR lowering for the llvm.load.relative and llvm.objc.* +// intrinsics. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -55,11 +57,129 @@ static bool lowerLoadRelative(Function &F) { return Changed; } +static bool lowerObjCCall(Function &F, const char *NewFn, + bool setNonLazyBind = false) { + if (F.use_empty()) + return false; + + // If we haven't already looked up this function, check to see if the + // program already contains a function with this name. + Module *M = F.getParent(); + Constant* FCache = M->getOrInsertFunction(NewFn, F.getFunctionType()); + + if (Function* Fn = dyn_cast(FCache)) { + Fn->setLinkage(F.getLinkage()); + if (setNonLazyBind && !Fn->isWeakForLinker()) { + // If we have Native ARC, set nonlazybind attribute for these APIs for + // performance. + Fn->addFnAttr(Attribute::NonLazyBind); + } + } + + for (auto I = F.use_begin(), E = F.use_end(); I != E;) { + auto *CI = dyn_cast(I->getUser()); + assert(CI->getCalledFunction() && "Cannot lower an indirect call!"); + ++I; + + IRBuilder<> Builder(CI->getParent(), CI->getIterator()); + SmallVector Args(CI->arg_begin(), CI->arg_end()); + CallInst *NewCI = Builder.CreateCall(FCache, Args); + NewCI->setName(CI->getName()); + NewCI->setTailCallKind(CI->getTailCallKind()); + if (!CI->use_empty()) + CI->replaceAllUsesWith(NewCI); + CI->eraseFromParent(); + } + + return true; +} + static bool lowerIntrinsics(Module &M) { bool Changed = false; for (Function &F : M) { - if (F.getName().startswith("llvm.load.relative.")) + if (F.getName().startswith("llvm.load.relative.")) { Changed |= lowerLoadRelative(F); + continue; + } + switch (F.getIntrinsicID()) { + default: + break; + case Intrinsic::objc_autorelease: + Changed |= lowerObjCCall(F, "objc_autorelease"); + break; + case Intrinsic::objc_autoreleasePoolPop: + Changed |= lowerObjCCall(F, "objc_autoreleasePoolPop"); + break; + case Intrinsic::objc_autoreleasePoolPush: + Changed |= lowerObjCCall(F, "objc_autoreleasePoolPush"); + break; + case Intrinsic::objc_autoreleaseReturnValue: + Changed |= lowerObjCCall(F, "objc_autoreleaseReturnValue"); + break; + case Intrinsic::objc_copyWeak: + Changed |= lowerObjCCall(F, "objc_copyWeak"); + break; + case Intrinsic::objc_destroyWeak: + Changed |= lowerObjCCall(F, "objc_destroyWeak"); + break; + case Intrinsic::objc_initWeak: + Changed |= lowerObjCCall(F, "objc_initWeak"); + break; + case Intrinsic::objc_loadWeak: + Changed |= lowerObjCCall(F, "objc_loadWeak"); + break; + case Intrinsic::objc_loadWeakRetained: + Changed |= lowerObjCCall(F, "objc_loadWeakRetained"); + break; + case Intrinsic::objc_moveWeak: + Changed |= lowerObjCCall(F, "objc_moveWeak"); + break; + case Intrinsic::objc_release: + Changed |= lowerObjCCall(F, "objc_release", true); + break; + case Intrinsic::objc_retain: + Changed |= lowerObjCCall(F, "objc_retain", true); + break; + case Intrinsic::objc_retainAutorelease: + Changed |= lowerObjCCall(F, "objc_retainAutorelease"); + break; + case Intrinsic::objc_retainAutoreleaseReturnValue: + Changed |= lowerObjCCall(F, "objc_retainAutoreleaseReturnValue"); + break; + case Intrinsic::objc_retainAutoreleasedReturnValue: + Changed |= lowerObjCCall(F, "objc_retainAutoreleasedReturnValue"); + break; + case Intrinsic::objc_retainBlock: + Changed |= lowerObjCCall(F, "objc_retainBlock"); + break; + case Intrinsic::objc_storeStrong: + Changed |= lowerObjCCall(F, "objc_storeStrong"); + break; + case Intrinsic::objc_storeWeak: + Changed |= lowerObjCCall(F, "objc_storeWeak"); + break; + case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue: + Changed |= lowerObjCCall(F, "objc_unsafeClaimAutoreleasedReturnValue"); + break; + case Intrinsic::objc_retainedObject: + Changed |= lowerObjCCall(F, "objc_retainedObject"); + break; + case Intrinsic::objc_unretainedObject: + Changed |= lowerObjCCall(F, "objc_unretainedObject"); + break; + case Intrinsic::objc_unretainedPointer: + Changed |= lowerObjCCall(F, "objc_unretainedPointer"); + break; + case Intrinsic::objc_retain_autorelease: + Changed |= lowerObjCCall(F, "objc_retain_autorelease"); + break; + case Intrinsic::objc_sync_enter: + Changed |= lowerObjCCall(F, "objc_sync_enter"); + break; + case Intrinsic::objc_sync_exit: + Changed |= lowerObjCCall(F, "objc_sync_exit"); + break; + } } return Changed; } diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index fc62c8caf59edfb0a8c9d39301778f4f360012ac..23754e487a18112e00d87c0009759dce6a07865e 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -75,6 +75,10 @@ using namespace llvm; using MBBVector = SmallVector; +STATISTIC(NumLeafFuncWithSpills, "Number of leaf functions with CSRs"); +STATISTIC(NumFuncSeen, "Number of functions seen in PEI"); + + namespace { class PEI : public MachineFunctionPass { @@ -168,6 +172,7 @@ using StackObjSet = SmallSetVector; /// runOnMachineFunction - Insert prolog/epilog code and replace abstract /// frame indexes with appropriate references. bool PEI::runOnMachineFunction(MachineFunction &MF) { + NumFuncSeen++; const Function &F = MF.getFunction(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); @@ -357,6 +362,11 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F, // Now that we know which registers need to be saved and restored, allocate // stack slots for them. for (auto &CS : CSI) { + // If the target has spilled this register to another register, we don't + // need to allocate a stack slot. + if (CS.isSpilledToReg()) + continue; + unsigned Reg = CS.getReg(); const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); @@ -454,7 +464,22 @@ static void updateLiveness(MachineFunction &MF) { if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg)) MBB->addLiveIn(Reg); } + // If callee-saved register is spilled to another register rather than + // spilling to stack, the destination register has to be marked as live for + // each MBB between the prologue and epilogue so that it is not clobbered + // before it is reloaded in the epilogue. The Visited set contains all + // blocks outside of the region delimited by prologue/epilogue. + if (CSI[i].isSpilledToReg()) { + for (MachineBasicBlock &MBB : MF) { + if (Visited.count(&MBB)) + continue; + MCPhysReg DstReg = CSI[i].getDstReg(); + if (!MBB.isLiveIn(DstReg)) + MBB.addLiveIn(DstReg); + } + } } + } /// Insert restore code for the callee-saved registers used in the function. @@ -530,6 +555,9 @@ void PEI::spillCalleeSavedRegs(MachineFunction &MF) { std::vector &CSI = MFI.getCalleeSavedInfo(); if (!CSI.empty()) { + if (!MFI.hasCalls()) + NumLeafFuncWithSpills++; + for (MachineBasicBlock *SaveBlock : SaveBlocks) { insertCSRSaves(*SaveBlock, CSI); // Update the live-in information of all the blocks up to the save @@ -1090,7 +1118,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF, MachineOperand &Offset = MI.getOperand(i + 1); int refOffset = TFI->getFrameIndexReferencePreferSP( MF, MI.getOperand(i).getIndex(), Reg, /*IgnoreSPUpdates*/ false); - Offset.setImm(Offset.getImm() + refOffset); + Offset.setImm(Offset.getImm() + refOffset + SPAdj); MI.getOperand(i).ChangeToRegister(Reg, false /*isDef*/); continue; } diff --git a/lib/CodeGen/README.txt b/lib/CodeGen/README.txt index 3318e109155b8eb1ca96e212a1bde9949b39276b..d8958715c6b45455dc00329d5bf2dbbdd5953fe5 100644 --- a/lib/CodeGen/README.txt +++ b/lib/CodeGen/README.txt @@ -156,8 +156,8 @@ doing the wrong thing. //===---------------------------------------------------------------------===// It would be really nice to be able to write patterns in .td files for copies, -which would eliminate a bunch of explicit predicates on them (e.g. no side -effects). Once this is in place, it would be even better to have tblgen +which would eliminate a bunch of explicit predicates on them (e.g. no side +effects). Once this is in place, it would be even better to have tblgen synthesize the various copy insertion/inspection methods in TargetInstrInfo. //===---------------------------------------------------------------------===// diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index ea7f247214de3ea98dc3d6821914598d1281a711..eb3a4e481f5d37d4ba9e8d8b22c79b1d1020ef43 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -177,6 +177,8 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; void allocateBasicBlock(MachineBasicBlock &MBB); + void allocateInstruction(MachineInstr &MI); + void handleDebugValue(MachineInstr &MI); void handleThroughOperands(MachineInstr &MI, SmallVectorImpl &VirtDead); bool isLastUseOfLocalReg(const MachineOperand &MO) const; @@ -207,7 +209,7 @@ namespace { LiveReg &reloadVirtReg(MachineInstr &MI, unsigned OpNum, unsigned VirtReg, unsigned Hint); void spillAll(MachineBasicBlock::iterator MI); - bool setPhysReg(MachineInstr &MI, unsigned OpNum, MCPhysReg PhysReg); + bool setPhysReg(MachineInstr &MI, MachineOperand &MO, MCPhysReg PhysReg); int getStackSpaceFor(unsigned VirtReg); void spill(MachineBasicBlock::iterator Before, unsigned VirtReg, @@ -373,7 +375,8 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR) { /// Spill all dirty virtregs without killing them. void RegAllocFast::spillAll(MachineBasicBlock::iterator MI) { - if (LiveVirtRegs.empty()) return; + if (LiveVirtRegs.empty()) + return; // The LiveRegMap is keyed by an unsigned (the virtreg number), so the order // of spilling here is deterministic, if arbitrary. for (LiveReg &LR : LiveVirtRegs) { @@ -490,9 +493,9 @@ void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI, } } -/// Return the cost of spilling clearing out PhysReg and aliases so it is -/// free for allocation. Returns 0 when PhysReg is free or disabled with all -/// aliases disabled - it can be allocated directly. +/// Return the cost of spilling clearing out PhysReg and aliases so it is free +/// for allocation. Returns 0 when PhysReg is free or disabled with all aliases +/// disabled - it can be allocated directly. /// \returns spillImpossible when PhysReg or an alias can't be spilled. unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const { if (isRegUsedInInstr(PhysReg)) { @@ -588,16 +591,13 @@ void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint) { } } - LLVM_DEBUG(dbgs() << "Allocating " << printReg(VirtReg) << " from " - << TRI->getRegClassName(&RC) << '\n'); - - unsigned BestReg = 0; + MCPhysReg BestReg = 0; unsigned BestCost = spillImpossible; for (MCPhysReg PhysReg : AllocationOrder) { LLVM_DEBUG(dbgs() << "\tRegister: " << printReg(PhysReg, TRI) << ' '); unsigned Cost = calcSpillCost(PhysReg); LLVM_DEBUG(dbgs() << "Cost: " << Cost << " BestCost: " << BestCost << '\n'); - // Cost is 0 when all aliases are already disabled. + // Immediate take a register with cost 0. if (Cost == 0) { assignVirtToPhysReg(LR, PhysReg); return; @@ -609,7 +609,8 @@ void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint) { } if (!BestReg) { - // Nothing we can do. Report an error and keep going with a bad allocation. + // Nothing we can do: Report an error and keep going with an invalid + // allocation. if (MI.isInlineAsm()) MI.emitError("inline assembly requires more registers than available"); else @@ -704,9 +705,8 @@ RegAllocFast::LiveReg &RegAllocFast::reloadVirtReg(MachineInstr &MI, /// Changes operand OpNum in MI the refer the PhysReg, considering subregs. This /// may invalidate any operand pointers. Return true if the operand kills its /// register. -bool RegAllocFast::setPhysReg(MachineInstr &MI, unsigned OpNum, +bool RegAllocFast::setPhysReg(MachineInstr &MI, MachineOperand &MO, MCPhysReg PhysReg) { - MachineOperand &MO = MI.getOperand(OpNum); bool Dead = MO.isDead(); if (!MO.getSubReg()) { MO.setReg(PhysReg); @@ -769,7 +769,7 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, SmallVector PartialDefs; LLVM_DEBUG(dbgs() << "Allocating tied uses.\n"); for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI.getOperand(I); + MachineOperand &MO = MI.getOperand(I); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; @@ -780,7 +780,7 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, << ".\n"); LiveReg &LR = reloadVirtReg(MI, I, Reg, 0); MCPhysReg PhysReg = LR.PhysReg; - setPhysReg(MI, I, PhysReg); + setPhysReg(MI, MO, PhysReg); // Note: we don't update the def operand yet. That would cause the normal // def-scan to attempt spilling. } else if (MO.getSubReg() && MI.readsVirtualRegister(Reg)) { @@ -802,7 +802,7 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, continue; // Note: defineVirtReg may invalidate MO. MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, 0); - if (setPhysReg(MI, I, PhysReg)) + if (setPhysReg(MI, MI.getOperand(I), PhysReg)) VirtDead.push_back(Reg); } @@ -860,6 +860,199 @@ void RegAllocFast::dumpState() { } #endif +void RegAllocFast::allocateInstruction(MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + // If this is a copy, we may be able to coalesce. + unsigned CopySrcReg = 0; + unsigned CopyDstReg = 0; + unsigned CopySrcSub = 0; + unsigned CopyDstSub = 0; + if (MI.isCopy()) { + CopyDstReg = MI.getOperand(0).getReg(); + CopySrcReg = MI.getOperand(1).getReg(); + CopyDstSub = MI.getOperand(0).getSubReg(); + CopySrcSub = MI.getOperand(1).getSubReg(); + } + + // Track registers used by instruction. + UsedInInstr.clear(); + + // First scan. + // Mark physreg uses and early clobbers as used. + // Find the end of the virtreg operands + unsigned VirtOpEnd = 0; + bool hasTiedOps = false; + bool hasEarlyClobbers = false; + bool hasPartialRedefs = false; + bool hasPhysDefs = false; + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); + // Make sure MRI knows about registers clobbered by regmasks. + if (MO.isRegMask()) { + MRI->addPhysRegsUsedFromRegMask(MO.getRegMask()); + continue; + } + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (!Reg) continue; + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + VirtOpEnd = i+1; + if (MO.isUse()) { + hasTiedOps = hasTiedOps || + MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1; + } else { + if (MO.isEarlyClobber()) + hasEarlyClobbers = true; + if (MO.getSubReg() && MI.readsVirtualRegister(Reg)) + hasPartialRedefs = true; + } + continue; + } + if (!MRI->isAllocatable(Reg)) continue; + if (MO.isUse()) { + usePhysReg(MO); + } else if (MO.isEarlyClobber()) { + definePhysReg(MI, Reg, + (MO.isImplicit() || MO.isDead()) ? regFree : regReserved); + hasEarlyClobbers = true; + } else + hasPhysDefs = true; + } + + // The instruction may have virtual register operands that must be allocated + // the same register at use-time and def-time: early clobbers and tied + // operands. If there are also physical defs, these registers must avoid + // both physical defs and uses, making them more constrained than normal + // operands. + // Similarly, if there are multiple defs and tied operands, we must make + // sure the same register is allocated to uses and defs. + // We didn't detect inline asm tied operands above, so just make this extra + // pass for all inline asm. + if (MI.isInlineAsm() || hasEarlyClobbers || hasPartialRedefs || + (hasTiedOps && (hasPhysDefs || MCID.getNumDefs() > 1))) { + handleThroughOperands(MI, VirtDead); + // Don't attempt coalescing when we have funny stuff going on. + CopyDstReg = 0; + // Pretend we have early clobbers so the use operands get marked below. + // This is not necessary for the common case of a single tied use. + hasEarlyClobbers = true; + } + + // Second scan. + // Allocate virtreg uses. + for (unsigned I = 0; I != VirtOpEnd; ++I) { + MachineOperand &MO = MI.getOperand(I); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; + if (MO.isUse()) { + LiveReg &LR = reloadVirtReg(MI, I, Reg, CopyDstReg); + MCPhysReg PhysReg = LR.PhysReg; + CopySrcReg = (CopySrcReg == Reg || CopySrcReg == PhysReg) ? PhysReg : 0; + if (setPhysReg(MI, MO, PhysReg)) + killVirtReg(LR); + } + } + + // Track registers defined by instruction - early clobbers and tied uses at + // this point. + UsedInInstr.clear(); + if (hasEarlyClobbers) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + // Look for physreg defs and tied uses. + if (!MO.isDef() && !MO.isTied()) continue; + markRegUsedInInstr(Reg); + } + } + + unsigned DefOpEnd = MI.getNumOperands(); + if (MI.isCall()) { + // Spill all virtregs before a call. This serves one purpose: If an + // exception is thrown, the landing pad is going to expect to find + // registers in their spill slots. + // Note: although this is appealing to just consider all definitions + // as call-clobbered, this is not correct because some of those + // definitions may be used later on and we do not want to reuse + // those for virtual registers in between. + LLVM_DEBUG(dbgs() << " Spilling remaining registers before call.\n"); + spillAll(MI); + } + + // Third scan. + // Allocate defs and collect dead defs. + for (unsigned I = 0; I != DefOpEnd; ++I) { + const MachineOperand &MO = MI.getOperand(I); + if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber()) + continue; + unsigned Reg = MO.getReg(); + + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (!MRI->isAllocatable(Reg)) continue; + definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved); + continue; + } + MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, CopySrcReg); + if (setPhysReg(MI, MI.getOperand(I), PhysReg)) { + VirtDead.push_back(Reg); + CopyDstReg = 0; // cancel coalescing; + } else + CopyDstReg = (CopyDstReg == Reg || CopyDstReg == PhysReg) ? PhysReg : 0; + } + + // Kill dead defs after the scan to ensure that multiple defs of the same + // register are allocated identically. We didn't need to do this for uses + // because we are crerating our own kill flags, and they are always at the + // last use. + for (unsigned VirtReg : VirtDead) + killVirtReg(VirtReg); + VirtDead.clear(); + + LLVM_DEBUG(dbgs() << "<< " << MI); + if (CopyDstReg && CopyDstReg == CopySrcReg && CopyDstSub == CopySrcSub) { + LLVM_DEBUG(dbgs() << "Mark identity copy for removal\n"); + Coalesced.push_back(&MI); + } +} + +void RegAllocFast::handleDebugValue(MachineInstr &MI) { + MachineOperand &MO = MI.getOperand(0); + + // Ignore DBG_VALUEs that aren't based on virtual registers. These are + // mostly constants and frame indices. + if (!MO.isReg()) + return; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return; + + // See if this virtual register has already been allocated to a physical + // register or spilled to a stack slot. + LiveRegMap::iterator LRI = findLiveVirtReg(Reg); + if (LRI != LiveVirtRegs.end() && LRI->PhysReg) { + setPhysReg(MI, MO, LRI->PhysReg); + } else { + int SS = StackSlotForVirtReg[Reg]; + if (SS != -1) { + // Modify DBG_VALUE now that the value is in a spill slot. + updateDbgValueForSpill(MI, SS); + LLVM_DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << MI); + return; + } + + // We can't allocate a physreg for a DebugValue, sorry! + LLVM_DEBUG(dbgs() << "Unable to allocate vreg used by DBG_VALUE"); + MO.setReg(0); + } + + // If Reg hasn't been spilled, put this DBG_VALUE in LiveDbgValueMap so + // that future spills of Reg will have DBG_VALUEs. + LiveDbgValueMap[Reg].push_back(&MI); +} + void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) { this->MBB = &MBB; LLVM_DEBUG(dbgs() << "\nAllocating " << MBB); @@ -879,205 +1072,19 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) { // Otherwise, sequentially allocate each instruction in the MBB. for (MachineInstr &MI : MBB) { - const MCInstrDesc &MCID = MI.getDesc(); - LLVM_DEBUG(dbgs() << "\n>> " << MI << "Regs:"; dumpState()); + LLVM_DEBUG( + dbgs() << "\n>> " << MI << "Regs:"; + dumpState() + ); - // Debug values are not allowed to change codegen in any way. + // Special handling for debug values. Note that they are not allowed to + // affect codegen of the other instructions in any way. if (MI.isDebugValue()) { - MachineInstr *DebugMI = &MI; - MachineOperand &MO = DebugMI->getOperand(0); - - // Ignore DBG_VALUEs that aren't based on virtual registers. These are - // mostly constants and frame indices. - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - - // See if this virtual register has already been allocated to a physical - // register or spilled to a stack slot. - LiveRegMap::iterator LRI = findLiveVirtReg(Reg); - if (LRI != LiveVirtRegs.end() && LRI->PhysReg) - setPhysReg(*DebugMI, 0, LRI->PhysReg); - else { - int SS = StackSlotForVirtReg[Reg]; - if (SS != -1) { - // Modify DBG_VALUE now that the value is in a spill slot. - updateDbgValueForSpill(*DebugMI, SS); - LLVM_DEBUG(dbgs() << "Modifying debug info due to spill:" - << "\t" << *DebugMI); - continue; - } - - // We can't allocate a physreg for a DebugValue, sorry! - LLVM_DEBUG(dbgs() << "Unable to allocate vreg used by DBG_VALUE"); - MO.setReg(0); - } - - // If Reg hasn't been spilled, put this DBG_VALUE in LiveDbgValueMap so - // that future spills of Reg will have DBG_VALUEs. - LiveDbgValueMap[Reg].push_back(DebugMI); - continue; - } - - if (MI.isDebugLabel()) + handleDebugValue(MI); continue; - - // If this is a copy, we may be able to coalesce. - unsigned CopySrcReg = 0; - unsigned CopyDstReg = 0; - unsigned CopySrcSub = 0; - unsigned CopyDstSub = 0; - if (MI.isCopy()) { - CopyDstReg = MI.getOperand(0).getReg(); - CopySrcReg = MI.getOperand(1).getReg(); - CopyDstSub = MI.getOperand(0).getSubReg(); - CopySrcSub = MI.getOperand(1).getSubReg(); - } - - // Track registers used by instruction. - UsedInInstr.clear(); - - // First scan. - // Mark physreg uses and early clobbers as used. - // Find the end of the virtreg operands - unsigned VirtOpEnd = 0; - bool hasTiedOps = false; - bool hasEarlyClobbers = false; - bool hasPartialRedefs = false; - bool hasPhysDefs = false; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - // Make sure MRI knows about registers clobbered by regmasks. - if (MO.isRegMask()) { - MRI->addPhysRegsUsedFromRegMask(MO.getRegMask()); - continue; - } - if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!Reg) continue; - if (TargetRegisterInfo::isVirtualRegister(Reg)) { - VirtOpEnd = i+1; - if (MO.isUse()) { - hasTiedOps = hasTiedOps || - MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1; - } else { - if (MO.isEarlyClobber()) - hasEarlyClobbers = true; - if (MO.getSubReg() && MI.readsVirtualRegister(Reg)) - hasPartialRedefs = true; - } - continue; - } - if (!MRI->isAllocatable(Reg)) continue; - if (MO.isUse()) { - usePhysReg(MO); - } else if (MO.isEarlyClobber()) { - definePhysReg(MI, Reg, - (MO.isImplicit() || MO.isDead()) ? regFree : regReserved); - hasEarlyClobbers = true; - } else - hasPhysDefs = true; } - // The instruction may have virtual register operands that must be allocated - // the same register at use-time and def-time: early clobbers and tied - // operands. If there are also physical defs, these registers must avoid - // both physical defs and uses, making them more constrained than normal - // operands. - // Similarly, if there are multiple defs and tied operands, we must make - // sure the same register is allocated to uses and defs. - // We didn't detect inline asm tied operands above, so just make this extra - // pass for all inline asm. - if (MI.isInlineAsm() || hasEarlyClobbers || hasPartialRedefs || - (hasTiedOps && (hasPhysDefs || MCID.getNumDefs() > 1))) { - handleThroughOperands(MI, VirtDead); - // Don't attempt coalescing when we have funny stuff going on. - CopyDstReg = 0; - // Pretend we have early clobbers so the use operands get marked below. - // This is not necessary for the common case of a single tied use. - hasEarlyClobbers = true; - } - - // Second scan. - // Allocate virtreg uses. - for (unsigned I = 0; I != VirtOpEnd; ++I) { - const MachineOperand &MO = MI.getOperand(I); - if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; - if (MO.isUse()) { - LiveReg &LR = reloadVirtReg(MI, I, Reg, CopyDstReg); - MCPhysReg PhysReg = LR.PhysReg; - CopySrcReg = (CopySrcReg == Reg || CopySrcReg == PhysReg) ? PhysReg : 0; - if (setPhysReg(MI, I, PhysReg)) - killVirtReg(LR); - } - } - - // Track registers defined by instruction - early clobbers and tied uses at - // this point. - UsedInInstr.clear(); - if (hasEarlyClobbers) { - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; - // Look for physreg defs and tied uses. - if (!MO.isDef() && !MO.isTied()) continue; - markRegUsedInInstr(Reg); - } - } - - unsigned DefOpEnd = MI.getNumOperands(); - if (MI.isCall()) { - // Spill all virtregs before a call. This serves one purpose: If an - // exception is thrown, the landing pad is going to expect to find - // registers in their spill slots. - // Note: although this is appealing to just consider all definitions - // as call-clobbered, this is not correct because some of those - // definitions may be used later on and we do not want to reuse - // those for virtual registers in between. - LLVM_DEBUG(dbgs() << " Spilling remaining registers before call.\n"); - spillAll(MI); - } - - // Third scan. - // Allocate defs and collect dead defs. - for (unsigned I = 0; I != DefOpEnd; ++I) { - const MachineOperand &MO = MI.getOperand(I); - if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber()) - continue; - unsigned Reg = MO.getReg(); - - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - if (!MRI->isAllocatable(Reg)) continue; - definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved); - continue; - } - MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, CopySrcReg); - if (setPhysReg(MI, I, PhysReg)) { - VirtDead.push_back(Reg); - CopyDstReg = 0; // cancel coalescing; - } else - CopyDstReg = (CopyDstReg == Reg || CopyDstReg == PhysReg) ? PhysReg : 0; - } - - // Kill dead defs after the scan to ensure that multiple defs of the same - // register are allocated identically. We didn't need to do this for uses - // because we are crerating our own kill flags, and they are always at the - // last use. - for (unsigned VirtReg : VirtDead) - killVirtReg(VirtReg); - VirtDead.clear(); - - if (CopyDstReg && CopyDstReg == CopySrcReg && CopyDstSub == CopySrcSub) { - LLVM_DEBUG(dbgs() << "-- coalescing: " << MI); - Coalesced.push_back(&MI); - } else { - LLVM_DEBUG(dbgs() << "<< " << MI); - } + allocateInstruction(MI); } // Spill all physical registers holding virtual registers now. diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 64da8485389f2bb896e02f20e3e08d757a60eb61..f244028534f23b2937264219d7e9ed5d244acced 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -1183,7 +1183,10 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf, BC.Number = BI.MBB->getNumber(); Intf.moveToBlock(BC.Number); BC.Entry = BI.LiveIn ? SpillPlacement::PrefReg : SpillPlacement::DontCare; - BC.Exit = BI.LiveOut ? SpillPlacement::PrefReg : SpillPlacement::DontCare; + BC.Exit = (BI.LiveOut && + !LIS->getInstructionFromIndex(BI.LastInstr)->isImplicitDef()) + ? SpillPlacement::PrefReg + : SpillPlacement::DontCare; BC.ChangesValue = BI.FirstDef.isValid(); if (!Intf.hasInterference()) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index fc0e8efebdcbdc84f8c739500ee0255eb7553794..7a22caf9c8b1e1ec3e4b67bcc7e52eb5bc798da4 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/IntervalMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" @@ -263,8 +264,9 @@ namespace { /// \param EltNo index of the vector element to load. /// \param OriginalLoad load that EVE came from to be replaced. /// \returns EVE on success SDValue() on failure. - SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad( - SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad); + SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, + SDValue EltNo, + LoadSDNode *OriginalLoad); void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace); SDValue SExtPromoteOperand(SDValue Op, EVT PVT); @@ -324,6 +326,7 @@ namespace { SDValue visitSHL(SDNode *N); SDValue visitSRA(SDNode *N); SDValue visitSRL(SDNode *N); + SDValue visitFunnelShift(SDNode *N); SDValue visitRotate(SDNode *N); SDValue visitABS(SDNode *N); SDValue visitBSWAP(SDNode *N); @@ -410,11 +413,14 @@ namespace { SDValue foldVSelectOfConstants(SDNode *N); SDValue foldBinOpIntoSelect(SDNode *BO); bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS); - SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N); + SDValue hoistLogicOpWithSameOpcodeHands(SDNode *N); SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2); SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC, bool NotExtCompare = false); + SDValue convertSelectOfFPConstantsToLoadOffset( + const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, + ISD::CondCode CC); SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC); SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, @@ -490,6 +496,10 @@ namespace { /// returns false. bool findBetterNeighborChains(StoreSDNode *St); + // Helper for findBetterNeighborChains. Walk up store chain add additional + // chained stores that do not overlap and can be parallelized. + bool parallelizeChainedStores(StoreSDNode *St); + /// Holds a pointer to an LSBaseSDNode as well as information on where it /// is located in a sequence of memory operations connected by a chain. struct MemOpLink { @@ -523,7 +533,7 @@ namespace { EVT &MemVT, unsigned ShAmt = 0); /// Used by BackwardsPropagateMask to find suitable loads. - bool SearchForAndLoads(SDNode *N, SmallPtrSetImpl &Loads, + bool SearchForAndLoads(SDNode *N, SmallVectorImpl &Loads, SmallPtrSetImpl &NodesWithConsts, ConstantSDNode *Mask, SDNode *&NodeToMask); /// Attempt to propagate a given AND node back to load leaves so that they @@ -903,39 +913,6 @@ static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) { return true; } -// Determines if it is a constant null integer or a splatted vector of a -// constant null integer (with no undefs). -// Build vector implicit truncation is not an issue for null values. -static bool isNullConstantOrNullSplatConstant(SDValue N) { - // TODO: may want to use peekThroughBitcast() here. - if (ConstantSDNode *Splat = isConstOrConstSplat(N)) - return Splat->isNullValue(); - return false; -} - -// Determines if it is a constant integer of one or a splatted vector of a -// constant integer of one (with no undefs). -// Do not permit build vector implicit truncation. -static bool isOneConstantOrOneSplatConstant(SDValue N) { - // TODO: may want to use peekThroughBitcast() here. - unsigned BitWidth = N.getScalarValueSizeInBits(); - if (ConstantSDNode *Splat = isConstOrConstSplat(N)) - return Splat->isOne() && Splat->getAPIntValue().getBitWidth() == BitWidth; - return false; -} - -// Determines if it is a constant integer of all ones or a splatted vector of a -// constant integer of all ones (with no undefs). -// Do not permit build vector implicit truncation. -static bool isAllOnesConstantOrAllOnesSplatConstant(SDValue N) { - N = peekThroughBitcasts(N); - unsigned BitWidth = N.getScalarValueSizeInBits(); - if (ConstantSDNode *Splat = isConstOrConstSplat(N)) - return Splat->isAllOnesValue() && - Splat->getAPIntValue().getBitWidth() == BitWidth; - return false; -} - // Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with // undef's. static bool isAnyConstantBuildVector(const SDNode *N) { @@ -1538,6 +1515,8 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::SRL: return visitSRL(N); case ISD::ROTR: case ISD::ROTL: return visitRotate(N); + case ISD::FSHL: + case ISD::FSHR: return visitFunnelShift(N); case ISD::ABS: return visitABS(N); case ISD::BSWAP: return visitBSWAP(N); case ISD::BITREVERSE: return visitBITREVERSE(N); @@ -1913,11 +1892,10 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { // and (select Cond, 0, -1), X --> select Cond, 0, X // or X, (select Cond, -1, 0) --> select Cond, -1, X auto BinOpcode = BO->getOpcode(); - bool CanFoldNonConst = (BinOpcode == ISD::AND || BinOpcode == ISD::OR) && - (isNullConstantOrNullSplatConstant(CT) || - isAllOnesConstantOrAllOnesSplatConstant(CT)) && - (isNullConstantOrNullSplatConstant(CF) || - isAllOnesConstantOrAllOnesSplatConstant(CF)); + bool CanFoldNonConst = + (BinOpcode == ISD::AND || BinOpcode == ISD::OR) && + (isNullOrNullSplat(CT) || isAllOnesOrAllOnesSplat(CT)) && + (isNullOrNullSplat(CF) || isAllOnesOrAllOnesSplat(CF)); SDValue CBO = BO->getOperand(SelOpNo ^ 1); if (!CanFoldNonConst && @@ -2086,7 +2064,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) { // add (zext i1 X), -1 -> sext (not i1 X) // because most (?) targets generate better code for the zext form. if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && - isOneConstantOrOneSplatConstant(N1)) { + isOneOrOneSplat(N1)) { SDValue X = N0.getOperand(0); if ((!LegalOperations || (TLI.isOperationLegal(ISD::XOR, X.getValueType()) && @@ -2115,13 +2093,11 @@ SDValue DAGCombiner::visitADD(SDNode *N) { return RADD; // fold ((0-A) + B) -> B-A - if (N0.getOpcode() == ISD::SUB && - isNullConstantOrNullSplatConstant(N0.getOperand(0))) + if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0))) return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1)); // fold (A + (0-B)) -> A-B - if (N1.getOpcode() == ISD::SUB && - isNullConstantOrNullSplatConstant(N1.getOperand(0))) + if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0))) return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1)); // fold (A+(B-A)) -> B @@ -2179,7 +2155,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) { return DAG.getNode(ISD::OR, DL, VT, N0, N1); // fold (add (xor a, -1), 1) -> (sub 0, a) - if (isBitwiseNot(N0) && isOneConstantOrOneSplatConstant(N1)) + if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0.getOperand(0)); @@ -2236,7 +2212,7 @@ SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB && - isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) + isNullOrNullSplat(N1.getOperand(0).getOperand(0))) return DAG.getNode(ISD::SUB, DL, VT, N0, DAG.getNode(ISD::SHL, DL, VT, N1.getOperand(0).getOperand(1), @@ -2249,8 +2225,7 @@ SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) // (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x)) // and similar xforms where the inner op is either ~0 or 0. - if (NumSignBits == DestBits && - isOneConstantOrOneSplatConstant(N1->getOperand(1))) + if (NumSignBits == DestBits && isOneOrOneSplat(N1->getOperand(1))) return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0); } @@ -2381,7 +2356,7 @@ SDValue DAGCombiner::visitUADDO(SDNode *N) { DAG.getConstant(0, DL, CarryVT)); // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry. - if (isBitwiseNot(N0) && isOneConstantOrOneSplatConstant(N1)) { + if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) { SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(), DAG.getConstant(0, DL, VT), N0.getOperand(0)); @@ -2586,7 +2561,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { DAG.getConstant(-N1C->getAPIntValue(), DL, VT)); } - if (isNullConstantOrNullSplatConstant(N0)) { + if (isNullOrNullSplat(N0)) { unsigned BitWidth = VT.getScalarSizeInBits(); // Right-shifting everything out but the sign bit followed by negation is // the same as flipping arithmetic/logical shift type without the negation: @@ -2617,12 +2592,11 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { } // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) - if (isAllOnesConstantOrAllOnesSplatConstant(N0)) + if (isAllOnesOrAllOnesSplat(N0)) return DAG.getNode(ISD::XOR, DL, VT, N1, N0); // fold (A - (0-B)) -> A+B - if (N1.getOpcode() == ISD::SUB && - isNullConstantOrNullSplatConstant(N1.getOperand(0))) + if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0))) return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1)); // fold A-(A-B) -> B @@ -2676,14 +2650,14 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { // fold (X - (-Y * Z)) -> (X + (Y * Z)) if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) { if (N1.getOperand(0).getOpcode() == ISD::SUB && - isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) { + isNullOrNullSplat(N1.getOperand(0).getOperand(0))) { SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, N1.getOperand(0).getOperand(1), N1.getOperand(1)); return DAG.getNode(ISD::ADD, DL, VT, N0, Mul); } if (N1.getOperand(1).getOpcode() == ISD::SUB && - isNullConstantOrNullSplatConstant(N1.getOperand(1).getOperand(0))) { + isNullOrNullSplat(N1.getOperand(1).getOperand(0))) { SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, N1.getOperand(0), N1.getOperand(1).getOperand(1)); @@ -3560,18 +3534,16 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp) { // If the high half is not needed, just compute the low half. bool HiExists = N->hasAnyUseOfValue(1); - if (!HiExists && - (!LegalOperations || - TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) { + if (!HiExists && (!LegalOperations || + TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) { SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops()); return CombineTo(N, Res, Res); } // If the low half is not needed, just compute the high half. bool LoExists = N->hasAnyUseOfValue(0); - if (!LoExists && - (!LegalOperations || - TLI.isOperationLegal(HiOp, N->getValueType(1)))) { + if (!LoExists && (!LegalOperations || + TLI.isOperationLegalOrCustom(HiOp, N->getValueType(1)))) { SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops()); return CombineTo(N, Res, Res); } @@ -3587,7 +3559,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, SDValue LoOpt = combine(Lo.getNode()); if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() && (!LegalOperations || - TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType()))) + TLI.isOperationLegalOrCustom(LoOpt.getOpcode(), LoOpt.getValueType()))) return CombineTo(N, LoOpt, LoOpt); } @@ -3597,7 +3569,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, SDValue HiOpt = combine(Hi.getNode()); if (HiOpt.getNode() && HiOpt != Hi && (!LegalOperations || - TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType()))) + TLI.isOperationLegalOrCustom(HiOpt.getOpcode(), HiOpt.getValueType()))) return CombineTo(N, HiOpt, HiOpt); } @@ -3729,59 +3701,95 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { return SDValue(); } -/// If this is a binary operator with two operands of the same opcode, try to -/// simplify it. -SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { +/// If this is a bitwise logic instruction and both operands have the same +/// opcode, try to sink the other opcode after the logic instruction. +SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) { SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); EVT VT = N0.getValueType(); - assert(N0.getOpcode() == N1.getOpcode() && "Bad input!"); + unsigned LogicOpcode = N->getOpcode(); + unsigned HandOpcode = N0.getOpcode(); + assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR || + LogicOpcode == ISD::XOR) && "Expected logic opcode"); + assert(HandOpcode == N1.getOpcode() && "Bad input!"); // Bail early if none of these transforms apply. - if (N0.getNumOperands() == 0) return SDValue(); - - // For each of OP in AND/OR/XOR: - // fold (OP (zext x), (zext y)) -> (zext (OP x, y)) - // fold (OP (sext x), (sext y)) -> (sext (OP x, y)) - // fold (OP (aext x), (aext y)) -> (aext (OP x, y)) - // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y)) - // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free) - // - // do not sink logical op inside of a vector extend, since it may combine - // into a vsetcc. - EVT Op0VT = N0.getOperand(0).getValueType(); - if ((N0.getOpcode() == ISD::ZERO_EXTEND || - N0.getOpcode() == ISD::SIGN_EXTEND || - N0.getOpcode() == ISD::BSWAP || - // Avoid infinite looping with PromoteIntBinOp. - (N0.getOpcode() == ISD::ANY_EXTEND && - (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) || - (N0.getOpcode() == ISD::TRUNCATE && - (!TLI.isZExtFree(VT, Op0VT) || - !TLI.isTruncateFree(Op0VT, VT)) && - TLI.isTypeLegal(Op0VT))) && - !VT.isVector() && - Op0VT == N1.getOperand(0).getValueType() && - (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) { - SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0), - N0.getOperand(0).getValueType(), - N0.getOperand(0), N1.getOperand(0)); - AddToWorklist(ORNode.getNode()); - return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode); - } - - // For each of OP in SHL/SRL/SRA/AND... - // fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z) - // fold (or (OP x, z), (OP y, z)) -> (OP (or x, y), z) - // fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z) - if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL || - N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) && + if (N0.getNumOperands() == 0) + return SDValue(); + + // FIXME: We should check number of uses of the operands to not increase + // the instruction count for all transforms. + + // Handle size-changing casts. + SDValue X = N0.getOperand(0); + SDValue Y = N1.getOperand(0); + EVT XVT = X.getValueType(); + SDLoc DL(N); + if (HandOpcode == ISD::ANY_EXTEND || HandOpcode == ISD::ZERO_EXTEND || + HandOpcode == ISD::SIGN_EXTEND) { + // If both operands have other uses, this transform would create extra + // instructions without eliminating anything. + if (!N0.hasOneUse() && !N1.hasOneUse()) + return SDValue(); + // We need matching integer source types. + // Do not hoist logic op inside of a vector extend, since it may combine + // into a vsetcc. + // TODO: Should the vector check apply to truncate though? + if (VT.isVector() || XVT != Y.getValueType()) + return SDValue(); + // Don't create an illegal op during or after legalization. + if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT)) + return SDValue(); + // Avoid infinite looping with PromoteIntBinOp. + // TODO: Should we apply desirable/legal constraints to all opcodes? + if (HandOpcode == ISD::ANY_EXTEND && LegalTypes && + !TLI.isTypeDesirableForOp(LogicOpcode, XVT)) + return SDValue(); + // logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y) + SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y); + return DAG.getNode(HandOpcode, DL, VT, Logic); + } + + // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y) + if (HandOpcode == ISD::TRUNCATE) { + // If both operands have other uses, this transform would create extra + // instructions without eliminating anything. + if (!N0.hasOneUse() && !N1.hasOneUse()) + return SDValue(); + // We need matching source types. + if (XVT != Y.getValueType()) + return SDValue(); + // Don't create an illegal op during or after legalization. + if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT)) + return SDValue(); + // Be extra careful sinking truncate. If it's free, there's no benefit in + // widening a binop. Also, don't create a logic op on an illegal type. + if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT)) + return SDValue(); + if (!TLI.isTypeLegal(XVT)) + return SDValue(); + SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y); + return DAG.getNode(HandOpcode, DL, VT, Logic); + } + + // For binops SHL/SRL/SRA/AND: + // logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z + if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL || + HandOpcode == ISD::SRA || HandOpcode == ISD::AND) && N0.getOperand(1) == N1.getOperand(1)) { - SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0), - N0.getOperand(0).getValueType(), - N0.getOperand(0), N1.getOperand(0)); - AddToWorklist(ORNode.getNode()); - return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, - ORNode, N0.getOperand(1)); + // If either operand has other uses, this transform is not an improvement. + if (!N0.hasOneUse() || !N1.hasOneUse()) + return SDValue(); + SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y); + return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1)); + } + + // Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y) + if (HandOpcode == ISD::BSWAP) { + // If either operand has other uses, this transform is not an improvement. + if (!N0.hasOneUse() || !N1.hasOneUse()) + return SDValue(); + SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y); + return DAG.getNode(HandOpcode, DL, VT, Logic); } // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B)) @@ -3791,21 +3799,12 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { // we don't want to undo this promotion. // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper // on scalars. - if ((N0.getOpcode() == ISD::BITCAST || - N0.getOpcode() == ISD::SCALAR_TO_VECTOR) && + if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) && Level <= AfterLegalizeTypes) { - SDValue In0 = N0.getOperand(0); - SDValue In1 = N1.getOperand(0); - EVT In0Ty = In0.getValueType(); - EVT In1Ty = In1.getValueType(); - SDLoc DL(N); - // If both incoming values are integers, and the original types are the - // same. - if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) { - SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1); - SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op); - AddToWorklist(Op.getNode()); - return BC; + // Input types must be integer and the same. + if (XVT.isInteger() && XVT == Y.getValueType()) { + SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y); + return DAG.getNode(HandOpcode, DL, VT, Logic); } } @@ -3821,55 +3820,44 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { // If both shuffles use the same mask, and both shuffles have the same first // or second operand, then it might still be profitable to move the shuffle // after the xor/and/or operation. - if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) { - ShuffleVectorSDNode *SVN0 = cast(N0); - ShuffleVectorSDNode *SVN1 = cast(N1); - - assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() && + if (HandOpcode == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) { + auto *SVN0 = cast(N0); + auto *SVN1 = cast(N1); + assert(X.getValueType() == Y.getValueType() && "Inputs to shuffles are not the same type"); // Check that both shuffles use the same mask. The masks are known to be of // the same length because the result vector type is the same. // Check also that shuffles have only one use to avoid introducing extra // instructions. - if (SVN0->hasOneUse() && SVN1->hasOneUse() && - SVN0->getMask().equals(SVN1->getMask())) { - SDValue ShOp = N0->getOperand(1); - - // Don't try to fold this node if it requires introducing a - // build vector of all zeros that might be illegal at this stage. - if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) { - ShOp = tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations); - } + if (!SVN0->hasOneUse() || !SVN1->hasOneUse() || + !SVN0->getMask().equals(SVN1->getMask())) + return SDValue(); - // (AND (shuf (A, C), shuf (B, C))) -> shuf (AND (A, B), C) - // (OR (shuf (A, C), shuf (B, C))) -> shuf (OR (A, B), C) - // (XOR (shuf (A, C), shuf (B, C))) -> shuf (XOR (A, B), V_0) - if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) { - SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, - N0->getOperand(0), N1->getOperand(0)); - AddToWorklist(NewNode.getNode()); - return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp, - SVN0->getMask()); - } + // Don't try to fold this node if it requires introducing a + // build vector of all zeros that might be illegal at this stage. + SDValue ShOp = N0.getOperand(1); + if (LogicOpcode == ISD::XOR && !ShOp.isUndef()) + ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); - // Don't try to fold this node if it requires introducing a - // build vector of all zeros that might be illegal at this stage. - ShOp = N0->getOperand(0); - if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) { - ShOp = tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations); - } + // (logic_op (shuf (A, C), shuf (B, C))) --> shuf (logic_op (A, B), C) + if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) { + SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, + N0.getOperand(0), N1.getOperand(0)); + return DAG.getVectorShuffle(VT, DL, Logic, ShOp, SVN0->getMask()); + } - // (AND (shuf (C, A), shuf (C, B))) -> shuf (C, AND (A, B)) - // (OR (shuf (C, A), shuf (C, B))) -> shuf (C, OR (A, B)) - // (XOR (shuf (C, A), shuf (C, B))) -> shuf (V_0, XOR (A, B)) - if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) { - SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, - N0->getOperand(1), N1->getOperand(1)); - AddToWorklist(NewNode.getNode()); - return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode, - SVN0->getMask()); - } + // Don't try to fold this node if it requires introducing a + // build vector of all zeros that might be illegal at this stage. + ShOp = N0.getOperand(0); + if (LogicOpcode == ISD::XOR && !ShOp.isUndef()) + ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); + + // (logic_op (shuf (C, A), shuf (C, B))) --> shuf (C, logic_op (A, B)) + if (N0.getOperand(0) == N1.getOperand(0) && ShOp.getNode()) { + SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, N0.getOperand(1), + N1.getOperand(1)); + return DAG.getVectorShuffle(VT, DL, ShOp, Logic, SVN0->getMask()); } } @@ -3905,8 +3893,8 @@ SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, ISD::CondCode CC1 = cast(N1CC)->get(); bool IsInteger = OpVT.isInteger(); if (LR == RR && CC0 == CC1 && IsInteger) { - bool IsZero = isNullConstantOrNullSplatConstant(LR); - bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR); + bool IsZero = isNullOrNullSplat(LR); + bool IsNeg1 = isAllOnesOrAllOnesSplat(LR); // All bits clear? bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero; @@ -4208,7 +4196,7 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST, } bool DAGCombiner::SearchForAndLoads(SDNode *N, - SmallPtrSetImpl &Loads, + SmallVectorImpl &Loads, SmallPtrSetImpl &NodesWithConsts, ConstantSDNode *Mask, SDNode *&NodeToMask) { @@ -4245,7 +4233,7 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N, // Use LE to convert equal sized loads to zext. if (ExtVT.bitsLE(Load->getMemoryVT())) - Loads.insert(Load); + Loads.push_back(Load); continue; } @@ -4310,7 +4298,7 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) { if (isa(N->getOperand(0))) return false; - SmallPtrSet Loads; + SmallVector Loads; SmallPtrSet NodesWithConsts; SDNode *FixupNode = nullptr; if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) { @@ -4644,8 +4632,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // Simplify: (and (op x...), (op y...)) -> (op (and x, y)) if (N0.getOpcode() == N1.getOpcode()) - if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) - return Tmp; + if (SDValue V = hoistLogicOpWithSameOpcodeHands(N)) + return V; // Masking the negated extension of a boolean is just the zero-extended // boolean: @@ -4655,7 +4643,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // Note: the SimplifyDemandedBits fold below can make an information-losing // transform, and then we have no way to find this better fold. if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) { - if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) { + if (isNullOrNullSplat(N0.getOperand(0))) { SDValue SubRHS = N0.getOperand(1); if (SubRHS.getOpcode() == ISD::ZERO_EXTEND && SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) @@ -5203,8 +5191,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // Simplify: (or (op x...), (op y...)) -> (op (or x, y)) if (N0.getOpcode() == N1.getOpcode()) - if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) - return Tmp; + if (SDValue V = hoistLogicOpWithSameOpcodeHands(N)) + return V; // See if this is some rotate idiom. if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) @@ -5953,7 +5941,7 @@ SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) { assert(N->getOpcode() == ISD::XOR); // Don't touch 'not' (i.e. where y = -1). - if (isAllOnesConstantOrAllOnesSplatConstant(N->getOperand(1))) + if (isAllOnesOrAllOnesSplat(N->getOperand(1))) return SDValue(); EVT VT = N->getValueType(0); @@ -5970,7 +5958,7 @@ SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) { SDValue Xor0 = Xor.getOperand(0); SDValue Xor1 = Xor.getOperand(1); // Don't touch 'not' (i.e. where y = -1). - if (isAllOnesConstantOrAllOnesSplatConstant(Xor1)) + if (isAllOnesOrAllOnesSplat(Xor1)) return false; if (Other == Xor0) std::swap(Xor0, Xor1); @@ -6036,8 +6024,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { } // fold (xor undef, undef) -> 0. This is a common idiom (misuse). + SDLoc DL(N); if (N0.isUndef() && N1.isUndef()) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); // fold (xor x, undef) -> undef if (N0.isUndef()) return N0; @@ -6047,11 +6036,11 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); if (N0C && N1C) - return DAG.FoldConstantArithmetic(ISD::XOR, SDLoc(N), VT, N0C, N1C); + return DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, N0C, N1C); // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0); + return DAG.getNode(ISD::XOR, DL, VT, N1, N0); // fold (xor x, 0) -> x if (isNullConstant(N1)) return N0; @@ -6060,19 +6049,18 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { return NewSel; // reassociate xor - if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1, N->getFlags())) + if (SDValue RXOR = ReassociateOps(ISD::XOR, DL, N0, N1, N->getFlags())) return RXOR; // fold !(x cc y) -> (x !cc y) + unsigned N0Opcode = N0.getOpcode(); SDValue LHS, RHS, CC; if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) { - bool isInt = LHS.getValueType().isInteger(); ISD::CondCode NotCC = ISD::getSetCCInverse(cast(CC)->get(), - isInt); - + LHS.getValueType().isInteger()); if (!LegalOperations || TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) { - switch (N0.getOpcode()) { + switch (N0Opcode) { default: llvm_unreachable("Unhandled SetCC Equivalent!"); case ISD::SETCC: @@ -6085,54 +6073,74 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { } // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y))) - if (isOneConstant(N1) && N0.getOpcode() == ISD::ZERO_EXTEND && - N0.getNode()->hasOneUse() && + if (isOneConstant(N1) && N0Opcode == ISD::ZERO_EXTEND && N0.hasOneUse() && isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){ SDValue V = N0.getOperand(0); - SDLoc DL(N0); - V = DAG.getNode(ISD::XOR, DL, V.getValueType(), V, - DAG.getConstant(1, DL, V.getValueType())); + SDLoc DL0(N0); + V = DAG.getNode(ISD::XOR, DL0, V.getValueType(), V, + DAG.getConstant(1, DL0, V.getValueType())); AddToWorklist(V.getNode()); - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V); + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, V); } // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() && - (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) { + (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) { SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) { - unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND; + unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND; LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); - return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); + return DAG.getNode(NewOpcode, DL, VT, LHS, RHS); } } // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants if (isAllOnesConstant(N1) && N0.hasOneUse() && - (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) { + (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) { SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); if (isa(RHS) || isa(LHS)) { - unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND; + unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND; LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); - return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); + return DAG.getNode(NewOpcode, DL, VT, LHS, RHS); } } // fold (xor (and x, y), y) -> (and (not x), y) - if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && - N0->getOperand(1) == N1) { - SDValue X = N0->getOperand(0); + if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) { + SDValue X = N0.getOperand(0); SDValue NotX = DAG.getNOT(SDLoc(X), X, VT); AddToWorklist(NotX.getNode()); - return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1); + return DAG.getNode(ISD::AND, DL, VT, NotX, N1); + } + + if ((N0Opcode == ISD::SRL || N0Opcode == ISD::SHL) && N0.hasOneUse()) { + ConstantSDNode *XorC = isConstOrConstSplat(N1); + ConstantSDNode *ShiftC = isConstOrConstSplat(N0.getOperand(1)); + unsigned BitWidth = VT.getScalarSizeInBits(); + if (XorC && ShiftC) { + // Don't crash on an oversized shift. We can not guarantee that a bogus + // shift has been simplified to undef. + uint64_t ShiftAmt = ShiftC->getLimitedValue(); + if (ShiftAmt < BitWidth) { + APInt Ones = APInt::getAllOnesValue(BitWidth); + Ones = N0Opcode == ISD::SHL ? Ones.shl(ShiftAmt) : Ones.lshr(ShiftAmt); + if (XorC->getAPIntValue() == Ones) { + // If the xor constant is a shifted -1, do a 'not' before the shift: + // xor (X << ShiftC), XorC --> (not X) << ShiftC + // xor (X >> ShiftC), XorC --> (not X) >> ShiftC + SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT); + return DAG.getNode(N0Opcode, DL, VT, Not, N0.getOperand(1)); + } + } + } } // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X) if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { - SDValue A = N0.getOpcode() == ISD::ADD ? N0 : N1; - SDValue S = N0.getOpcode() == ISD::SRA ? N0 : N1; + SDValue A = N0Opcode == ISD::ADD ? N0 : N1; + SDValue S = N0Opcode == ISD::SRA ? N0 : N1; if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) { SDValue A0 = A.getOperand(0), A1 = A.getOperand(1); SDValue S0 = S.getOperand(0); @@ -6140,14 +6148,14 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { unsigned OpSizeInBits = VT.getScalarSizeInBits(); if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1))) if (C->getAPIntValue() == (OpSizeInBits - 1)) - return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0); + return DAG.getNode(ISD::ABS, DL, VT, S0); } } } // fold (xor x, x) -> 0 if (N0 == N1) - return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations); + return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); // fold (xor (shl 1, x), -1) -> (rotl ~1, x) // Here is a concrete example of this equivalence: @@ -6167,17 +6175,16 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { // consistent result. // - Pushing the zero left requires shifting one bits in from the right. // A rotate left of ~1 is a nice way of achieving the desired result. - if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0.getOpcode() == ISD::SHL - && isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) { - SDLoc DL(N); + if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0Opcode == ISD::SHL && + isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) { return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT), N0.getOperand(1)); } // Simplify: xor (op x...), (op y...) -> (op (xor x, y)) - if (N0.getOpcode() == N1.getOpcode()) - if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) - return Tmp; + if (N0Opcode == N1.getOpcode()) + if (SDValue V = hoistLogicOpWithSameOpcodeHands(N)) + return V; // Unfold ((x ^ y) & m) ^ y into (x & m) | (y & ~m) if profitable if (SDValue MM = unfoldMaskedMerge(N)) @@ -6193,6 +6200,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { /// Handle transforms common to the three shifts, when the shift amount is a /// constant. SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) { + // Do not turn a 'not' into a regular xor. + if (isBitwiseNot(N->getOperand(0))) + return SDValue(); + SDNode *LHS = N->getOperand(0).getNode(); if (!LHS->hasOneUse()) return SDValue(); @@ -6298,9 +6309,16 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { unsigned Bitsize = VT.getScalarSizeInBits(); // fold (rot x, 0) -> x - if (isNullConstantOrNullSplatConstant(N1)) + if (isNullOrNullSplat(N1)) return N0; + // fold (rot x, c) -> x iff (c % BitSize) == 0 + if (isPowerOf2_32(Bitsize) && Bitsize > 1) { + APInt ModuloMask(N1.getScalarValueSizeInBits(), Bitsize - 1); + if (DAG.MaskedValueIsZero(N1, ModuloMask)) + return N0; + } + // fold (rot x, c) -> (rot x, c % BitSize) if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) { if (Cst->getAPIntValue().uge(Bitsize)) { @@ -6343,6 +6361,9 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { SDValue DAGCombiner::visitSHL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + if (SDValue V = DAG.simplifyShift(N0, N1)) + return V; + EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); @@ -6377,22 +6398,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C); - // fold (shl 0, x) -> 0 - if (isNullConstantOrNullSplatConstant(N0)) - return N0; - // fold (shl x, c >= size(x)) -> undef - // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. - auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { - return Val->getAPIntValue().uge(OpSizeInBits); - }; - if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig)) - return DAG.getUNDEF(VT); - // fold (shl x, 0) -> x - if (N1C && N1C->isNullValue()) - return N0; - // fold (shl undef, x) -> 0 - if (N0.isUndef()) - return DAG.getConstant(0, SDLoc(N), VT); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -6582,6 +6587,9 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { SDValue DAGCombiner::visitSRA(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + if (SDValue V = DAG.simplifyShift(N0, N1)) + return V; + EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); @@ -6602,16 +6610,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C); - // fold (sra x, c >= size(x)) -> undef - // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. - auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { - return Val->getAPIntValue().uge(OpSizeInBits); - }; - if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig)) - return DAG.getUNDEF(VT); - // fold (sra x, 0) -> x - if (N1C && N1C->isNullValue()) - return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -6748,6 +6746,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { SDValue DAGCombiner::visitSRL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + if (SDValue V = DAG.simplifyShift(N0, N1)) + return V; + EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); @@ -6762,19 +6763,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C); - // fold (srl 0, x) -> 0 - if (isNullConstantOrNullSplatConstant(N0)) - return N0; - // fold (srl x, c >= size(x)) -> undef - // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. - auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { - return Val->getAPIntValue().uge(OpSizeInBits); - }; - if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig)) - return DAG.getUNDEF(VT); - // fold (srl x, 0) -> x - if (N1C && N1C->isNullValue()) - return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -6965,6 +6953,41 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFunnelShift(SDNode *N) { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + bool IsFSHL = N->getOpcode() == ISD::FSHL; + unsigned BitWidth = VT.getScalarSizeInBits(); + + // fold (fshl N0, N1, 0) -> N0 + // fold (fshr N0, N1, 0) -> N1 + if (isPowerOf2_32(BitWidth)) + if (DAG.MaskedValueIsZero( + N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1))) + return IsFSHL ? N0 : N1; + + // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth) + if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) { + if (Cst->getAPIntValue().uge(BitWidth)) { + uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth); + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1, + DAG.getConstant(RotAmt, SDLoc(N), N2.getValueType())); + } + } + + // fold (fshl N0, N0, N2) -> (rotl N0, N2) + // fold (fshr N0, N0, N2) -> (rotr N0, N2) + // TODO: Investigate flipping this rotate if only one is legal, if funnel shift + // is legal as well we might be better off avoiding non-constant (BW - N2). + unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR; + if (N0 == N1 && hasOperation(RotOpc, VT)) + return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2); + + return SDValue(); +} + SDValue DAGCombiner::visitABS(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -7231,15 +7254,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { EVT VT0 = N0.getValueType(); SDLoc DL(N); - // fold (select C, X, X) -> X - if (N1 == N2) - return N1; - - if (const ConstantSDNode *N0C = dyn_cast(N0)) { - // fold (select true, X, Y) -> X - // fold (select false, X, Y) -> Y - return !N0C->isNullValue() ? N1 : N2; - } + if (SDValue V = DAG.simplifySelect(N0, N1, N2)) + return V; // fold (select X, X, Y) -> (or X, Y) // fold (select X, 1, Y) -> (or C, Y) @@ -7818,9 +7834,8 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { SDValue N2 = N->getOperand(2); SDLoc DL(N); - // fold (vselect C, X, X) -> X - if (N1 == N2) - return N1; + if (SDValue V = DAG.simplifySelect(N0, N1, N2)) + return V; // Canonicalize integer abs. // vselect (setg[te] X, 0), X, -X -> @@ -7874,7 +7889,7 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { // TODO: This should be extended to handle any constant. // TODO: This could be extended to handle non-loading patterns, but that // requires thorough testing to avoid regressions. - if (isNullConstantOrNullSplatConstant(RHS)) { + if (isNullOrNullSplat(RHS)) { EVT NarrowVT = LHS.getValueType(); EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger(); EVT SetCCVT = getSetCCResultType(LHS.getValueType()); @@ -8017,9 +8032,8 @@ SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) { /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). /// Vector extends are not folded if operations are legal; this is to /// avoid introducing illegal build_vector dag nodes. -static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, - SelectionDAG &DAG, bool LegalTypes, - bool LegalOperations) { +static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, + SelectionDAG &DAG, bool LegalTypes) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -8033,16 +8047,15 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, // fold (zext c1) -> c1 // fold (aext c1) -> c1 if (isa(N0)) - return DAG.getNode(Opcode, SDLoc(N), VT, N0).getNode(); + return DAG.getNode(Opcode, SDLoc(N), VT, N0); // fold (sext (build_vector AllConstants) -> (build_vector AllConstants) // fold (zext (build_vector AllConstants) -> (build_vector AllConstants) // fold (aext (build_vector AllConstants) -> (build_vector AllConstants) EVT SVT = VT.getScalarType(); - if (!(VT.isVector() && - (!LegalTypes || (!LegalOperations && TLI.isTypeLegal(SVT))) && + if (!(VT.isVector() && (!LegalTypes || TLI.isTypeLegal(SVT)) && ISD::isBuildVectorOfConstantSDNodes(N0.getNode()))) - return nullptr; + return SDValue(); // We can fold this node into a build_vector. unsigned VTBits = SVT.getSizeInBits(); @@ -8068,7 +8081,7 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT)); } - return DAG.getBuildVector(VT, DL, Elts).getNode(); + return DAG.getBuildVector(VT, DL, Elts); } // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this: @@ -8474,9 +8487,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); - if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, - LegalOperations)) - return SDValue(Res, 0); + if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + return Res; // fold (sext (sext x)) -> (sext x) // fold (sext (aext x)) -> (sext x) @@ -8613,21 +8625,24 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // if this is the case. EVT SVT = getSetCCResultType(N00VT); - // We know that the # elements of the results is the same as the - // # elements of the compare (and the # elements of the compare result - // for that matter). Check to see that they are the same size. If so, - // we know that the element size of the sext'd result matches the - // element size of the compare operands. - if (VT.getSizeInBits() == SVT.getSizeInBits()) - return DAG.getSetCC(DL, VT, N00, N01, CC); - - // If the desired elements are smaller or larger than the source - // elements, we can use a matching integer vector type and then - // truncate/sign extend. - EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); - if (SVT == MatchingVecType) { - SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC); - return DAG.getSExtOrTrunc(VsetCC, DL, VT); + // If we already have the desired type, don't change it. + if (SVT != N0.getValueType()) { + // We know that the # elements of the results is the same as the + // # elements of the compare (and the # elements of the compare result + // for that matter). Check to see that they are the same size. If so, + // we know that the element size of the sext'd result matches the + // element size of the compare operands. + if (VT.getSizeInBits() == SVT.getSizeInBits()) + return DAG.getSetCC(DL, VT, N00, N01, CC); + + // If the desired elements are smaller or larger than the source + // elements, we can use a matching integer vector type and then + // truncate/sign extend. + EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); + if (SVT == MatchingVecType) { + SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC); + return DAG.getSExtOrTrunc(VsetCC, DL, VT); + } } } @@ -8697,9 +8712,9 @@ static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, SDValue Op1 = N->getOperand(1); assert(Op0.getValueType() == Op1.getValueType()); - if (isNullConstantOrNullSplatConstant(Op0)) + if (isNullOrNullSplat(Op0)) Op = Op1; - else if (isNullConstantOrNullSplatConstant(Op1)) + else if (isNullOrNullSplat(Op1)) Op = Op0; else return false; @@ -8713,9 +8728,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, - LegalOperations)) - return SDValue(Res, 0); + if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + return Res; // fold (zext (zext x)) -> (zext x) // fold (zext (aext x)) -> (zext x) @@ -8963,9 +8977,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, - LegalOperations)) - return SDValue(Res, 0); + if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + return Res; // fold (aext (aext x)) -> (aext x) // fold (aext (zext x)) -> (zext x) @@ -9136,6 +9149,26 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) { return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert); } + // If we have (AssertZext (truncate (AssertSext X, iX)), iY) and Y is smaller + // than X. Just move the AssertZext in front of the truncate and drop the + // AssertSExt. + if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && + N0.getOperand(0).getOpcode() == ISD::AssertSext && + Opcode == ISD::AssertZext) { + SDValue BigA = N0.getOperand(0); + EVT BigA_AssertVT = cast(BigA.getOperand(1))->getVT(); + assert(BigA_AssertVT.bitsLE(N0.getValueType()) && + "Asserting zero/sign-extended bits to a type larger than the " + "truncated destination does not provide information"); + + if (AssertVT.bitsLT(BigA_AssertVT)) { + SDLoc DL(N); + SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(), + BigA.getOperand(0), N1); + return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert); + } + } + return SDValue(); } @@ -9490,9 +9523,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) { if (N0.isUndef()) return DAG.getUNDEF(VT); - if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, - LegalOperations)) - return SDValue(Res, 0); + if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + return Res; + + if (SimplifyDemandedVectorElts(SDValue(N, 0))) + return SDValue(N, 0); return SDValue(); } @@ -9504,9 +9539,11 @@ SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) { if (N0.isUndef()) return DAG.getUNDEF(VT); - if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, - LegalOperations)) - return SDValue(Res, 0); + if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) + return Res; + + if (SimplifyDemandedVectorElts(SDValue(N, 0))) + return SDValue(N, 0); return SDValue(); } @@ -9781,6 +9818,33 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; + // Narrow a suitable binary operation with a non-opaque constant operand by + // moving it ahead of the truncate. This is limited to pre-legalization + // because targets may prefer a wider type during later combines and invert + // this transform. + switch (N0.getOpcode()) { + // TODO: Add case for ADD - that will likely require a change in logic here + // or target-specific changes to avoid regressions. + case ISD::SUB: + case ISD::MUL: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + if (!LegalOperations && N0.hasOneUse() && + (isConstantOrConstantVector(N0.getOperand(0), true) || + isConstantOrConstantVector(N0.getOperand(1), true))) { + // TODO: We already restricted this to pre-legalization, but for vectors + // we are extra cautious to not create an unsupported operation. + // Target-specific changes are likely needed to avoid regressions here. + if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) { + SDLoc DL(N); + SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0)); + SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1)); + return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR); + } + } + } + return SDValue(); } @@ -12861,6 +12925,20 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { bool STCoversLD = (Offset >= 0) && (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits()); + + auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue { + if (LD->isIndexed()) { + bool IsSub = (LD->getAddressingMode() == ISD::PRE_DEC || + LD->getAddressingMode() == ISD::POST_DEC); + unsigned Opc = IsSub ? ISD::SUB : ISD::ADD; + SDValue Idx = DAG.getNode(Opc, SDLoc(LD), LD->getOperand(1).getValueType(), + LD->getOperand(1), LD->getOperand(2)); + SDValue Ops[] = {Val, Idx, Chain}; + return CombineTo(LD, Ops, 3); + } + return CombineTo(LD, Val, Chain); + }; + if (!STCoversLD) return SDValue(); @@ -12868,7 +12946,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { if (Offset == 0 && LDType == STType && STMemType == LDMemType) { // Simple case: Direct non-truncating forwarding if (LDType.getSizeInBits() == LDMemType.getSizeInBits()) - return CombineTo(LD, ST->getValue(), Chain); + return ReplaceLd(LD, ST->getValue(), Chain); // Can we model the truncate and extension with an and mask? if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() && !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) { @@ -12878,7 +12956,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { STMemType.getSizeInBits()), SDLoc(ST), STType); auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask); - return CombineTo(LD, Val, Chain); + return ReplaceLd(LD, Val, Chain); } } @@ -12903,7 +12981,7 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { } if (!extendLoadedValueToExtension(LD, Val)) continue; - return CombineTo(LD, Val, Chain); + return ReplaceLd(LD, Val, Chain); } while (false); // On failure, cleanup dead nodes we may have created. @@ -15312,6 +15390,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { return InVec; EVT VT = InVec.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); // Remove redundant insertions: // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x @@ -15324,7 +15403,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // If this is variable insert to undef vector, it might be better to splat: // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... > if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) { - SmallVector Ops(VT.getVectorNumElements(), InVal); + SmallVector Ops(NumElts, InVal); return DAG.getBuildVector(VT, DL, Ops); } return SDValue(); @@ -15369,11 +15448,11 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { Ops.append(InVec.getNode()->op_begin(), InVec.getNode()->op_end()); } else if (InVec.isUndef()) { - unsigned NElts = VT.getVectorNumElements(); - Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); + Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType())); } else { return SDValue(); } + assert(Ops.size() == NumElts && "Unexpected vector size"); // Insert the element if (Elt < Ops.size()) { @@ -15387,8 +15466,9 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { return DAG.getBuildVector(VT, DL, Ops); } -SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( - SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) { +SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, + SDValue EltNo, + LoadSDNode *OriginalLoad) { assert(!OriginalLoad->isVolatile()); EVT ResultVT = EVE->getValueType(0); @@ -15471,43 +15551,52 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( } SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { - SDValue InVec = N->getOperand(0); - EVT VT = InVec.getValueType(); - EVT NVT = N->getValueType(0); - if (InVec.isUndef()) - return DAG.getUNDEF(NVT); + SDValue VecOp = N->getOperand(0); + SDValue Index = N->getOperand(1); + EVT ScalarVT = N->getValueType(0); + EVT VecVT = VecOp.getValueType(); + if (VecOp.isUndef()) + return DAG.getUNDEF(ScalarVT); + + // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val + // + // This only really matters if the index is non-constant since other combines + // on the constant elements already work. + SDLoc DL(N); + if (VecOp.getOpcode() == ISD::INSERT_VECTOR_ELT && + Index == VecOp.getOperand(2)) { + SDValue Elt = VecOp.getOperand(1); + return VecVT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, DL, ScalarVT) : Elt; + } // (vextract (scalar_to_vector val, 0) -> val - if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { + if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) { // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the // EXTRACT_VECTOR_ELT may widen the extracted vector. - SDValue InOp = InVec.getOperand(0); - if (InOp.getValueType() != NVT) { - assert(InOp.getValueType().isInteger() && NVT.isInteger()); - return DAG.getSExtOrTrunc(InOp, SDLoc(InVec), NVT); + SDValue InOp = VecOp.getOperand(0); + if (InOp.getValueType() != ScalarVT) { + assert(InOp.getValueType().isInteger() && ScalarVT.isInteger()); + return DAG.getSExtOrTrunc(InOp, DL, ScalarVT); } return InOp; } - SDValue EltNo = N->getOperand(1); - ConstantSDNode *ConstEltNo = dyn_cast(EltNo); - // extract_vector_elt of out-of-bounds element -> UNDEF - if (ConstEltNo && ConstEltNo->getAPIntValue().uge(VT.getVectorNumElements())) - return DAG.getUNDEF(NVT); + auto *IndexC = dyn_cast(Index); + unsigned NumElts = VecVT.getVectorNumElements(); + if (IndexC && IndexC->getAPIntValue().uge(NumElts)) + return DAG.getUNDEF(ScalarVT); // extract_vector_elt (build_vector x, y), 1 -> y - if (ConstEltNo && - InVec.getOpcode() == ISD::BUILD_VECTOR && - TLI.isTypeLegal(VT) && - (InVec.hasOneUse() || - TLI.aggressivelyPreferBuildVectorSources(VT))) { - SDValue Elt = InVec.getOperand(ConstEltNo->getZExtValue()); + if (IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR && + TLI.isTypeLegal(VecVT) && + (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) { + SDValue Elt = VecOp.getOperand(IndexC->getZExtValue()); EVT InEltVT = Elt.getValueType(); // Sometimes build_vector's scalar input types do not match result type. - if (NVT == InEltVT) + if (ScalarVT == InEltVT) return Elt; // TODO: It may be useful to truncate if free if the build_vector implicitly @@ -15517,27 +15606,27 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // TODO: These transforms should not require the 'hasOneUse' restriction, but // there are regressions on multiple targets without it. We can end up with a // mess of scalar and vector code if we reduce only part of the DAG to scalar. - if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && VT.isInteger() && - InVec.hasOneUse()) { + if (IndexC && VecOp.getOpcode() == ISD::BITCAST && VecVT.isInteger() && + VecOp.hasOneUse()) { // The vector index of the LSBs of the source depend on the endian-ness. bool IsLE = DAG.getDataLayout().isLittleEndian(); - unsigned ExtractIndex = ConstEltNo->getZExtValue(); + unsigned ExtractIndex = IndexC->getZExtValue(); // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x) - unsigned BCTruncElt = IsLE ? 0 : VT.getVectorNumElements() - 1; - SDValue BCSrc = InVec.getOperand(0); + unsigned BCTruncElt = IsLE ? 0 : NumElts - 1; + SDValue BCSrc = VecOp.getOperand(0); if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger()) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc); + return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, BCSrc); if (LegalTypes && BCSrc.getValueType().isInteger() && BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) { // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt --> // trunc i64 X to i32 SDValue X = BCSrc.getOperand(0); - assert(X.getValueType().isScalarInteger() && NVT.isScalarInteger() && + assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() && "Extract element and scalar to vector can't change element type " "from FP to integer."); unsigned XBitWidth = X.getValueSizeInBits(); - unsigned VecEltBitWidth = VT.getScalarSizeInBits(); + unsigned VecEltBitWidth = VecVT.getScalarSizeInBits(); BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1; // An extract element return value type can be wider than its vector @@ -15546,51 +15635,40 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) { assert(XBitWidth % VecEltBitWidth == 0 && "Scalar bitwidth must be a multiple of vector element bitwidth"); - return DAG.getAnyExtOrTrunc(X, SDLoc(N), NVT); + return DAG.getAnyExtOrTrunc(X, DL, ScalarVT); } } } - // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val - // - // This only really matters if the index is non-constant since other combines - // on the constant elements already work. - if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && - EltNo == InVec.getOperand(2)) { - SDValue Elt = InVec.getOperand(1); - return VT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, SDLoc(N), NVT) : Elt; - } - // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT. // We only perform this optimization before the op legalization phase because // we may introduce new vector instructions which are not backed by TD // patterns. For example on AVX, extracting elements from a wide vector // without using extract_subvector. However, if we can find an underlying // scalar value, then we can always use that. - if (ConstEltNo && InVec.getOpcode() == ISD::VECTOR_SHUFFLE) { - int NumElem = VT.getVectorNumElements(); - ShuffleVectorSDNode *SVOp = cast(InVec); + if (IndexC && VecOp.getOpcode() == ISD::VECTOR_SHUFFLE) { + auto *Shuf = cast(VecOp); // Find the new index to extract from. - int OrigElt = SVOp->getMaskElt(ConstEltNo->getZExtValue()); + int OrigElt = Shuf->getMaskElt(IndexC->getZExtValue()); // Extracting an undef index is undef. if (OrigElt == -1) - return DAG.getUNDEF(NVT); + return DAG.getUNDEF(ScalarVT); // Select the right vector half to extract from. SDValue SVInVec; - if (OrigElt < NumElem) { - SVInVec = InVec->getOperand(0); + if (OrigElt < (int)NumElts) { + SVInVec = VecOp.getOperand(0); } else { - SVInVec = InVec->getOperand(1); - OrigElt -= NumElem; + SVInVec = VecOp.getOperand(1); + OrigElt -= NumElts; } if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) { SDValue InOp = SVInVec.getOperand(OrigElt); - if (InOp.getValueType() != NVT) { - assert(InOp.getValueType().isInteger() && NVT.isInteger()); - InOp = DAG.getSExtOrTrunc(InOp, SDLoc(SVInVec), NVT); + if (InOp.getValueType() != ScalarVT) { + assert(InOp.getValueType().isInteger() && ScalarVT.isInteger()); + InOp = DAG.getSExtOrTrunc(InOp, DL, ScalarVT); } return InOp; @@ -15601,136 +15679,132 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (!LegalOperations || // FIXME: Should really be just isOperationLegalOrCustom. - TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VT) || - TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VT)) { + TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) || + TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) { EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout()); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec, - DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy)); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec, + DAG.getConstant(OrigElt, DL, IndexTy)); } } // If only EXTRACT_VECTOR_ELT nodes use the source vector we can // simplify it based on the (valid) extraction indices. - if (llvm::all_of(InVec->uses(), [&](SDNode *Use) { + if (llvm::all_of(VecOp->uses(), [&](SDNode *Use) { return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - Use->getOperand(0) == InVec && + Use->getOperand(0) == VecOp && isa(Use->getOperand(1)); })) { - APInt DemandedElts = APInt::getNullValue(VT.getVectorNumElements()); - for (SDNode *Use : InVec->uses()) { + APInt DemandedElts = APInt::getNullValue(NumElts); + for (SDNode *Use : VecOp->uses()) { auto *CstElt = cast(Use->getOperand(1)); - if (CstElt->getAPIntValue().ult(VT.getVectorNumElements())) + if (CstElt->getAPIntValue().ult(NumElts)) DemandedElts.setBit(CstElt->getZExtValue()); } - if (SimplifyDemandedVectorElts(InVec, DemandedElts, true)) + if (SimplifyDemandedVectorElts(VecOp, DemandedElts, true)) { + // We simplified the vector operand of this extract element. If this + // extract is not dead, visit it again so it is folded properly. + if (N->getOpcode() != ISD::DELETED_NODE) + AddToWorklist(N); return SDValue(N, 0); + } } - bool BCNumEltsChanged = false; - EVT ExtVT = VT.getVectorElementType(); - EVT LVT = ExtVT; - + // Everything under here is trying to match an extract of a loaded value. // If the result of load has to be truncated, then it's not necessarily // profitable. - if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT)) + bool BCNumEltsChanged = false; + EVT ExtVT = VecVT.getVectorElementType(); + EVT LVT = ExtVT; + if (ScalarVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, ScalarVT)) return SDValue(); - if (InVec.getOpcode() == ISD::BITCAST) { + if (VecOp.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) + if (!VecOp.hasOneUse()) return SDValue(); - EVT BCVT = InVec.getOperand(0).getValueType(); + EVT BCVT = VecOp.getOperand(0).getValueType(); if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) return SDValue(); - if (VT.getVectorNumElements() != BCVT.getVectorNumElements()) + if (NumElts != BCVT.getVectorNumElements()) BCNumEltsChanged = true; - InVec = InVec.getOperand(0); + VecOp = VecOp.getOperand(0); ExtVT = BCVT.getVectorElementType(); } // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size) - if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() && - ISD::isNormalLoad(InVec.getNode()) && - !N->getOperand(1)->hasPredecessor(InVec.getNode())) { + if (!LegalOperations && !IndexC && VecOp.hasOneUse() && + ISD::isNormalLoad(VecOp.getNode()) && + !N->getOperand(1)->hasPredecessor(VecOp.getNode())) { SDValue Index = N->getOperand(1); - if (LoadSDNode *OrigLoad = dyn_cast(InVec)) { - if (!OrigLoad->isVolatile()) { - return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index, - OrigLoad); - } - } + if (auto *OrigLoad = dyn_cast(VecOp)) + if (!OrigLoad->isVolatile()) + return scalarizeExtractedVectorLoad(N, VecVT, Index, OrigLoad); } // Perform only after legalization to ensure build_vector / vector_shuffle // optimizations have already been done. - if (!LegalOperations) return SDValue(); + if (!LegalOperations || !IndexC) + return SDValue(); // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size) // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size) // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr) + int Elt = IndexC->getZExtValue(); + LoadSDNode *LN0 = nullptr; + if (ISD::isNormalLoad(VecOp.getNode())) { + LN0 = cast(VecOp); + } else if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR && + VecOp.getOperand(0).getValueType() == ExtVT && + ISD::isNormalLoad(VecOp.getOperand(0).getNode())) { + // Don't duplicate a load with other uses. + if (!VecOp.hasOneUse()) + return SDValue(); - if (ConstEltNo) { - int Elt = cast(EltNo)->getZExtValue(); + LN0 = cast(VecOp.getOperand(0)); + } + if (auto *Shuf = dyn_cast(VecOp)) { + // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1) + // => + // (load $addr+1*size) - LoadSDNode *LN0 = nullptr; - const ShuffleVectorSDNode *SVN = nullptr; - if (ISD::isNormalLoad(InVec.getNode())) { - LN0 = cast(InVec); - } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR && - InVec.getOperand(0).getValueType() == ExtVT && - ISD::isNormalLoad(InVec.getOperand(0).getNode())) { - // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) - return SDValue(); + // Don't duplicate a load with other uses. + if (!VecOp.hasOneUse()) + return SDValue(); - LN0 = cast(InVec.getOperand(0)); - } else if ((SVN = dyn_cast(InVec))) { - // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1) - // => - // (load $addr+1*size) + // If the bit convert changed the number of elements, it is unsafe + // to examine the mask. + if (BCNumEltsChanged) + return SDValue(); - // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) - return SDValue(); + // Select the input vector, guarding against out of range extract vector. + int Idx = (Elt > (int)NumElts) ? -1 : Shuf->getMaskElt(Elt); + VecOp = (Idx < (int)NumElts) ? VecOp.getOperand(0) : VecOp.getOperand(1); - // If the bit convert changed the number of elements, it is unsafe - // to examine the mask. - if (BCNumEltsChanged) + if (VecOp.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. + if (!VecOp.hasOneUse()) return SDValue(); - // Select the input vector, guarding against out of range extract vector. - unsigned NumElems = VT.getVectorNumElements(); - int Idx = (Elt > (int)NumElems) ? -1 : SVN->getMaskElt(Elt); - InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1); - - if (InVec.getOpcode() == ISD::BITCAST) { - // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) - return SDValue(); - - InVec = InVec.getOperand(0); - } - if (ISD::isNormalLoad(InVec.getNode())) { - LN0 = cast(InVec); - Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems; - EltNo = DAG.getConstant(Elt, SDLoc(EltNo), EltNo.getValueType()); - } + VecOp = VecOp.getOperand(0); } + if (ISD::isNormalLoad(VecOp.getNode())) { + LN0 = cast(VecOp); + Elt = (Idx < (int)NumElts) ? Idx : Idx - (int)NumElts; + Index = DAG.getConstant(Elt, DL, Index.getValueType()); + } + } - // Make sure we found a non-volatile load and the extractelement is - // the only use. - if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile()) - return SDValue(); - - // If Idx was -1 above, Elt is going to be -1, so just return undef. - if (Elt == -1) - return DAG.getUNDEF(LVT); + // Make sure we found a non-volatile load and the extractelement is + // the only use. + if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile()) + return SDValue(); - return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0); - } + // If Idx was -1 above, Elt is going to be -1, so just return undef. + if (Elt == -1) + return DAG.getUNDEF(LVT); - return SDValue(); + return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0); } // Simplify (build_vec (ext )) to (bitcast (build_vec )) @@ -16454,11 +16528,19 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { SDValue In = N->getOperand(0); assert(In.getValueType().isVector() && "Must concat vectors"); - // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr). - if (In->getOpcode() == ISD::BITCAST && - !In->getOperand(0).getValueType().isVector()) { - SDValue Scalar = In->getOperand(0); + SDValue Scalar = peekThroughOneUseBitcasts(In); + // concat_vectors(scalar_to_vector(scalar), undef) -> + // scalar_to_vector(scalar) + if (!LegalOperations && Scalar.getOpcode() == ISD::SCALAR_TO_VECTOR && + Scalar.hasOneUse()) { + EVT SVT = Scalar.getValueType().getVectorElementType(); + if (SVT == Scalar.getOperand(0).getValueType()) + Scalar = Scalar.getOperand(0); + } + + // concat_vectors(scalar, undef) -> scalar_to_vector(scalar) + if (!Scalar.getValueType().isVector()) { // If the bitcast type isn't legal, it might be a trunc of a legal type; // look through the trunc so we can still do the transform: // concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar) @@ -16467,7 +16549,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { TLI.isTypeLegal(Scalar->getOperand(0).getValueType())) Scalar = Scalar->getOperand(0); - EVT SclTy = Scalar->getValueType(0); + EVT SclTy = Scalar.getValueType(); if (!SclTy.isFloatingPoint() && !SclTy.isInteger()) return SDValue(); @@ -16612,47 +16694,56 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { if (!ISD::isBinaryOp(BinOp.getNode())) return SDValue(); - // The binop must be a vector type, so we can chop it in half. + // The binop must be a vector type, so we can extract some fraction of it. EVT WideBVT = BinOp.getValueType(); if (!WideBVT.isVector()) return SDValue(); EVT VT = Extract->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); unsigned ExtractIndex = ExtractIndexC->getZExtValue(); - assert(ExtractIndex % NumElems == 0 && + assert(ExtractIndex % VT.getVectorNumElements() == 0 && "Extract index is not a multiple of the vector length."); - EVT SrcVT = Extract->getOperand(0).getValueType(); // Bail out if this is not a proper multiple width extraction. - unsigned NumSrcElems = SrcVT.getVectorNumElements(); - if (NumSrcElems % NumElems != 0) + unsigned WideWidth = WideBVT.getSizeInBits(); + unsigned NarrowWidth = VT.getSizeInBits(); + if (WideWidth % NarrowWidth != 0) return SDValue(); - // Bail out if the target does not support a narrower version of the binop. - unsigned NarrowingRatio = NumSrcElems / NumElems; - unsigned BOpcode = BinOp.getOpcode(); + // Bail out if we are extracting a fraction of a single operation. This can + // occur because we potentially looked through a bitcast of the binop. + unsigned NarrowingRatio = WideWidth / NarrowWidth; unsigned WideNumElts = WideBVT.getVectorNumElements(); + if (WideNumElts % NarrowingRatio != 0) + return SDValue(); + + // Bail out if the target does not support a narrower version of the binop. EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(), WideNumElts / NarrowingRatio); + unsigned BOpcode = BinOp.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT)) return SDValue(); // If extraction is cheap, we don't need to look at the binop operands // for concat ops. The narrow binop alone makes this transform profitable. - // TODO: We're not dealing with the bitcasted pattern here. That limitation - // should be lifted. - if (Extract->getOperand(0) == BinOp && BinOp.hasOneUse() && - TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtractIndex)) { + // We can't just reuse the original extract index operand because we may have + // bitcasted. + unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements(); + unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); + EVT ExtBOIdxVT = Extract->getOperand(1).getValueType(); + if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) && + BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) { // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N) SDLoc DL(Extract); + SDValue NewExtIndex = DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT); SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, - BinOp.getOperand(0), Extract->getOperand(1)); + BinOp.getOperand(0), NewExtIndex); SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, - BinOp.getOperand(1), Extract->getOperand(1)); - return DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, - BinOp.getNode()->getFlags()); + BinOp.getOperand(1), NewExtIndex); + SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, + BinOp.getNode()->getFlags()); + return DAG.getBitcast(VT, NarrowBinOp); } // Only handle the case where we are doubling and then halving. A larger ratio @@ -16681,11 +16772,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { return SDValue(); // If one of the binop operands was not the result of a concat, we must - // extract a half-sized operand for our new narrow binop. We can't just reuse - // the original extract index operand because we may have bitcasted. - unsigned ConcatOpNum = ExtractIndex / NumElems; - unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); - EVT ExtBOIdxVT = Extract->getOperand(1).getValueType(); + // extract a half-sized operand for our new narrow binop. SDLoc DL(Extract); // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN @@ -16713,17 +16800,19 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { if (DAG.getDataLayout().isBigEndian()) return SDValue(); - // TODO: The one-use check is overly conservative. Check the cost of the - // extract instead or remove that condition entirely. auto *Ld = dyn_cast(Extract->getOperand(0)); auto *ExtIdx = dyn_cast(Extract->getOperand(1)); - if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() || - !ExtIdx) + if (!Ld || Ld->getExtensionType() || Ld->isVolatile() || !ExtIdx) + return SDValue(); + + // Allow targets to opt-out. + EVT VT = Extract->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT)) return SDValue(); // The narrow load will be offset from the base address of the old load if // we are extracting from something besides index 0 (little-endian). - EVT VT = Extract->getValueType(0); SDLoc DL(Extract); SDValue BaseAddr = Ld->getOperand(1); unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize(); @@ -16928,12 +17017,15 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, SDValue N0 = SVN->getOperand(0); SDValue N1 = SVN->getOperand(1); - if (!N0->hasOneUse() || !N1->hasOneUse()) + if (!N0->hasOneUse()) return SDValue(); // If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as // discussed above. if (!N1.isUndef()) { + if (!N1->hasOneUse()) + return SDValue(); + bool N0AnyConst = isAnyConstantBuildVector(N0.getNode()); bool N1AnyConst = isAnyConstantBuildVector(N1.getNode()); if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode())) @@ -17366,7 +17458,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. - if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) + if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI)) return Res; @@ -17630,12 +17722,6 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { if (N1.isUndef()) return N0; - // For nested INSERT_SUBVECTORs, attempt to combine inner node first to allow - // us to pull BITCASTs from input to output. - if (N0.hasOneUse() && N0->getOpcode() == ISD::INSERT_SUBVECTOR) - if (SDValue NN0 = visitINSERT_SUBVECTOR(N0.getNode())) - return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, NN0, N1, N2); - // If this is an insert of an extracted vector into an undef vector, we can // just use the input to the extract. if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && @@ -17718,6 +17804,10 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); } + // Simplify source operands based on insertion. + if (SimplifyDemandedVectorElts(SDValue(N, 0))) + return SDValue(N, 0); + return SDValue(); } @@ -18157,6 +18247,63 @@ SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, return DAG.getNode(ISD::AND, DL, AType, Shift, N2); } +/// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)" +/// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0 +/// in it. This may be a win when the constant is not otherwise available +/// because it replaces two constant pool loads with one. +SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset( + const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, + ISD::CondCode CC) { + if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType().isFloatingPoint())) + return SDValue(); + + // If we are before legalize types, we want the other legalization to happen + // first (for example, to avoid messing with soft float). + auto *TV = dyn_cast(N2); + auto *FV = dyn_cast(N3); + EVT VT = N2.getValueType(); + if (!TV || !FV || !TLI.isTypeLegal(VT)) + return SDValue(); + + // If a constant can be materialized without loads, this does not make sense. + if (TLI.getOperationAction(ISD::ConstantFP, VT) == TargetLowering::Legal || + TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) || + TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) + return SDValue(); + + // If both constants have multiple uses, then we won't need to do an extra + // load. The values are likely around in registers for other users. + if (!TV->hasOneUse() && !FV->hasOneUse()) + return SDValue(); + + Constant *Elts[] = { const_cast(FV->getConstantFPValue()), + const_cast(TV->getConstantFPValue()) }; + Type *FPTy = Elts[0]->getType(); + const DataLayout &TD = DAG.getDataLayout(); + + // Create a ConstantArray of the two constants. + Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts); + SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()), + TD.getPrefTypeAlignment(FPTy)); + unsigned Alignment = cast(CPIdx)->getAlignment(); + + // Get offsets to the 0 and 1 elements of the array, so we can select between + // them. + SDValue Zero = DAG.getIntPtrConstant(0, DL); + unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType()); + SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV)); + SDValue Cond = + DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC); + AddToWorklist(Cond.getNode()); + SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero); + AddToWorklist(CstOffset.getNode()); + CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset); + AddToWorklist(CPIdx.getNode()); + return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool( + DAG.getMachineFunction()), Alignment); +} + /// Simplify an expression of the form (N0 cond N1) ? N2 : N3 /// where 'cond' is the comparison specified by CC. SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, @@ -18165,75 +18312,26 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, // (x ? y : y) -> y. if (N2 == N3) return N2; + EVT CmpOpVT = N0.getValueType(); EVT VT = N2.getValueType(); - ConstantSDNode *N1C = dyn_cast(N1.getNode()); - ConstantSDNode *N2C = dyn_cast(N2.getNode()); + auto *N1C = dyn_cast(N1.getNode()); + auto *N2C = dyn_cast(N2.getNode()); + auto *N3C = dyn_cast(N3.getNode()); - // Determine if the condition we're dealing with is constant - SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), - N0, N1, CC, DL, false); + // Determine if the condition we're dealing with is constant. + SDValue SCC = SimplifySetCC(getSetCCResultType(CmpOpVT), N0, N1, CC, DL, + false); if (SCC.getNode()) AddToWorklist(SCC.getNode()); - if (ConstantSDNode *SCCC = dyn_cast_or_null(SCC.getNode())) { + if (auto *SCCC = dyn_cast_or_null(SCC.getNode())) { // fold select_cc true, x, y -> x // fold select_cc false, x, y -> y return !SCCC->isNullValue() ? N2 : N3; } - // Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)" - // where "tmp" is a constant pool entry containing an array with 1.0 and 2.0 - // in it. This is a win when the constant is not otherwise available because - // it replaces two constant pool loads with one. We only do this if the FP - // type is known to be legal, because if it isn't, then we are before legalize - // types an we want the other legalization to happen first (e.g. to avoid - // messing with soft float) and if the ConstantFP is not legal, because if - // it is legal, we may not need to store the FP constant in a constant pool. - if (ConstantFPSDNode *TV = dyn_cast(N2)) - if (ConstantFPSDNode *FV = dyn_cast(N3)) { - if (TLI.isTypeLegal(N2.getValueType()) && - (TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) != - TargetLowering::Legal && - !TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) && - !TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) && - // If both constants have multiple uses, then we won't need to do an - // extra load, they are likely around in registers for other users. - (TV->hasOneUse() || FV->hasOneUse())) { - Constant *Elts[] = { - const_cast(FV->getConstantFPValue()), - const_cast(TV->getConstantFPValue()) - }; - Type *FPTy = Elts[0]->getType(); - const DataLayout &TD = DAG.getDataLayout(); - - // Create a ConstantArray of the two constants. - Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts); - SDValue CPIdx = - DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()), - TD.getPrefTypeAlignment(FPTy)); - unsigned Alignment = cast(CPIdx)->getAlignment(); - - // Get the offsets to the 0 and 1 element of the array so that we can - // select between them. - SDValue Zero = DAG.getIntPtrConstant(0, DL); - unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType()); - SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV)); - - SDValue Cond = DAG.getSetCC(DL, - getSetCCResultType(N0.getValueType()), - N0, N1, CC); - AddToWorklist(Cond.getNode()); - SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), - Cond, One, Zero); - AddToWorklist(CstOffset.getNode()); - CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, - CstOffset); - AddToWorklist(CPIdx.getNode()); - return DAG.getLoad( - TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - Alignment); - } - } + if (SDValue V = + convertSelectOfFPConstantsToLoadOffset(DL, N0, N1, N2, N3, CC)) + return V; if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC)) return V; @@ -18247,7 +18345,7 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND && N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) { SDValue AndLHS = N0->getOperand(0); - ConstantSDNode *ConstAndRHS = dyn_cast(N0->getOperand(1)); + auto *ConstAndRHS = dyn_cast(N0->getOperand(1)); if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) { // Shift the tested bit over the sign bit. const APInt &AndMask = ConstAndRHS->getAPIntValue(); @@ -18268,48 +18366,48 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, } // fold select C, 16, 0 -> shl C, 4 - if (N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2() && - TLI.getBooleanContents(N0.getValueType()) == - TargetLowering::ZeroOrOneBooleanContent) { + bool Fold = N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2(); + bool Swap = N3C && isNullConstant(N2) && N3C->getAPIntValue().isPowerOf2(); + + if ((Fold || Swap) && + TLI.getBooleanContents(CmpOpVT) == + TargetLowering::ZeroOrOneBooleanContent && + (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) { + + if (Swap) { + CC = ISD::getSetCCInverse(CC, CmpOpVT.isInteger()); + std::swap(N2C, N3C); + } // If the caller doesn't want us to simplify this into a zext of a compare, // don't do it. if (NotExtCompare && N2C->isOne()) return SDValue(); - // Get a SetCC of the condition - // NOTE: Don't create a SETCC if it's not legal on this target. - if (!LegalOperations || - TLI.isOperationLegal(ISD::SETCC, N0.getValueType())) { - SDValue Temp, SCC; - // cast from setcc result type to select result type - if (LegalTypes) { - SCC = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), - N0, N1, CC); - if (N2.getValueType().bitsLT(SCC.getValueType())) - Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), - N2.getValueType()); - else - Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), - N2.getValueType(), SCC); - } else { - SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC); - Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), - N2.getValueType(), SCC); - } + SDValue Temp, SCC; + // zext (setcc n0, n1) + if (LegalTypes) { + SCC = DAG.getSetCC(DL, getSetCCResultType(CmpOpVT), N0, N1, CC); + if (VT.bitsLT(SCC.getValueType())) + Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), VT); + else + Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC); + } else { + SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC); + Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC); + } - AddToWorklist(SCC.getNode()); - AddToWorklist(Temp.getNode()); + AddToWorklist(SCC.getNode()); + AddToWorklist(Temp.getNode()); - if (N2C->isOne()) - return Temp; + if (N2C->isOne()) + return Temp; - // shl setcc result by log2 n2c - return DAG.getNode( - ISD::SHL, DL, N2.getValueType(), Temp, - DAG.getConstant(N2C->getAPIntValue().logBase2(), SDLoc(Temp), - getShiftAmountTy(Temp.getValueType()))); - } + // shl setcc result by log2 n2c + return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp, + DAG.getConstant(N2C->getAPIntValue().logBase2(), + SDLoc(Temp), + getShiftAmountTy(Temp.getValueType()))); } // Check to see if this is an integer abs. @@ -18329,18 +18427,16 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1)) SubC = dyn_cast(N2.getOperand(0)); - EVT XType = N0.getValueType(); - if (SubC && SubC->isNullValue() && XType.isInteger()) { + if (SubC && SubC->isNullValue() && CmpOpVT.isInteger()) { SDLoc DL(N0); - SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, - N0, - DAG.getConstant(XType.getSizeInBits() - 1, DL, - getShiftAmountTy(N0.getValueType()))); - SDValue Add = DAG.getNode(ISD::ADD, DL, - XType, N0, Shift); + SDValue Shift = DAG.getNode(ISD::SRA, DL, CmpOpVT, N0, + DAG.getConstant(CmpOpVT.getSizeInBits() - 1, + DL, + getShiftAmountTy(CmpOpVT))); + SDValue Add = DAG.getNode(ISD::ADD, DL, CmpOpVT, N0, Shift); AddToWorklist(Shift.getNode()); AddToWorklist(Add.getNode()); - return DAG.getNode(ISD::XOR, DL, XType, Add, Shift); + return DAG.getNode(ISD::XOR, DL, CmpOpVT, Add, Shift); } } @@ -18905,6 +19001,11 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases); } +// TODO: Replace with with std::monostate when we move to C++17. +struct UnitT { } Unit; +bool operator==(const UnitT &, const UnitT &) { return true; } +bool operator!=(const UnitT &, const UnitT &) { return false; } + // This function tries to collect a bunch of potentially interesting // nodes to improve the chains of, all at once. This might seem // redundant, as this function gets called when visiting every store @@ -18917,13 +19018,22 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { // the nodes that will eventually be candidates, and then not be able // to go from a partially-merged state to the desired final // fully-merged state. -bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { - if (OptLevel == CodeGenOpt::None) - return false; + +bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) { + SmallVector ChainedStores; + StoreSDNode *STChain = St; + // Intervals records which offsets from BaseIndex have been covered. In + // the common case, every store writes to the immediately previous address + // space and thus merged with the previous interval at insertion time. + + using IMap = + llvm::IntervalMap>; + IMap::Allocator A; + IMap Intervals(A); // This holds the base pointer, index, and the offset in bytes from the base // pointer. - BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); + const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); // We must have a base and an offset. if (!BasePtr.getBase().getNode()) @@ -18933,76 +19043,114 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { if (BasePtr.getBase().isUndef()) return false; - SmallVector ChainedStores; - ChainedStores.push_back(St); + // Add ST's interval. + Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8, Unit); - // Walk up the chain and look for nodes with offsets from the same - // base pointer. Stop when reaching an instruction with a different kind - // or instruction which has a different base pointer. - StoreSDNode *Index = St; - while (Index) { + while (StoreSDNode *Chain = dyn_cast(STChain->getChain())) { // If the chain has more than one use, then we can't reorder the mem ops. - if (Index != St && !SDValue(Index, 0)->hasOneUse()) + if (!SDValue(Chain, 0)->hasOneUse()) break; - - if (Index->isVolatile() || Index->isIndexed()) + if (Chain->isVolatile() || Chain->isIndexed()) break; // Find the base pointer and offset for this memory node. - BaseIndexOffset Ptr = BaseIndexOffset::match(Index, DAG); - + const BaseIndexOffset Ptr = BaseIndexOffset::match(Chain, DAG); // Check that the base pointer is the same as the original one. - if (!BasePtr.equalBaseIndex(Ptr, DAG)) + int64_t Offset; + if (!BasePtr.equalBaseIndex(Ptr, DAG, Offset)) + break; + int64_t Length = (Chain->getMemoryVT().getSizeInBits() + 7) / 8; + // Make sure we don't overlap with other intervals by checking the ones to + // the left or right before inserting. + auto I = Intervals.find(Offset); + // If there's a next interval, we should end before it. + if (I != Intervals.end() && I.start() < (Offset + Length)) + break; + // If there's a previous interval, we should start after it. + if (I != Intervals.begin() && (--I).stop() <= Offset) break; + Intervals.insert(Offset, Offset + Length, Unit); - // Walk up the chain to find the next store node, ignoring any - // intermediate loads. Any other kind of node will halt the loop. - SDNode *NextInChain = Index->getChain().getNode(); - while (true) { - if (StoreSDNode *STn = dyn_cast(NextInChain)) { - // We found a store node. Use it for the next iteration. - if (STn->isVolatile() || STn->isIndexed()) { - Index = nullptr; - break; - } - ChainedStores.push_back(STn); - Index = STn; - break; - } else if (LoadSDNode *Ldn = dyn_cast(NextInChain)) { - NextInChain = Ldn->getChain().getNode(); - continue; - } else { - Index = nullptr; - break; - } - }// end while + ChainedStores.push_back(Chain); + STChain = Chain; } - // At this point, ChainedStores lists all of the Store nodes - // reachable by iterating up through chain nodes matching the above - // conditions. For each such store identified, try to find an - // earlier chain to attach the store to which won't violate the - // required ordering. - bool MadeChangeToSt = false; - SmallVector, 8> BetterChains; + // If we didn't find a chained store, exit. + if (ChainedStores.size() == 0) + return false; - for (StoreSDNode *ChainedStore : ChainedStores) { - SDValue Chain = ChainedStore->getChain(); - SDValue BetterChain = FindBetterChain(ChainedStore, Chain); + // Improve all chained stores (St and ChainedStores members) starting from + // where the store chain ended and return single TokenFactor. + SDValue NewChain = STChain->getChain(); + SmallVector TFOps; + for (unsigned I = ChainedStores.size(); I;) { + StoreSDNode *S = ChainedStores[--I]; + SDValue BetterChain = FindBetterChain(S, NewChain); + S = cast(DAG.UpdateNodeOperands( + S, BetterChain, S->getOperand(1), S->getOperand(2), S->getOperand(3))); + TFOps.push_back(SDValue(S, 0)); + ChainedStores[I] = S; + } + + // Improve St's chain. Use a new node to avoid creating a loop from CombineTo. + SDValue BetterChain = FindBetterChain(St, NewChain); + SDValue NewST; + if (St->isTruncatingStore()) + NewST = DAG.getTruncStore(BetterChain, SDLoc(St), St->getValue(), + St->getBasePtr(), St->getMemoryVT(), + St->getMemOperand()); + else + NewST = DAG.getStore(BetterChain, SDLoc(St), St->getValue(), + St->getBasePtr(), St->getMemOperand()); - if (Chain != BetterChain) { - if (ChainedStore == St) - MadeChangeToSt = true; - BetterChains.push_back(std::make_pair(ChainedStore, BetterChain)); - } - } + TFOps.push_back(NewST); - // Do all replacements after finding the replacements to make to avoid making - // the chains more complicated by introducing new TokenFactors. - for (auto Replacement : BetterChains) - replaceStoreChain(Replacement.first, Replacement.second); + // If we improved every element of TFOps, then we've lost the dependence on + // NewChain to successors of St and we need to add it back to TFOps. Do so at + // the beginning to keep relative order consistent with FindBetterChains. + auto hasImprovedChain = [&](SDValue ST) -> bool { + return ST->getOperand(0) != NewChain; + }; + bool AddNewChain = llvm::all_of(TFOps, hasImprovedChain); + if (AddNewChain) + TFOps.insert(TFOps.begin(), NewChain); + + SDValue TF = DAG.getNode(ISD::TokenFactor, SDLoc(STChain), MVT::Other, TFOps); + CombineTo(St, TF); + + AddToWorklist(STChain); + // Add TF operands worklist in reverse order. + for (auto I = TF->getNumOperands(); I;) + AddToWorklist(TF->getOperand(--I).getNode()); + AddToWorklist(TF.getNode()); + return true; +} + +bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { + if (OptLevel == CodeGenOpt::None) + return false; + + const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); + + // We must have a base and an offset. + if (!BasePtr.getBase().getNode()) + return false; - return MadeChangeToSt; + // Do not handle stores to undef base pointers. + if (BasePtr.getBase().isUndef()) + return false; + + // Directly improve a chain of disjoint stores starting at St. + if (parallelizeChainedStores(St)) + return true; + + // Improve St's Chain.. + SDValue BetterChain = FindBetterChain(St, St->getChain()); + if (St->getChain() != BetterChain) { + replaceStoreChain(St, BetterChain); + return true; + } + return false; } /// This is the entry point for the file. diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index d5f066c2423eb989420af4df1e8f6b8727520053..a9a3c44ea0c9ca6d62af1f5f84331bf8ba343761 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -547,6 +547,15 @@ void FastISel::removeDeadCode(MachineBasicBlock::iterator I, assert(I.isValid() && E.isValid() && std::distance(I, E) > 0 && "Invalid iterator!"); while (I != E) { + if (LastFlushPoint == I) + LastFlushPoint = E; + if (SavedInsertPt == I) + SavedInsertPt = E; + if (EmitStartPt == I) + EmitStartPt = E.isValid() ? &*E : nullptr; + if (LastLocalValue == I) + LastLocalValue = E.isValid() ? &*E : nullptr; + MachineInstr *Dead = &*I; ++I; Dead->eraseFromParent(); diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index d3c31911d67744b88add9dc672a5cf03469e61aa..fba728625b071bbc19e24655ee0b43dd4a34511e 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -579,9 +579,18 @@ FunctionLoweringInfo::getOrCreateSwiftErrorVRegUseAt(const Instruction *I, const const Value * FunctionLoweringInfo::getValueFromVirtualReg(unsigned Vreg) { if (VirtReg2Value.empty()) { + SmallVector ValueVTs; for (auto &P : ValueMap) { - VirtReg2Value[P.second] = P.first; + ValueVTs.clear(); + ComputeValueVTs(*TLI, Fn->getParent()->getDataLayout(), + P.first->getType(), ValueVTs); + unsigned Reg = P.second; + for (EVT VT : ValueVTs) { + unsigned NumRegisters = TLI->getNumRegisters(Fn->getContext(), VT); + for (unsigned i = 0, e = NumRegisters; i != e; ++i) + VirtReg2Value[Reg++] = P.first; + } } } - return VirtReg2Value[Vreg]; + return VirtReg2Value.lookup(Vreg); } diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index fc9c227e4dfacd3a1958a2404ad75221ce389c7b..6a6114677cc2b5ab22fff63661417d10792aca6f 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -652,6 +652,12 @@ void InstrEmitter::EmitRegSequence(SDNode *Node, const MCInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE); MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II, NewVReg); unsigned NumOps = Node->getNumOperands(); + // If the input pattern has a chain, then the root of the corresponding + // output pattern will get a chain as well. This can happen to be a + // REG_SEQUENCE (which is not "guarded" by countOperands/CountResults). + if (NumOps && Node->getOperand(NumOps-1).getValueType() == MVT::Other) + --NumOps; // Ignore chain if it exists. + assert((NumOps & 1) == 1 && "REG_SEQUENCE must have an odd number of operands!"); for (unsigned i = 1; i != NumOps; ++i) { @@ -694,6 +700,20 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD, assert(cast(Var)->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); + SD->setIsEmitted(); + + if (SD->isInvalidated()) { + // An invalidated SDNode must generate an undef DBG_VALUE: although the + // original value is no longer computed, earlier DBG_VALUEs live ranges + // must not leak into later code. + auto MIB = BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE)); + MIB.addReg(0U); + MIB.addReg(0U, RegState::Debug); + MIB.addMetadata(Var); + MIB.addMetadata(Expr); + return &*MIB; + } + if (SD->getKind() == SDDbgValue::FRAMEIX) { // Stack address; this needs to be lowered in target-dependent fashion. // EmitTargetCodeForFrameDebugValue is responsible for allocation. @@ -735,6 +755,9 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD, MIB.addImm(CI->getSExtValue()); } else if (const ConstantFP *CF = dyn_cast(V)) { MIB.addFPImm(CF); + } else if (isa(V)) { + // Note: This assumes that all nullptr constants are zero-valued. + MIB.addImm(0); } else { // Could be an Undef. In any case insert an Undef so we can see what we // dropped. diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index dcb479e4ce1fb85b136a1e3f8fd214ac73090e83..9d6a6939fbabafc0fcc1e3ed1b0eed83cab6610b 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1128,6 +1128,12 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; } + case ISD::SMULFIX: { + unsigned Scale = Node->getConstantOperandVal(2); + Action = TLI.getFixedPointOperationAction(Node->getOpcode(), + Node->getValueType(0), Scale); + break; + } case ISD::MSCATTER: Action = TLI.getOperationAction(Node->getOpcode(), cast(Node)->getValue().getValueType()); @@ -1170,6 +1176,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { } } break; + case ISD::FSHL: + case ISD::FSHR: case ISD::SRL_PARTS: case ISD::SRA_PARTS: case ISD::SHL_PARTS: { @@ -3262,6 +3270,16 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } break; } + case ISD::FSHL: + case ISD::FSHR: + if (TLI.expandFunnelShift(Node, Tmp1, DAG)) + Results.push_back(Tmp1); + break; + case ISD::ROTL: + case ISD::ROTR: + if (TLI.expandROT(Node, Tmp1, DAG)) + Results.push_back(Tmp1); + break; case ISD::SADDSAT: case ISD::UADDSAT: case ISD::SSUBSAT: @@ -3269,6 +3287,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(TLI.getExpandedSaturationAdditionSubtraction(Node, DAG)); break; } + case ISD::SMULFIX: { + Results.push_back(TLI.getExpandedFixedPointMultiplication(Node, DAG)); + break; + } case ISD::SADDO: case ISD::SSUBO: { SDValue LHS = Node->getOperand(0); @@ -3708,46 +3730,6 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { ReplaceNode(SDValue(Node, 0), Result); break; } - case ISD::ROTL: - case ISD::ROTR: { - bool IsLeft = Node->getOpcode() == ISD::ROTL; - SDValue Op0 = Node->getOperand(0), Op1 = Node->getOperand(1); - EVT ResVT = Node->getValueType(0); - EVT OpVT = Op0.getValueType(); - assert(OpVT == ResVT && - "The result and the operand types of rotate should match"); - EVT ShVT = Op1.getValueType(); - SDValue Width = DAG.getConstant(OpVT.getScalarSizeInBits(), dl, ShVT); - - // If a rotate in the other direction is legal, use it. - unsigned RevRot = IsLeft ? ISD::ROTR : ISD::ROTL; - if (TLI.isOperationLegal(RevRot, ResVT)) { - SDValue Sub = DAG.getNode(ISD::SUB, dl, ShVT, Width, Op1); - Results.push_back(DAG.getNode(RevRot, dl, ResVT, Op0, Sub)); - break; - } - - // Otherwise, - // (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and w-c, w-1))) - // (rotr x, c) -> (or (srl x, (and c, w-1)), (shl x, (and w-c, w-1))) - // - assert(isPowerOf2_32(OpVT.getScalarSizeInBits()) && - "Expecting the type bitwidth to be a power of 2"); - unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL; - unsigned HsOpc = IsLeft ? ISD::SRL : ISD::SHL; - SDValue Width1 = DAG.getNode(ISD::SUB, dl, ShVT, - Width, DAG.getConstant(1, dl, ShVT)); - SDValue NegOp1 = DAG.getNode(ISD::SUB, dl, ShVT, Width, Op1); - SDValue And0 = DAG.getNode(ISD::AND, dl, ShVT, Op1, Width1); - SDValue And1 = DAG.getNode(ISD::AND, dl, ShVT, NegOp1, Width1); - - SDValue Or = DAG.getNode(ISD::OR, dl, ResVT, - DAG.getNode(ShOpc, dl, ResVT, Op0, And0), - DAG.getNode(HsOpc, dl, ResVT, Op0, And1)); - Results.push_back(Or); - break; - } - case ISD::GLOBAL_OFFSET_TABLE: case ISD::GlobalAddress: case ISD::GlobalTLSAddress: diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 2b1df0165d336198eaad7ea1d208b83de62fea81..25fa2a0a4af1f6489f217db622178daa5c874b07 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -118,6 +118,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::FP_TO_FP16: Res = PromoteIntRes_FP_TO_FP16(N); break; + case ISD::FLT_ROUNDS_: Res = PromoteIntRes_FLT_ROUNDS(N); break; + case ISD::AND: case ISD::OR: case ISD::XOR: @@ -145,6 +147,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::UADDSAT: case ISD::SSUBSAT: case ISD::USUBSAT: Res = PromoteIntRes_ADDSUBSAT(N); break; + case ISD::SMULFIX: Res = PromoteIntRes_SMULFIX(N); break; case ISD::ATOMIC_LOAD: Res = PromoteIntRes_Atomic0(cast(N)); break; @@ -475,6 +478,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } +SDValue DAGTypeLegalizer::PromoteIntRes_FLT_ROUNDS(SDNode *N) { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDLoc dl(N); + + return DAG.getNode(N->getOpcode(), dl, NVT); +} + SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); @@ -580,7 +590,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) { SDLoc dl(N); SDValue Op1 = N->getOperand(0); SDValue Op2 = N->getOperand(1); - unsigned OldBits = Op1.getValueSizeInBits(); + unsigned OldBits = Op1.getScalarValueSizeInBits(); unsigned Opcode = N->getOpcode(); unsigned ShiftOp; @@ -602,7 +612,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) { SDValue Op2Promoted = GetPromotedInteger(Op2); EVT PromotedType = Op1Promoted.getValueType(); - unsigned NewBits = Op1Promoted.getValueSizeInBits(); + unsigned NewBits = PromotedType.getScalarSizeInBits(); unsigned SHLAmount = NewBits - OldBits; EVT SHVT = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout()); SDValue ShiftAmount = DAG.getConstant(SHLAmount, dl, SHVT); @@ -616,6 +626,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) { return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); } +SDValue DAGTypeLegalizer::PromoteIntRes_SMULFIX(SDNode *N) { + // Can just promote the operands then continue with operation. + SDLoc dl(N); + SDValue Op1Promoted = SExtPromotedInteger(N->getOperand(0)); + SDValue Op2Promoted = SExtPromotedInteger(N->getOperand(1)); + EVT PromotedType = Op1Promoted.getValueType(); + return DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, Op2Promoted, + N->getOperand(2)); +} + SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); @@ -1042,6 +1062,13 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::ADDCARRY: case ISD::SUBCARRY: Res = PromoteIntOp_ADDSUBCARRY(N, OpNo); break; + + case ISD::FRAMEADDR: + case ISD::RETURNADDR: Res = PromoteIntOp_FRAMERETURNADDR(N); break; + + case ISD::PREFETCH: Res = PromoteIntOp_PREFETCH(N, OpNo); break; + + case ISD::SMULFIX: Res = PromoteIntOp_SMULFIX(N); break; } // If the result is null, the sub-method took care of registering results etc. @@ -1063,9 +1090,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { /// shared among BR_CC, SELECT_CC, and SETCC handlers. void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS, ISD::CondCode CCCode) { - // We have to insert explicit sign or zero extends. Note that we could - // insert sign extends for ALL conditions, but zero extend is cheaper on - // many machines (an AND instead of two shifts), so prefer it. + // We have to insert explicit sign or zero extends. Note that we could + // insert sign extends for ALL conditions. For those operations where either + // zero or sign extension would be valid, use SExtOrZExtPromotedInteger + // which will choose the cheapest for the target. switch (CCCode) { default: llvm_unreachable("Unknown integer comparison!"); case ISD::SETEQ: @@ -1086,8 +1114,8 @@ void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS, NewLHS = OpL; NewRHS = OpR; } else { - NewLHS = ZExtPromotedInteger(NewLHS); - NewRHS = ZExtPromotedInteger(NewRHS); + NewLHS = SExtOrZExtPromotedInteger(NewLHS); + NewRHS = SExtOrZExtPromotedInteger(NewRHS); } break; } @@ -1095,11 +1123,8 @@ void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS, case ISD::SETUGT: case ISD::SETULE: case ISD::SETULT: - // ALL of these operations will work if we either sign or zero extend - // the operands (including the unsigned comparisons!). Zero extend is - // usually a simpler/cheaper operation, so prefer it. - NewLHS = ZExtPromotedInteger(NewLHS); - NewRHS = ZExtPromotedInteger(NewRHS); + NewLHS = SExtOrZExtPromotedInteger(NewLHS); + NewRHS = SExtOrZExtPromotedInteger(NewRHS); break; case ISD::SETGE: case ISD::SETGT: @@ -1403,6 +1428,30 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo) { return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, Carry), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_SMULFIX(SDNode *N) { + SDValue Op2 = ZExtPromotedInteger(N->getOperand(2)); + return SDValue( + DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), Op2), 0); +} + +SDValue DAGTypeLegalizer::PromoteIntOp_FRAMERETURNADDR(SDNode *N) { + // Promote the RETURNADDR/FRAMEADDR argument to a supported integer width. + SDValue Op = ZExtPromotedInteger(N->getOperand(0)); + return SDValue(DAG.UpdateNodeOperands(N, Op), 0); +} + +SDValue DAGTypeLegalizer::PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo) { + assert(OpNo > 1 && "Don't know how to promote this operand!"); + // Promote the rw, locality, and cache type arguments to a supported integer + // width. + SDValue Op2 = ZExtPromotedInteger(N->getOperand(2)); + SDValue Op3 = ZExtPromotedInteger(N->getOperand(3)); + SDValue Op4 = ZExtPromotedInteger(N->getOperand(4)); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), + Op2, Op3, Op4), + 0); +} + //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// @@ -1541,6 +1590,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::UADDSAT: case ISD::SSUBSAT: case ISD::USUBSAT: ExpandIntRes_ADDSUBSAT(N, Lo, Hi); break; + case ISD::SMULFIX: ExpandIntRes_SMULFIX(N, Lo, Hi); break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -2509,6 +2559,95 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBSAT(SDNode *N, SDValue &Lo, SplitInteger(Result, Lo, Hi); } +void DAGTypeLegalizer::ExpandIntRes_SMULFIX(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + uint64_t Scale = N->getConstantOperandVal(2); + if (!Scale) { + SDValue Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); + SplitInteger(Result, Lo, Hi); + return; + } + + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + SDValue LL, LH, RL, RH; + GetExpandedInteger(LHS, LL, LH); + GetExpandedInteger(RHS, RL, RH); + SmallVector Result; + + if (!TLI.expandMUL_LOHI(ISD::SMUL_LOHI, VT, dl, LHS, RHS, Result, NVT, DAG, + TargetLowering::MulExpansionKind::OnlyLegalOrCustom, + LL, LH, RL, RH)) { + report_fatal_error("Unable to expand SMUL_FIX using SMUL_LOHI."); + return; + } + + unsigned VTSize = VT.getScalarSizeInBits(); + unsigned NVTSize = NVT.getScalarSizeInBits(); + EVT ShiftTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); + + // Shift whole amount by scale. + SDValue ResultLL = Result[0]; + SDValue ResultLH = Result[1]; + SDValue ResultHL = Result[2]; + SDValue ResultHH = Result[3]; + + // After getting the multplication result in 4 parts, we need to perform a + // shift right by the amount of the scale to get the result in that scale. + // Let's say we multiply 2 64 bit numbers. The resulting value can be held in + // 128 bits that are cut into 4 32-bit parts: + // + // HH HL LH LL + // |---32---|---32---|---32---|---32---| + // 128 96 64 32 0 + // + // |------VTSize-----| + // + // |NVTSize-| + // + // The resulting Lo and Hi will only need to be one of these 32-bit parts + // after shifting. + if (Scale < NVTSize) { + // If the scale is less than the size of the VT we expand to, the Hi and + // Lo of the result will be in the first 2 parts of the result after + // shifting right. This only requires shifting by the scale as far as the + // third part in the result (ResultHL). + SDValue SRLAmnt = DAG.getConstant(Scale, dl, ShiftTy); + SDValue SHLAmnt = DAG.getConstant(NVTSize - Scale, dl, ShiftTy); + Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLL, SRLAmnt); + Lo = DAG.getNode(ISD::OR, dl, NVT, Lo, + DAG.getNode(ISD::SHL, dl, NVT, ResultLH, SHLAmnt)); + Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt); + Hi = DAG.getNode(ISD::OR, dl, NVT, Hi, + DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt)); + } else if (Scale == NVTSize) { + // If the scales are equal, Lo and Hi are ResultLH and Result HL, + // respectively. Avoid shifting to prevent undefined behavior. + Lo = ResultLH; + Hi = ResultHL; + } else if (Scale < VTSize) { + // If the scale is instead less than the old VT size, but greater than or + // equal to the expanded VT size, the first part of the result (ResultLL) is + // no longer a part of Lo because it would be scaled out anyway. Instead we + // can start shifting right from the fourth part (ResultHH) to the second + // part (ResultLH), and Result LH will be the new Lo. + SDValue SRLAmnt = DAG.getConstant(Scale - NVTSize, dl, ShiftTy); + SDValue SHLAmnt = DAG.getConstant(VTSize - Scale, dl, ShiftTy); + Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt); + Lo = DAG.getNode(ISD::OR, dl, NVT, Lo, + DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt)); + Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, SRLAmnt); + Hi = DAG.getNode(ISD::OR, dl, NVT, Hi, + DAG.getNode(ISD::SHL, dl, NVT, ResultHH, SHLAmnt)); + } else { + llvm_unreachable( + "Expected the scale to be less than the width of the operands"); + } +} + void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, SDValue &Lo, SDValue &Hi) { SDValue LHS = Node->getOperand(0); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 8b7c57cbb3b54460f37ff745af157caa109dcf44..032000f6cb799bbb6bc87ddedb9296105faff342 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -281,6 +281,20 @@ private: return DAG.getZeroExtendInReg(Op, dl, OldVT.getScalarType()); } + // Get a promoted operand and sign or zero extend it to the final size + // (depending on TargetLoweringInfo::isSExtCheaperThanZExt). For a given + // subtarget and type, the choice of sign or zero-extension will be + // consistent. + SDValue SExtOrZExtPromotedInteger(SDValue Op) { + EVT OldVT = Op.getValueType(); + SDLoc DL(Op); + Op = GetPromotedInteger(Op); + if (TLI.isSExtCheaperThanZExt(OldVT, Op.getValueType())) + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Op.getValueType(), Op, + DAG.getValueType(OldVT)); + return DAG.getZeroExtendInReg(Op, DL, OldVT.getScalarType()); + } + // Integer Result Promotion. void PromoteIntegerResult(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo); @@ -331,6 +345,8 @@ private: SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_ADDSUBSAT(SDNode *N); + SDValue PromoteIntRes_SMULFIX(SDNode *N); + SDValue PromoteIntRes_FLT_ROUNDS(SDNode *N); // Integer Operand Promotion. bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -361,6 +377,9 @@ private: SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); SDValue PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N); + SDValue PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_SMULFIX(SDNode *N); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -416,6 +435,7 @@ private: void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_XMULO (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBSAT (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_SMULFIX (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ATOMIC_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -671,6 +691,8 @@ private: SDValue ScalarizeVecRes_UNDEF(SDNode *N); SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N); + SDValue ScalarizeVecRes_SMULFIX(SDNode *N); + // Vector Operand Scalarization: <1 x ty> -> ty. bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo); SDValue ScalarizeVecOp_BITCAST(SDNode *N); @@ -706,6 +728,8 @@ private: void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_SMULFIX(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 17f05c3ba9775daf35f686a834b0ba4c50de2089..4df02c6a6bc5f35e45bcfe1a772c86c194a5a5fd 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -132,6 +132,8 @@ class VectorLegalizer { SDValue ExpandCTPOP(SDValue Op); SDValue ExpandCTLZ(SDValue Op); SDValue ExpandCTTZ(SDValue Op); + SDValue ExpandFunnelShift(SDValue Op); + SDValue ExpandROT(SDValue Op); SDValue ExpandFMINNUM_FMAXNUM(SDValue Op); SDValue ExpandStrictFPOp(SDValue Op); @@ -244,7 +246,11 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { if (SDValue Lowered = TLI.LowerOperation(Result, DAG)) { assert(Lowered->getNumValues() == Op->getNumValues() && "Unexpected number of results"); - Changed = Lowered != Result; + if (Lowered != Result) { + // Make sure the new code is also legal. + Lowered = LegalizeOp(Lowered); + Changed = true; + } return TranslateLegalizeResults(Op, Lowered); } LLVM_FALLTHROUGH; @@ -266,7 +272,11 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { return TranslateLegalizeResults(Op, Result); case TargetLowering::Custom: { SDValue Lowered = TLI.LowerOperation(Result, DAG); - Changed = Lowered != Result; + if (Lowered != Result) { + // Make sure the new code is also legal. + Lowered = LegalizeOp(Lowered); + Changed = true; + } return TranslateLegalizeResults(Op, Lowered); } case TargetLowering::Expand: @@ -322,6 +332,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::ADD: case ISD::SUB: case ISD::MUL: + case ISD::MULHS: + case ISD::MULHU: case ISD::SDIV: case ISD::UDIV: case ISD::SREM: @@ -403,6 +415,12 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::USUBSAT: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; + case ISD::SMULFIX: { + unsigned Scale = Node->getConstantOperandVal(2); + Action = TLI.getFixedPointOperationAction(Node->getOpcode(), + Node->getValueType(0), Scale); + break; + } case ISD::FP_ROUND_INREG: Action = TLI.getOperationAction(Node->getOpcode(), cast(Node->getOperand(1))->getVT()); @@ -739,6 +757,12 @@ SDValue VectorLegalizer::Expand(SDValue Op) { case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return ExpandCTTZ(Op); + case ISD::FSHL: + case ISD::FSHR: + return ExpandFunnelShift(Op); + case ISD::ROTL: + case ISD::ROTR: + return ExpandROT(Op); case ISD::FMINNUM: case ISD::FMAXNUM: return ExpandFMINNUM_FMAXNUM(Op); @@ -1116,32 +1140,42 @@ SDValue VectorLegalizer::ExpandFSUB(SDValue Op) { } SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) { - // Attempt to expand using TargetLowering. SDValue Result; if (TLI.expandCTPOP(Op.getNode(), Result, DAG)) return Result; - // Otherwise go ahead and unroll. return DAG.UnrollVectorOp(Op.getNode()); } SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { - // Attempt to expand using TargetLowering. SDValue Result; if (TLI.expandCTLZ(Op.getNode(), Result, DAG)) return Result; - // Otherwise go ahead and unroll. return DAG.UnrollVectorOp(Op.getNode()); } SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) { - // Attempt to expand using TargetLowering. SDValue Result; if (TLI.expandCTTZ(Op.getNode(), Result, DAG)) return Result; - // Otherwise go ahead and unroll. + return DAG.UnrollVectorOp(Op.getNode()); +} + +SDValue VectorLegalizer::ExpandFunnelShift(SDValue Op) { + SDValue Result; + if (TLI.expandFunnelShift(Op.getNode(), Result, DAG)) + return Result; + + return DAG.UnrollVectorOp(Op.getNode()); +} + +SDValue VectorLegalizer::ExpandROT(SDValue Op) { + SDValue Result; + if (TLI.expandROT(Op.getNode(), Result, DAG)) + return Result; + return DAG.UnrollVectorOp(Op.getNode()); } diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 88abd84366a9b2aa2041074f57ed286149ea1768..f367e9358576ab5e37d8e64f9f5d0b45ca75c513 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -172,6 +172,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::STRICT_FTRUNC: R = ScalarizeVecRes_StrictFPOp(N); break; + case ISD::SMULFIX: + R = ScalarizeVecRes_SMULFIX(N); + break; } // If R is null, the sub-method took care of registering the result. @@ -194,6 +197,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) { Op0.getValueType(), Op0, Op1, Op2); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_SMULFIX(SDNode *N) { + SDValue Op0 = GetScalarizedVector(N->getOperand(0)); + SDValue Op1 = GetScalarizedVector(N->getOperand(1)); + SDValue Op2 = N->getOperand(2); + return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1, + Op2); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) { EVT VT = N->getValueType(0).getVectorElementType(); unsigned NumOpers = N->getNumOperands(); @@ -848,6 +859,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::STRICT_FTRUNC: SplitVecRes_StrictFPOp(N, Lo, Hi); break; + case ISD::SMULFIX: + SplitVecRes_SMULFIX(N, Lo, Hi); + break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -885,6 +899,20 @@ void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, Op0Hi, Op1Hi, Op2Hi); } +void DAGTypeLegalizer::SplitVecRes_SMULFIX(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDValue LHSLo, LHSHi; + GetSplitVector(N->getOperand(0), LHSLo, LHSHi); + SDValue RHSLo, RHSHi; + GetSplitVector(N->getOperand(1), RHSLo, RHSHi); + SDLoc dl(N); + SDValue Op2 = N->getOperand(2); + + unsigned Opcode = N->getOpcode(); + Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Op2); + Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Op2); +} + void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { // We know the result is a vector. The input may be either a vector or a @@ -1694,13 +1722,6 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::VSELECT: Res = SplitVecOp_VSELECT(N, OpNo); break; - case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: - if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType())) - Res = SplitVecOp_TruncateHelper(N); - else - Res = SplitVecOp_UnaryOp(N); - break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType())) @@ -1708,6 +1729,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { else Res = SplitVecOp_UnaryOp(N); break; + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: case ISD::CTTZ: case ISD::CTLZ: case ISD::CTPOP: @@ -1934,6 +1957,15 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { // Load back the required element. StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); + + // FIXME: This is to handle i1 vectors with elements promoted to i8. + // i1 vector handling needs general improvement. + if (N->getValueType(0).bitsLT(EltVT)) { + SDValue Load = DAG.getLoad(EltVT, dl, Store, StackPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + return DAG.getZExtOrTrunc(Load, dl, N->getValueType(0)); + } + return DAG.getExtLoad( ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT); @@ -2238,16 +2270,31 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) { unsigned InElementSize = InVT.getScalarSizeInBits(); unsigned OutElementSize = OutVT.getScalarSizeInBits(); + // Determine the split output VT. If its legal we can just split dirctly. + EVT LoOutVT, HiOutVT; + std::tie(LoOutVT, HiOutVT) = DAG.GetSplitDestVTs(OutVT); + assert(LoOutVT == HiOutVT && "Unequal split?"); + // If the input elements are only 1/2 the width of the result elements, // just use the normal splitting. Our trick only work if there's room // to split more than once. - if (InElementSize <= OutElementSize * 2) + if (isTypeLegal(LoOutVT) || + InElementSize <= OutElementSize * 2) return SplitVecOp_UnaryOp(N); SDLoc DL(N); + // Don't touch if this will be scalarized. + EVT FinalVT = InVT; + while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector) + FinalVT = FinalVT.getHalfNumVectorElementsVT(*DAG.getContext()); + + if (getTypeAction(FinalVT) == TargetLowering::TypeScalarizeVector) + return SplitVecOp_UnaryOp(N); + // Get the split input vector. SDValue InLoVec, InHiVec; GetSplitVector(InVec, InLoVec, InHiVec); + // Truncate them to 1/2 the element size. EVT HalfElementVT = IsFloat ? EVT::getFloatingPointVT(InElementSize/2) : @@ -2378,6 +2425,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: + case ISD::UADDSAT: + case ISD::SADDSAT: + case ISD::USUBSAT: + case ISD::SSUBSAT: Res = WidenVecRes_Binary(N); break; @@ -2714,7 +2765,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) { // The Chain is the first operand. InOps.push_back(N->getOperand(0)); - // Now process the remaining operands. + // Now process the remaining operands. for (unsigned i = 1; i < NumOpers; ++i) { SDValue Oper = N->getOperand(i); @@ -2822,6 +2873,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { // If both input and result vector types are of same width, extend // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which // accepts fewer elements in the result than in the input. + if (Opcode == ISD::ANY_EXTEND) + return DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, WidenVT, InOp); if (Opcode == ISD::SIGN_EXTEND) return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, WidenVT, InOp); if (Opcode == ISD::ZERO_EXTEND) diff --git a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h index 490105a9d581486ed39ae96c2ca9cfa8cab9ec6b..f7566b246f323d13aa606026d04e507d03069922 100644 --- a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h +++ b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h @@ -53,6 +53,7 @@ private: enum DbgValueKind kind; bool IsIndirect; bool Invalid = false; + bool Emitted = false; public: /// Constructor for non-constants. @@ -126,6 +127,15 @@ public: void setIsInvalidated() { Invalid = true; } bool isInvalidated() const { return Invalid; } + /// setIsEmitted / isEmitted - Getter/Setter for flag indicating that this + /// SDDbgValue has been emitted to an MBB. + void setIsEmitted() { Emitted = true; } + bool isEmitted() const { return Emitted; } + + /// clearIsEmitted - Reset Emitted flag, for certain special cases where + /// dbg.addr is emitted twice. + void clearIsEmitted() { Emitted = false; } + LLVM_DUMP_METHOD void dump(raw_ostream &OS) const; }; diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 4b06cfcc7f674ba4a645e31333e231f8da800fc9..90e109b022fd5fee6110d95f9217db9ae7f21ecf 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -776,11 +776,9 @@ ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) { if (N->getHasDebugValue()) { MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos(); for (auto DV : DAG->GetDbgValues(N)) { - if (DV->isInvalidated()) - continue; - if (auto *DbgMI = Emitter.EmitDbgValue(DV, VRBaseMap)) - BB->insert(InsertPos, DbgMI); - DV->setIsInvalidated(); + if (!DV->isEmitted()) + if (auto *DbgMI = Emitter.EmitDbgValue(DV, VRBaseMap)) + BB->insert(InsertPos, DbgMI); } } } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 957b2e35494e098b562efa7b48b0c8f6c1096e5b..e258f0a218a538cfa0fde5a1cfe5888808abdb1e 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -722,7 +722,7 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, MachineBasicBlock *BB = Emitter.getBlock(); MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos(); for (auto DV : DAG->GetDbgValues(N)) { - if (DV->isInvalidated()) + if (DV->isEmitted()) continue; unsigned DVOrder = DV->getOrder(); if (!Order || DVOrder == Order) { @@ -731,7 +731,6 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, Orders.push_back({DVOrder, DbgMI}); BB->insert(InsertPos, DbgMI); } - DV->setIsInvalidated(); } } } @@ -822,8 +821,12 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { SDDbgInfo::DbgIterator PDE = DAG->ByvalParmDbgEnd(); for (; PDI != PDE; ++PDI) { MachineInstr *DbgMI= Emitter.EmitDbgValue(*PDI, VRBaseMap); - if (DbgMI) + if (DbgMI) { BB->insert(InsertPos, DbgMI); + // We re-emit the dbg_value closer to its use, too, after instructions + // are emitted to the BB. + (*PDI)->clearIsEmitted(); + } } } @@ -889,7 +892,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { for (; DI != DE; ++DI) { if ((*DI)->getOrder() < LastOrder || (*DI)->getOrder() >= Order) break; - if ((*DI)->isInvalidated()) + if ((*DI)->isEmitted()) continue; MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap); @@ -911,7 +914,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { // some of them before one or more conditional branches? SmallVector DbgMIs; for (; DI != DE; ++DI) { - if ((*DI)->isInvalidated()) + if ((*DI)->isEmitted()) continue; assert((*DI)->getOrder() >= LastOrder && "emitting DBG_VALUE out of order"); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index fce14d53c22a9a3c84e0a2acc58f343fd04b64e2..d69e50d9007be7ae2d2a00aa773b764684a2b39b 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1685,7 +1685,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, // SDNode doesn't have access to it. This memory will be "leaked" when // the node is deallocated, but recovered when the NodeAllocator is released. int *MaskAlloc = OperandAllocator.Allocate(NElts); - std::copy(MaskVec.begin(), MaskVec.end(), MaskAlloc); + llvm::copy(MaskVec, MaskAlloc); auto *N = newSDNode(VT, dl.getIROrder(), dl.getDebugLoc(), MaskAlloc); @@ -2121,6 +2121,102 @@ bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask, return Mask.isSubsetOf(computeKnownBits(Op, Depth).Zero); } +/// isSplatValue - Return true if the vector V has the same value +/// across all DemandedElts. +bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, + APInt &UndefElts) { + if (!DemandedElts) + return false; // No demanded elts, better to assume we don't know anything. + + EVT VT = V.getValueType(); + assert(VT.isVector() && "Vector type expected"); + + unsigned NumElts = VT.getVectorNumElements(); + assert(NumElts == DemandedElts.getBitWidth() && "Vector size mismatch"); + UndefElts = APInt::getNullValue(NumElts); + + switch (V.getOpcode()) { + case ISD::BUILD_VECTOR: { + SDValue Scl; + for (unsigned i = 0; i != NumElts; ++i) { + SDValue Op = V.getOperand(i); + if (Op.isUndef()) { + UndefElts.setBit(i); + continue; + } + if (!DemandedElts[i]) + continue; + if (Scl && Scl != Op) + return false; + Scl = Op; + } + return true; + } + case ISD::VECTOR_SHUFFLE: { + // Check if this is a shuffle node doing a splat. + // TODO: Do we need to handle shuffle(splat, undef, mask)? + int SplatIndex = -1; + ArrayRef Mask = cast(V)->getMask(); + for (int i = 0; i != (int)NumElts; ++i) { + int M = Mask[i]; + if (M < 0) { + UndefElts.setBit(i); + continue; + } + if (!DemandedElts[i]) + continue; + if (0 <= SplatIndex && SplatIndex != M) + return false; + SplatIndex = M; + } + return true; + } + case ISD::EXTRACT_SUBVECTOR: { + SDValue Src = V.getOperand(0); + ConstantSDNode *SubIdx = dyn_cast(V.getOperand(1)); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) { + // Offset the demanded elts by the subvector index. + uint64_t Idx = SubIdx->getZExtValue(); + APInt UndefSrcElts; + APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx); + if (isSplatValue(Src, DemandedSrc, UndefSrcElts)) { + UndefElts = UndefSrcElts.extractBits(NumElts, Idx); + return true; + } + } + break; + } + case ISD::ADD: + case ISD::SUB: + case ISD::AND: { + APInt UndefLHS, UndefRHS; + SDValue LHS = V.getOperand(0); + SDValue RHS = V.getOperand(1); + if (isSplatValue(LHS, DemandedElts, UndefLHS) && + isSplatValue(RHS, DemandedElts, UndefRHS)) { + UndefElts = UndefLHS | UndefRHS; + return true; + } + break; + } + } + + return false; +} + +/// Helper wrapper to main isSplatValue function. +bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) { + EVT VT = V.getValueType(); + assert(VT.isVector() && "Vector type expected"); + unsigned NumElts = VT.getVectorNumElements(); + + APInt UndefElts; + APInt DemandedElts = APInt::getAllOnesValue(NumElts); + return isSplatValue(V, DemandedElts, UndefElts) && + (AllowUndefs || !UndefElts); +} + /// Helper function that checks to see if a node is a constant or a /// build vector of splat constants at least within the demanded elts. static ConstantSDNode *isConstOrDemandedConstSplat(SDValue N, @@ -2195,6 +2291,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, KnownBits Known2; unsigned NumElts = DemandedElts.getBitWidth(); + assert((!Op.getValueType().isVector() || + NumElts == Op.getValueType().getVectorNumElements()) && + "Unexpected vector size"); if (!DemandedElts) return Known; // No demanded elts, better to assume we don't know anything. @@ -2203,8 +2302,6 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, switch (Opcode) { case ISD::BUILD_VECTOR: // Collect the known bits that are shared by every demanded vector element. - assert(NumElts == Op.getValueType().getVectorNumElements() && - "Unexpected vector size"); Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { if (!DemandedElts[i]) @@ -2344,6 +2441,19 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, } break; } + case ISD::SCALAR_TO_VECTOR: { + // We know about scalar_to_vector as much as we know about it source, + // which becomes the first element of otherwise unknown vector. + if (DemandedElts != 1) + break; + + SDValue N0 = Op.getOperand(0); + Known = computeKnownBits(N0, Depth + 1); + if (N0.getValueSizeInBits() != BitWidth) + Known = Known.trunc(BitWidth); + + break; + } case ISD::BITCAST: { SDValue N0 = Op.getOperand(0); EVT SubVT = N0.getValueType(); @@ -2569,6 +2679,39 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known.One.ashrInPlace(Shift); } break; + case ISD::FSHL: + case ISD::FSHR: + if (ConstantSDNode *C = + isConstOrDemandedConstSplat(Op.getOperand(2), DemandedElts)) { + unsigned Amt = C->getAPIntValue().urem(BitWidth); + + // For fshl, 0-shift returns the 1st arg. + // For fshr, 0-shift returns the 2nd arg. + if (Amt == 0) { + Known = computeKnownBits(Op.getOperand(Opcode == ISD::FSHL ? 0 : 1), + DemandedElts, Depth + 1); + break; + } + + // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) + // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + if (Opcode == ISD::FSHL) { + Known.One <<= Amt; + Known.Zero <<= Amt; + Known2.One.lshrInPlace(BitWidth - Amt); + Known2.Zero.lshrInPlace(BitWidth - Amt); + } else { + Known.One <<= BitWidth - Amt; + Known.Zero <<= BitWidth - Amt; + Known2.One.lshrInPlace(Amt); + Known2.Zero.lshrInPlace(Amt); + } + Known.One |= Known2.One; + Known.Zero |= Known2.Zero; + } + break; case ISD::SIGN_EXTEND_INREG: { EVT EVT = cast(Op.getOperand(1))->getVT(); unsigned EBits = EVT.getScalarSizeInBits(); @@ -4167,9 +4310,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: assert(VT.isVector() && "This DAG node is restricted to vector types."); - assert(VT.getSizeInBits() == Operand.getValueSizeInBits() && - "The sizes of the input and result must match in order to perform the " - "extend in-register."); + assert(Operand.getValueType().bitsLE(VT) && + "The input must be the same size or smaller than the result."); assert(VT.getVectorNumElements() < Operand.getValueType().getVectorNumElements() && "The destination vector type must have fewer lanes than the input."); @@ -4387,14 +4529,20 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, if (GlobalAddressSDNode *GA = dyn_cast(Cst2)) return FoldSymbolOffset(Opcode, VT, GA, Cst1); - // For vectors extract each constant element into Inputs so we can constant - // fold them individually. - BuildVectorSDNode *BV1 = dyn_cast(Cst1); - BuildVectorSDNode *BV2 = dyn_cast(Cst2); - if (!BV1 || !BV2) + // For vectors, extract each constant element and fold them individually. + // Either input may be an undef value. + auto *BV1 = dyn_cast(Cst1); + if (!BV1 && !Cst1->isUndef()) + return SDValue(); + auto *BV2 = dyn_cast(Cst2); + if (!BV2 && !Cst2->isUndef()) + return SDValue(); + // If both operands are undef, that's handled the same way as scalars. + if (!BV1 && !BV2) return SDValue(); - assert(BV1->getNumOperands() == BV2->getNumOperands() && "Out of sync!"); + assert((!BV1 || !BV2 || BV1->getNumOperands() == BV2->getNumOperands()) && + "Vector binop with different number of elements in operands?"); EVT SVT = VT.getScalarType(); EVT LegalSVT = SVT; @@ -4404,10 +4552,10 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, return SDValue(); } SmallVector Outputs; - for (unsigned I = 0, E = BV1->getNumOperands(); I != E; ++I) { - SDValue V1 = BV1->getOperand(I); - SDValue V2 = BV2->getOperand(I); - + unsigned NumOps = BV1 ? BV1->getNumOperands() : BV2->getNumOperands(); + for (unsigned I = 0; I != NumOps; ++I) { + SDValue V1 = BV1 ? BV1->getOperand(I) : getUNDEF(SVT); + SDValue V2 = BV2 ? BV2->getOperand(I) : getUNDEF(SVT); if (SVT.isInteger()) { if (V1->getValueType(0).bitsGT(SVT)) V1 = getNode(ISD::TRUNCATE, DL, SVT, V1); @@ -4635,6 +4783,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::SHL: case ISD::SRA: case ISD::SRL: + if (SDValue V = simplifyShift(N1, N2)) + return V; + LLVM_FALLTHROUGH; case ISD::ROTL: case ISD::ROTR: assert(VT == N1.getValueType() && @@ -4643,7 +4794,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, "Shifts only work on integers"); assert((!VT.isVector() || VT == N2.getValueType()) && "Vector shift amounts must be in the same as their first arg"); - // Verify that the shift amount VT is bit enough to hold valid shift + // Verify that the shift amount VT is big enough to hold valid shift // amounts. This catches things like trying to shift an i1024 value by an // i8, which is easy to fall into in generic code that uses // TLI.getShiftAmount(). @@ -4691,8 +4842,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(!EVT.isVector() && "AssertSExt/AssertZExt type should be the vector element type " "rather than the vector type!"); - assert(EVT.bitsLE(VT) && "Not extending!"); - if (VT == EVT) return N1; // noop assertion. + assert(EVT.bitsLE(VT.getScalarType()) && "Not extending!"); + if (VT.getScalarType() == EVT) return N1; // noop assertion. break; } case ISD::SIGN_EXTEND_INREG: { @@ -4929,14 +5080,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } } - // Any FP binop with an undef operand is folded to NaN. This matches the - // behavior of the IR optimizer. switch (Opcode) { case ISD::FADD: case ISD::FSUB: case ISD::FMUL: case ISD::FDIV: case ISD::FREM: + // If both operands are undef, the result is undef. If 1 operand is undef, + // the result is NaN. This should match the behavior of the IR optimizer. + if (N1.isUndef() && N2.isUndef()) + return getUNDEF(VT); if (N1.isUndef() || N2.isUndef()) return getConstantFP(APFloat::getNaN(EVTToAPFloatSemantics(VT)), DL, VT); } @@ -4955,9 +5108,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::SDIV: case ISD::UREM: case ISD::SREM: - case ISD::SRA: - case ISD::SRL: - case ISD::SHL: return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0 } } @@ -4973,16 +5123,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return getConstant(0, DL, VT); LLVM_FALLTHROUGH; case ISD::ADD: - case ISD::ADDC: - case ISD::ADDE: case ISD::SUB: case ISD::UDIV: case ISD::SDIV: case ISD::UREM: case ISD::SREM: - case ISD::SRA: - case ISD::SRL: - case ISD::SHL: return getUNDEF(VT); // fold op(arg1, undef) -> undef case ISD::MUL: case ISD::AND: @@ -5078,13 +5223,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; } case ISD::SELECT: - if (ConstantSDNode *N1C = dyn_cast(N1)) { - if (N1C->getZExtValue()) - return N2; // select true, X, Y -> X - return N3; // select false, X, Y -> Y - } - - if (N2 == N3) return N2; // select C, X, X -> X + case ISD::VSELECT: + if (SDValue V = simplifySelect(N1, N2, N3)) + return V; break; case ISD::VECTOR_SHUFFLE: llvm_unreachable("should use getVectorShuffle constructor!"); @@ -5383,12 +5524,10 @@ static bool FindOptimalMemOpLowering(std::vector &MemOps, // If the new VT cannot cover all of the remaining bits, then consider // issuing a (or a pair of) unaligned and overlapping load / store. - // FIXME: Only does this for 64-bit or more since we don't have proper - // cost model for unaligned load / store. bool Fast; - if (NumMemOps && AllowOverlap && - VTSize >= 8 && NewVTSize < Size && - TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign, &Fast) && Fast) + if (NumMemOps && AllowOverlap && NewVTSize < Size && + TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign, &Fast) && + Fast) VTSize = Size; else { VT = NewVT; @@ -6784,6 +6923,60 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl, return V; } +SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) { + // select undef, T, F --> T (if T is a constant), otherwise F + // select, ?, undef, F --> F + // select, ?, T, undef --> T + if (Cond.isUndef()) + return isConstantValueOfAnyType(T) ? T : F; + if (T.isUndef()) + return F; + if (F.isUndef()) + return T; + + // select true, T, F --> T + // select false, T, F --> F + if (auto *CondC = dyn_cast(Cond)) + return CondC->isNullValue() ? F : T; + + // TODO: This should simplify VSELECT with constant condition using something + // like this (but check boolean contents to be complete?): + // if (ISD::isBuildVectorAllOnes(Cond.getNode())) + // return T; + // if (ISD::isBuildVectorAllZeros(Cond.getNode())) + // return F; + + // select ?, T, T --> T + if (T == F) + return T; + + return SDValue(); +} + +SDValue SelectionDAG::simplifyShift(SDValue X, SDValue Y) { + // shift undef, Y --> 0 (can always assume that the undef value is 0) + if (X.isUndef()) + return getConstant(0, SDLoc(X.getNode()), X.getValueType()); + // shift X, undef --> undef (because it may shift by the bitwidth) + if (Y.isUndef()) + return getUNDEF(X.getValueType()); + + // shift 0, Y --> 0 + // shift X, 0 --> X + if (isNullOrNullSplat(X) || isNullOrNullSplat(Y)) + return X; + + // shift X, C >= bitwidth(X) --> undef + // All vector elements must be too big to avoid partial undefs. + auto isShiftTooBig = [X](ConstantSDNode *Val) { + return Val->getAPIntValue().uge(X.getScalarValueSizeInBits()); + }; + if (ISD::matchUnaryPredicate(Y, isShiftTooBig)) + return getUNDEF(X.getValueType()); + + return SDValue(); +} + SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue SV, unsigned Align) { SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, dl, MVT::i32) }; @@ -7039,7 +7232,7 @@ SDVTList SelectionDAG::getVTList(ArrayRef VTs) { SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(NumVTs); - std::copy(VTs.begin(), VTs.end(), Array); + llvm::copy(VTs, Array); Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs); VTListMap.InsertNode(Result, IP); } @@ -7185,7 +7378,7 @@ void SelectionDAG::setNodeMemRefs(MachineSDNode *N, MachineMemOperand **MemRefsBuffer = Allocator.template Allocate(NewMemRefs.size()); - std::copy(NewMemRefs.begin(), NewMemRefs.end(), MemRefsBuffer); + llvm::copy(NewMemRefs, MemRefsBuffer); N->MemRefs = MemRefsBuffer; N->NumMemRefs = static_cast(NewMemRefs.size()); } @@ -7674,8 +7867,11 @@ void SelectionDAG::transferDbgValues(SDValue From, SDValue To, Dbg->getDebugLoc(), Dbg->getOrder()); ClonedDVs.push_back(Clone); - if (InvalidateDbg) + if (InvalidateDbg) { + // Invalidate value and indicate the SDDbgValue should not be emitted. Dbg->setIsInvalidated(); + Dbg->setIsEmitted(); + } } for (SDDbgValue *Dbg : ClonedDVs) @@ -7712,6 +7908,7 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) { DV->isIndirect(), DV->getDebugLoc(), DV->getOrder()); ClonedDVs.push_back(Clone); DV->setIsInvalidated(); + DV->setIsEmitted(); LLVM_DEBUG(dbgs() << "SALVAGE: Rewriting"; N0.getNode()->dumprFull(this); dbgs() << " into " << *DIExpr << '\n'); @@ -8319,6 +8516,26 @@ ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N, bool AllowUndefs) { return nullptr; } +bool llvm::isNullOrNullSplat(SDValue N) { + // TODO: may want to use peekThroughBitcast() here. + ConstantSDNode *C = isConstOrConstSplat(N); + return C && C->isNullValue(); +} + +bool llvm::isOneOrOneSplat(SDValue N) { + // TODO: may want to use peekThroughBitcast() here. + unsigned BitWidth = N.getScalarValueSizeInBits(); + ConstantSDNode *C = isConstOrConstSplat(N); + return C && C->isOne() && C->getValueSizeInBits(0) == BitWidth; +} + +bool llvm::isAllOnesOrAllOnesSplat(SDValue N) { + N = peekThroughBitcasts(N); + unsigned BitWidth = N.getScalarValueSizeInBits(); + ConstantSDNode *C = isConstOrConstSplat(N); + return C && C->isAllOnesValue() && C->getValueSizeInBits(0) == BitWidth; +} + HandleSDNode::~HandleSDNode() { DropOperands(); } @@ -8943,8 +9160,11 @@ SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) { void SelectionDAG::createOperands(SDNode *Node, ArrayRef Vals) { assert(!Node->OperandList && "Node already has operands"); + assert(std::numeric_limits::max() > + Vals.size() && + "too many operands to fit into SDNode"); SDUse *Ops = OperandRecycler.allocate( - ArrayRecycler::Capacity::get(Vals.size()), OperandAllocator); + ArrayRecycler::Capacity::get(Vals.size()), OperandAllocator); bool IsDivergent = false; for (unsigned I = 0; I != Vals.size(); ++I) { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index 8c57f18183e95777dd6210dd7440225559509117..488bac1a9a80013b17c4fd6eb9ef0a09129f2a3f 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -107,14 +107,14 @@ BaseIndexOffset BaseIndexOffset::match(const LSBaseSDNode *N, if (auto *C = dyn_cast(Base->getOperand(1))) if (DAG.MaskedValueIsZero(Base->getOperand(0), C->getAPIntValue())) { Offset += C->getSExtValue(); - Base = Base->getOperand(0); + Base = DAG.getTargetLoweringInfo().unwrapAddress(Base->getOperand(0)); continue; } break; case ISD::ADD: if (auto *C = dyn_cast(Base->getOperand(1))) { Offset += C->getSExtValue(); - Base = Base->getOperand(0); + Base = DAG.getTargetLoweringInfo().unwrapAddress(Base->getOperand(0)); continue; } break; @@ -130,7 +130,7 @@ BaseIndexOffset BaseIndexOffset::match(const LSBaseSDNode *N, Offset -= Off; else Offset += Off; - Base = LSBase->getBasePtr(); + Base = DAG.getTargetLoweringInfo().unwrapAddress(LSBase->getBasePtr()); continue; } break; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index bf24d7f75622747e28589419c2c64f2ffe1c8734..c42d2491243553ff306423622fdf4f7a34b2ed2c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -823,7 +823,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, // If the source register was virtual and if we know something about it, // add an assert node. if (!TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) || - !RegisterVT.isInteger() || RegisterVT.isVector()) + !RegisterVT.isInteger()) continue; const FunctionLoweringInfo::LiveOutInfo *LOI = @@ -831,7 +831,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, if (!LOI) continue; - unsigned RegSize = RegisterVT.getSizeInBits(); + unsigned RegSize = RegisterVT.getScalarSizeInBits(); unsigned NumSignBits = LOI->NumSignBits; unsigned NumZeroBits = LOI->Known.countMinLeadingZeros(); @@ -1032,8 +1032,19 @@ SDValue SelectionDAGBuilder::getRoot() { } // Otherwise, we have to make a token factor node. - SDValue Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, - PendingLoads); + // If we have >= 2^16 loads then split across multiple token factors as + // there's a 64k limit on the number of SDNode operands. + SDValue Root; + size_t Limit = (1 << 16) - 1; + while (PendingLoads.size() > Limit) { + unsigned SliceIdx = PendingLoads.size() - Limit; + auto ExtractedTFs = ArrayRef(PendingLoads).slice(SliceIdx, Limit); + SDValue NewTF = + DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, ExtractedTFs); + PendingLoads.erase(PendingLoads.begin() + SliceIdx, PendingLoads.end()); + PendingLoads.emplace_back(NewTF); + } + Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, PendingLoads); PendingLoads.clear(); DAG.setRoot(Root); return Root; @@ -2801,6 +2812,15 @@ static bool isVectorReductionOp(const User *I) { return ReduxExtracted; } +void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) { + SDNodeFlags Flags; + + SDValue Op = getValue(I.getOperand(0)); + SDValue UnNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op.getValueType(), + Op, Flags); + setValue(&I, UnNodeValue); +} + void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) { SDNodeFlags Flags; if (auto *OFBinOp = dyn_cast(&I)) { @@ -5292,7 +5312,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; SDDbgValue *SDV; - if (isa(V) || isa(V) || isa(V)) { + if (isa(V) || isa(V) || isa(V) || + isa(V)) { SDV = DAG.getConstantDbgValue(Variable, Expression, V, dl, SDNodeOrder); DAG.AddDbgValue(SDV, nullptr, false); return nullptr; @@ -5731,6 +5752,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue Zero = DAG.getConstant(0, sdl, VT); SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC); + auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR; + if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) { + setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z)); + return nullptr; + } + // When X == Y, this is rotate. If the data type has a power-of-2 size, we // avoid the select that is necessary in the general case to filter out // the 0-shift possibility that leads to UB. @@ -5805,6 +5832,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { setValue(&I, DAG.getNode(ISD::USUBSAT, sdl, Op1.getValueType(), Op1, Op2)); return nullptr; } + case Intrinsic::smul_fix: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDValue Op3 = getValue(I.getArgOperand(2)); + setValue(&I, + DAG.getNode(ISD::SMULFIX, sdl, Op1.getValueType(), Op1, Op2, Op3)); + return nullptr; + } case Intrinsic::stacksave: { SDValue Op = getRoot(); Res = DAG.getNode( diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 4b5dda982f1b5c65252cfac4bf251b88f766d403..5f9cdb69daf72d0d017ace630f7f6f7bed163951 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -854,6 +854,9 @@ private: void visitInvoke(const InvokeInst &I); void visitResume(const ResumeInst &I); + void visitUnary(const User &I, unsigned Opcode); + void visitFNeg(const User &I) { visitUnary(I, ISD::FNEG); } + void visitBinary(const User &I, unsigned Opcode); void visitShift(const User &I, unsigned Opcode); void visitAdd(const User &I) { visitBinary(I, ISD::ADD); } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 02d45df5864d906aadf12959eff1ab9cfe64b3e9..43df2abb674bebb53530040b91156b783c121efb 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -237,6 +237,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SRL: return "srl"; case ISD::ROTL: return "rotl"; case ISD::ROTR: return "rotr"; + case ISD::FSHL: return "fshl"; + case ISD::FSHR: return "fshr"; case ISD::FADD: return "fadd"; case ISD::STRICT_FADD: return "strict_fadd"; case ISD::FSUB: return "fsub"; @@ -295,6 +297,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::UADDSAT: return "uaddsat"; case ISD::SSUBSAT: return "ssubsat"; case ISD::USUBSAT: return "usubsat"; + case ISD::SMULFIX: return "smulfix"; // Conversion operators. case ISD::SIGN_EXTEND: return "sign_extend"; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index dca358032fb59fec6881a27b0a3ac9a136cc15fb..7d341dae210b1e532b6cfc3654dc50345e2ddc50 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -177,7 +177,8 @@ static const bool ViewDAGCombine1 = false, /// RegisterScheduler class - Track the registration of instruction schedulers. /// //===---------------------------------------------------------------------===// -MachinePassRegistry RegisterScheduler::Registry; +MachinePassRegistry + RegisterScheduler::Registry; //===---------------------------------------------------------------------===// /// @@ -696,10 +697,10 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() { if (!TargetRegisterInfo::isVirtualRegister(DestReg)) continue; - // Ignore non-scalar or non-integer values. + // Ignore non-integer values. SDValue Src = N->getOperand(2); EVT SrcVT = Src.getValueType(); - if (!SrcVT.isInteger() || SrcVT.isVector()) + if (!SrcVT.isInteger()) continue; unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src); @@ -3206,6 +3207,18 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, N.getNode())) break; continue; + case OPC_CheckPredicateWithOperands: { + unsigned OpNum = MatcherTable[MatcherIndex++]; + SmallVector Operands; + + for (unsigned i = 0; i < OpNum; ++i) + Operands.push_back(RecordedNodes[MatcherTable[MatcherIndex++]].first); + + unsigned PredNo = MatcherTable[MatcherIndex++]; + if (!CheckNodePredicateWithOperands(N.getNode(), PredNo, Operands)) + break; + continue; + } case OPC_CheckComplexPat: { unsigned CPNum = MatcherTable[MatcherIndex++]; unsigned RecNo = MatcherTable[MatcherIndex++]; diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 54cbd6859f7031eb8682c5d72891f761af8d89f4..90a1b350fc942e1bbb426066c7c71c3ecd83d3b8 100644 --- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -522,7 +522,16 @@ lowerStatepointMetaArgs(SmallVectorImpl &Ops, // The vm state arguments are lowered in an opaque manner. We do not know // what type of values are contained within. for (const Value *V : SI.DeoptState) { - SDValue Incoming = Builder.getValue(V); + SDValue Incoming; + // If this is a function argument at a static frame index, generate it as + // the frame index. + if (const Argument *Arg = dyn_cast(V)) { + int FI = Builder.FuncInfo.getArgumentFrameIndex(Arg); + if (FI != INT_MAX) + Incoming = Builder.DAG.getFrameIndex(FI, Builder.getFrameIndexTy()); + } + if (!Incoming.getNode()) + Incoming = Builder.getValue(V); const bool LiveInValue = LiveInDeopt && !isGCValue(V); lowerIncomingStatepointValue(Incoming, LiveInValue, Ops, Builder); } diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 2bc9090428b4a45a7e1c58091f5c29f7ff0a45cc..4c551d5b231ff8f56b0fd6fac193a2e9ee1a0ce9 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -496,23 +496,41 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, return Simplified; } +bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, + KnownBits &Known, + TargetLoweringOpt &TLO, + unsigned Depth, + bool AssumeSingleUse) const { + EVT VT = Op.getValueType(); + APInt DemandedElts = VT.isVector() + ? APInt::getAllOnesValue(VT.getVectorNumElements()) + : APInt(1, 1); + return SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, Depth, + AssumeSingleUse); +} + /// Look at Op. At this point, we know that only the OriginalDemandedBits of the /// result of Op are ever used downstream. If we can use this information to /// simplify Op, create a new simplified DAG node and return true, returning the /// original and new nodes in Old and New. Otherwise, analyze the expression and /// return a mask of Known bits for the expression (used to simplify the /// caller). The Known bits may only be accurate for those bits in the -/// DemandedMask. -bool TargetLowering::SimplifyDemandedBits(SDValue Op, - const APInt &OriginalDemandedBits, - KnownBits &Known, - TargetLoweringOpt &TLO, - unsigned Depth, - bool AssumeSingleUse) const { +/// OriginalDemandedBits and OriginalDemandedElts. +bool TargetLowering::SimplifyDemandedBits( + SDValue Op, const APInt &OriginalDemandedBits, + const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, + unsigned Depth, bool AssumeSingleUse) const { unsigned BitWidth = OriginalDemandedBits.getBitWidth(); assert(Op.getScalarValueSizeInBits() == BitWidth && "Mask size mismatches value type size!"); + + unsigned NumElts = OriginalDemandedElts.getBitWidth(); + assert((!Op.getValueType().isVector() || + NumElts == Op.getValueType().getVectorNumElements()) && + "Unexpected vector size"); + APInt DemandedBits = OriginalDemandedBits; + APInt DemandedElts = OriginalDemandedElts; SDLoc dl(Op); auto &DL = TLO.DAG.getDataLayout(); @@ -532,18 +550,19 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (Depth != 0) { // If not at the root, Just compute the Known bits to // simplify things downstream. - TLO.DAG.computeKnownBits(Op, Known, Depth); + TLO.DAG.computeKnownBits(Op, Known, DemandedElts, Depth); return false; } // If this is the root being simplified, allow it to have multiple uses, - // just set the DemandedBits to all bits. + // just set the DemandedBits/Elts to all bits. DemandedBits = APInt::getAllOnesValue(BitWidth); - } else if (OriginalDemandedBits == 0) { - // Not demanding any bits from Op. + DemandedElts = APInt::getAllOnesValue(NumElts); + } else if (OriginalDemandedBits == 0 || OriginalDemandedElts == 0) { + // Not demanding any bits/elts from Op. if (!Op.isUndef()) return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); return false; - } else if (Depth == 6) { // Limit search depth. + } else if (Depth == 6) { // Limit search depth. return false; } @@ -573,18 +592,71 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, Known.One &= Known2.One; Known.Zero &= Known2.Zero; } - return false; // Don't fall through, will infinitely loop. - case ISD::CONCAT_VECTORS: + return false; // Don't fall through, will infinitely loop. + case ISD::CONCAT_VECTORS: { Known.Zero.setAllBits(); Known.One.setAllBits(); - for (SDValue SrcOp : Op->ops()) { - if (SimplifyDemandedBits(SrcOp, DemandedBits, Known2, TLO, Depth + 1)) + EVT SubVT = Op.getOperand(0).getValueType(); + unsigned NumSubVecs = Op.getNumOperands(); + unsigned NumSubElts = SubVT.getVectorNumElements(); + for (unsigned i = 0; i != NumSubVecs; ++i) { + APInt DemandedSubElts = + DemandedElts.extractBits(NumSubElts, i * NumSubElts); + if (SimplifyDemandedBits(Op.getOperand(i), DemandedBits, DemandedSubElts, + Known2, TLO, Depth + 1)) return true; - // Known bits are the values that are shared by every subvector. - Known.One &= Known2.One; - Known.Zero &= Known2.Zero; + // Known bits are shared by every demanded subvector element. + if (!!DemandedSubElts) { + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; + } + } + break; + } + case ISD::VECTOR_SHUFFLE: { + ArrayRef ShuffleMask = cast(Op)->getMask(); + + // Collect demanded elements from shuffle operands.. + APInt DemandedLHS(NumElts, 0); + APInt DemandedRHS(NumElts, 0); + for (unsigned i = 0; i != NumElts; ++i) { + if (!DemandedElts[i]) + continue; + int M = ShuffleMask[i]; + if (M < 0) { + // For UNDEF elements, we don't know anything about the common state of + // the shuffle result. + DemandedLHS.clearAllBits(); + DemandedRHS.clearAllBits(); + break; + } + assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range"); + if (M < (int)NumElts) + DemandedLHS.setBit(M); + else + DemandedRHS.setBit(M - NumElts); + } + + if (!!DemandedLHS || !!DemandedRHS) { + Known.Zero.setAllBits(); + Known.One.setAllBits(); + if (!!DemandedLHS) { + if (SimplifyDemandedBits(Op.getOperand(0), DemandedBits, DemandedLHS, + Known2, TLO, Depth + 1)) + return true; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; + } + if (!!DemandedRHS) { + if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, DemandedRHS, + Known2, TLO, Depth + 1)) + return true; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; + } } break; + } case ISD::AND: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -596,7 +668,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (ConstantSDNode *RHSC = isConstOrConstSplat(Op1)) { KnownBits LHSKnown; // Do not increment Depth here; that can cause an infinite loop. - TLO.DAG.computeKnownBits(Op0, LHSKnown, Depth); + TLO.DAG.computeKnownBits(Op0, LHSKnown, DemandedElts, Depth); // If the LHS already has zeros where RHSC does, this 'and' is dead. if ((LHSKnown.Zero & DemandedBits) == (~RHSC->getAPIntValue() & DemandedBits)) @@ -619,10 +691,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } } - if (SimplifyDemandedBits(Op1, DemandedBits, Known, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - if (SimplifyDemandedBits(Op0, ~Known.Zero & DemandedBits, Known2, TLO, + if (SimplifyDemandedBits(Op0, ~Known.Zero & DemandedBits, DemandedElts, Known2, TLO, Depth + 1)) return true; assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); @@ -653,10 +725,11 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - if (SimplifyDemandedBits(Op1, DemandedBits, Known, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - if (SimplifyDemandedBits(Op0, ~Known.One & DemandedBits, Known2, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op0, ~Known.One & DemandedBits, DemandedElts, Known2, TLO, + Depth + 1)) return true; assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); @@ -683,10 +756,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - if (SimplifyDemandedBits(Op1, DemandedBits, Known, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - if (SimplifyDemandedBits(Op0, DemandedBits, Known2, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known2, TLO, Depth + 1)) return true; assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); @@ -840,7 +913,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } } - if (SimplifyDemandedBits(Op0, DemandedBits.lshr(ShAmt), Known, TLO, + if (SimplifyDemandedBits(Op0, DemandedBits.lshr(ShAmt), DemandedElts, Known, TLO, Depth + 1)) return true; @@ -935,7 +1008,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } // Compute the new bits that are at the top now. - if (SimplifyDemandedBits(Op0, InDemandedMask, Known, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShAmt); @@ -974,7 +1047,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (DemandedBits.countLeadingZeros() < ShAmt) InDemandedMask.setSignBit(); - if (SimplifyDemandedBits(Op0, InDemandedMask, Known, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShAmt); @@ -1084,19 +1157,19 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, break; } case ISD::ZERO_EXTEND: { - unsigned OperandBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); + SDValue Src = Op.getOperand(0); + unsigned InBits = Src.getScalarValueSizeInBits(); // If none of the top bits are demanded, convert this into an any_extend. - if (DemandedBits.getActiveBits() <= OperandBitWidth) - return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, - Op.getOperand(0))); + if (DemandedBits.getActiveBits() <= InBits) + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, Src)); - APInt InMask = DemandedBits.trunc(OperandBitWidth); - if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1)) + APInt InDemandedBits = DemandedBits.trunc(InBits); + if (SimplifyDemandedBits(Src, InDemandedBits, Known, TLO, Depth+1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known = Known.zext(BitWidth); - Known.Zero.setBitsFrom(OperandBitWidth); + Known.Zero.setBitsFrom(InBits); break; } case ISD::SIGN_EXTEND: { @@ -1143,9 +1216,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, break; } case ISD::ANY_EXTEND: { - unsigned OperandBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); - APInt InMask = DemandedBits.trunc(OperandBitWidth); - if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1)) + SDValue Src = Op.getOperand(0); + unsigned InBits = Src.getScalarValueSizeInBits(); + APInt InDemandedBits = DemandedBits.trunc(InBits); + if (SimplifyDemandedBits(Src, InDemandedBits, Known, TLO, Depth+1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known = Known.zext(BitWidth); @@ -1219,6 +1293,33 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, Known.Zero |= ~InMask; break; } + case ISD::EXTRACT_VECTOR_ELT: { + SDValue Src = Op.getOperand(0); + SDValue Idx = Op.getOperand(1); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + unsigned EltBitWidth = Src.getScalarValueSizeInBits(); + + // Demand the bits from every vector element without a constant index. + APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts); + if (auto *CIdx = dyn_cast(Idx)) + if (CIdx->getAPIntValue().ult(NumSrcElts)) + DemandedSrcElts = APInt::getOneBitSet(NumSrcElts, CIdx->getZExtValue()); + + // If BitWidth > EltBitWidth the value is anyext:ed. So we do not know + // anything about the extended bits. + APInt DemandedSrcBits = DemandedBits; + if (BitWidth > EltBitWidth) + DemandedSrcBits = DemandedSrcBits.trunc(EltBitWidth); + + if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts, Known2, TLO, + Depth + 1)) + return true; + + Known = Known2; + if (BitWidth > EltBitWidth) + Known = Known.zext(BitWidth); + break; + } case ISD::BITCAST: { SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); @@ -1247,8 +1348,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt)); } } - // If bitcast from a vector and the mask covers entire elements, see if we - // can use SimplifyDemandedVectorElts. + // If bitcast from a vector, see if we can use SimplifyDemandedVectorElts by + // demanding the element if any bits from it are demanded. // TODO - bigendian once we have test coverage. // TODO - bool vectors once SimplifyDemandedVectorElts has SETCC support. if (SrcVT.isVector() && NumSrcEltBits > 1 && @@ -1260,10 +1361,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, for (unsigned i = 0; i != Scale; ++i) { unsigned Offset = i * NumSrcEltBits; APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset); - if (Sub.isAllOnesValue()) + if (!Sub.isNullValue()) DemandedSubElts.setBit(i); - else if (!Sub.isNullValue()) - return false; } return true; }; @@ -1295,8 +1394,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); unsigned DemandedBitsLZ = DemandedBits.countLeadingZeros(); APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ); - if (SimplifyDemandedBits(Op0, LoMask, Known2, TLO, Depth + 1) || - SimplifyDemandedBits(Op1, LoMask, Known2, TLO, Depth + 1) || + if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, Known2, TLO, Depth + 1) || + SimplifyDemandedBits(Op1, LoMask, DemandedElts, Known2, TLO, Depth + 1) || // See if the operation should be performed at a smaller bit width. ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) { SDNodeFlags Flags = Op.getNode()->getFlags(); @@ -1336,14 +1435,14 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } default: if (Op.getOpcode() >= ISD::BUILTIN_OP_END) { - if (SimplifyDemandedBitsForTargetNode(Op, DemandedBits, Known, TLO, - Depth)) + if (SimplifyDemandedBitsForTargetNode(Op, DemandedBits, DemandedElts, + Known, TLO, Depth)) return true; break; } // Just use computeKnownBits to compute output bits. - TLO.DAG.computeKnownBits(Op, Known, Depth); + TLO.DAG.computeKnownBits(Op, Known, DemandedElts, Depth); break; } @@ -1360,7 +1459,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (C->isOpaque()) return false; } - return TLO.CombineTo(Op, TLO.DAG.getConstant(Known.One, dl, VT)); + // TODO: Handle float bits as well. + if (VT.isInteger()) + return TLO.CombineTo(Op, TLO.DAG.getConstant(Known.One, dl, VT)); } return false; @@ -1459,6 +1560,23 @@ bool TargetLowering::SimplifyDemandedVectorElts( TLO, Depth + 1)) return true; + // Try calling SimplifyDemandedBits, converting demanded elts to the bits + // of the large element. + // TODO - bigendian once we have test coverage. + if (TLO.DAG.getDataLayout().isLittleEndian()) { + unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits(); + APInt SrcDemandedBits = APInt::getNullValue(SrcEltSizeInBits); + for (unsigned i = 0; i != NumElts; ++i) + if (DemandedElts[i]) { + unsigned Ofs = (i % Scale) * EltSizeInBits; + SrcDemandedBits.setBits(Ofs, Ofs + EltSizeInBits); + } + + KnownBits Known; + if (SimplifyDemandedBits(Src, SrcDemandedBits, Known, TLO, Depth + 1)) + return true; + } + // If the src element is zero/undef then all the output elements will be - // only demanded elements are guaranteed to be correct. for (unsigned i = 0; i != NumSrcElts; ++i) { @@ -1551,7 +1669,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( EVT SubVT = Sub.getValueType(); unsigned NumSubElts = SubVT.getVectorNumElements(); const APInt& Idx = cast(Op.getOperand(2))->getAPIntValue(); - if (Idx.uge(NumElts - NumSubElts)) + if (Idx.ugt(NumElts - NumSubElts)) break; unsigned SubIdx = Idx.getZExtValue(); APInt SubElts = DemandedElts.extractBits(NumSubElts, SubIdx); @@ -1596,9 +1714,10 @@ bool TargetLowering::SimplifyDemandedVectorElts( unsigned Idx = CIdx->getZExtValue(); if (!DemandedElts[Idx]) return TLO.CombineTo(Op, Vec); - DemandedElts.clearBit(Idx); - if (SimplifyDemandedVectorElts(Vec, DemandedElts, KnownUndef, + APInt DemandedVecElts(DemandedElts); + DemandedVecElts.clearBit(Idx); + if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef, KnownZero, TLO, Depth + 1)) return true; @@ -1718,6 +1837,21 @@ bool TargetLowering::SimplifyDemandedVectorElts( } break; } + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: { + APInt SrcUndef, SrcZero; + SDValue Src = Op.getOperand(0); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts); + if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, + SrcZero, TLO, Depth + 1)) + return true; + KnownZero = SrcZero.zextOrTrunc(NumElts); + KnownUndef = SrcUndef.zextOrTrunc(NumElts); + break; + } + case ISD::OR: + case ISD::XOR: case ISD::ADD: case ISD::SUB: case ISD::FADD: @@ -1736,21 +1870,51 @@ bool TargetLowering::SimplifyDemandedVectorElts( KnownUndef &= SrcUndef; break; } + case ISD::AND: { + APInt SrcUndef, SrcZero; + if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef, + SrcZero, TLO, Depth + 1)) + return true; + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef, + KnownZero, TLO, Depth + 1)) + return true; + + // If either side has a zero element, then the result element is zero, even + // if the other is an UNDEF. + KnownZero |= SrcZero; + KnownUndef &= SrcUndef; + KnownUndef &= ~KnownZero; + break; + } case ISD::TRUNCATE: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef, KnownZero, TLO, Depth + 1)) return true; break; default: { - if (Op.getOpcode() >= ISD::BUILTIN_OP_END) + if (Op.getOpcode() >= ISD::BUILTIN_OP_END) { if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef, KnownZero, TLO, Depth)) return true; + } else { + KnownBits Known; + APInt DemandedBits = APInt::getAllOnesValue(EltSizeInBits); + if (SimplifyDemandedBits(Op, DemandedBits, DemandedEltMask, Known, TLO, + Depth, AssumeSingleUse)) + return true; + } break; } } - assert((KnownUndef & KnownZero) == 0 && "Elements flagged as undef AND zero"); + + // Constant fold all undef cases. + // TODO: Handle zero cases as well. + if (DemandedElts.isSubsetOf(KnownUndef)) + return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); + return false; } @@ -1811,18 +1975,14 @@ bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } bool TargetLowering::SimplifyDemandedBitsForTargetNode( - SDValue Op, const APInt &DemandedBits, KnownBits &Known, - TargetLoweringOpt &TLO, unsigned Depth) const { + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const { assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || Op.getOpcode() == ISD::INTRINSIC_W_CHAIN || Op.getOpcode() == ISD::INTRINSIC_VOID) && "Should use SimplifyDemandedBits if you don't know whether Op" " is a target node!"); - EVT VT = Op.getValueType(); - APInt DemandedElts = VT.isVector() - ? APInt::getAllOnesValue(VT.getVectorNumElements()) - : APInt(1, 1); computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth); return false; } @@ -2963,8 +3123,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, /// Returns true (and the GlobalValue and the offset) if the node is a /// GlobalAddress + offset. -bool TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue *&GA, +bool TargetLowering::isGAPlusOffset(SDNode *WN, const GlobalValue *&GA, int64_t &Offset) const { + + SDNode *N = unwrapAddress(SDValue(WN, 0)).getNode(); + if (auto *GASD = dyn_cast(N)) { GA = GASD->getGlobal(); Offset += GASD->getOffset(); @@ -4028,8 +4191,17 @@ bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, if (!MakeMUL_LOHI(LH, RL, Lo, Hi, false)) return false; - Next = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), Next, - Merge(Lo, Hi)); + SDValue Zero = DAG.getConstant(0, dl, HiLoVT); + EVT BoolType = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + + bool UseGlue = (isOperationLegalOrCustom(ISD::ADDC, VT) && + isOperationLegalOrCustom(ISD::ADDE, VT)); + if (UseGlue) + Next = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), Next, + Merge(Lo, Hi)); + else + Next = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(VT, BoolType), Next, + Merge(Lo, Hi), DAG.getConstant(0, dl, BoolType)); SDValue Carry = Next.getValue(1); Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next)); @@ -4038,9 +4210,13 @@ bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, if (!MakeMUL_LOHI(LH, RH, Lo, Hi, Opcode == ISD::SMUL_LOHI)) return false; - SDValue Zero = DAG.getConstant(0, dl, HiLoVT); - Hi = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(HiLoVT, MVT::Glue), Hi, Zero, - Carry); + if (UseGlue) + Hi = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(HiLoVT, MVT::Glue), Hi, Zero, + Carry); + else + Hi = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(HiLoVT, BoolType), Hi, + Zero, Carry); + Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi)); if (Opcode == ISD::SMUL_LOHI) { @@ -4075,6 +4251,99 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT, return Ok; } +bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result, + SelectionDAG &DAG) const { + EVT VT = Node->getValueType(0); + + if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) || + !isOperationLegalOrCustom(ISD::SRL, VT) || + !isOperationLegalOrCustom(ISD::SUB, VT) || + !isOperationLegalOrCustomOrPromote(ISD::OR, VT))) + return false; + + // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) + // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + SDValue X = Node->getOperand(0); + SDValue Y = Node->getOperand(1); + SDValue Z = Node->getOperand(2); + + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + bool IsFSHL = Node->getOpcode() == ISD::FSHL; + SDLoc DL(SDValue(Node, 0)); + + EVT ShVT = Z.getValueType(); + SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT); + SDValue Zero = DAG.getConstant(0, DL, ShVT); + + SDValue ShAmt; + if (isPowerOf2_32(EltSizeInBits)) { + SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT); + ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask); + } else { + ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC); + } + + SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt); + SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt); + SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt); + SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY); + + // If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth, + // and that is undefined. We must compare and select to avoid UB. + EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT); + + // For fshl, 0-shift returns the 1st arg (X). + // For fshr, 0-shift returns the 2nd arg (Y). + SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ); + Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? X : Y, Or); + return true; +} + +// TODO: Merge with expandFunnelShift. +bool TargetLowering::expandROT(SDNode *Node, SDValue &Result, + SelectionDAG &DAG) const { + EVT VT = Node->getValueType(0); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + bool IsLeft = Node->getOpcode() == ISD::ROTL; + SDValue Op0 = Node->getOperand(0); + SDValue Op1 = Node->getOperand(1); + SDLoc DL(SDValue(Node, 0)); + + EVT ShVT = Op1.getValueType(); + SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT); + + // If a rotate in the other direction is legal, use it. + unsigned RevRot = IsLeft ? ISD::ROTR : ISD::ROTL; + if (isOperationLegal(RevRot, VT)) { + SDValue Sub = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, Op1); + Result = DAG.getNode(RevRot, DL, VT, Op0, Sub); + return true; + } + + if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) || + !isOperationLegalOrCustom(ISD::SRL, VT) || + !isOperationLegalOrCustom(ISD::SUB, VT) || + !isOperationLegalOrCustomOrPromote(ISD::OR, VT) || + !isOperationLegalOrCustomOrPromote(ISD::AND, VT))) + return false; + + // Otherwise, + // (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and w-c, w-1))) + // (rotr x, c) -> (or (srl x, (and c, w-1)), (shl x, (and w-c, w-1))) + // + assert(isPowerOf2_32(EltSizeInBits) && EltSizeInBits > 1 && + "Expecting the type bitwidth to be a power of 2"); + unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL; + unsigned HsOpc = IsLeft ? ISD::SRL : ISD::SHL; + SDValue BitWidthMinusOneC = DAG.getConstant(EltSizeInBits - 1, DL, ShVT); + SDValue NegOp1 = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, Op1); + SDValue And0 = DAG.getNode(ISD::AND, DL, ShVT, Op1, BitWidthMinusOneC); + SDValue And1 = DAG.getNode(ISD::AND, DL, ShVT, NegOp1, BitWidthMinusOneC); + Result = DAG.getNode(ISD::OR, DL, VT, DAG.getNode(ShOpc, DL, VT, Op0, And0), + DAG.getNode(HsOpc, DL, VT, Op0, And1)); + return true; +} + bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, SelectionDAG &DAG) const { SDValue Src = Node->getOperand(0); @@ -4153,24 +4422,51 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result, !isOperationLegalOrCustomOrPromote(ISD::XOR, SrcVT))) return false; - // Expand based on maximum range of FP_TO_SINT: - // True = fp_to_sint(Src) - // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000) - // Result = select (Src < 0x8000000000000000), True, False - APFloat apf(DAG.EVTToAPFloatSemantics(SrcVT), - APInt::getNullValue(SrcVT.getScalarSizeInBits())); - APInt x = APInt::getSignMask(DstVT.getScalarSizeInBits()); - (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven); - - SDValue Tmp1 = DAG.getConstantFP(apf, dl, SrcVT); - SDValue Tmp2 = DAG.getSetCC(dl, SetCCVT, Src, Tmp1, ISD::SETLT); - SDValue True = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src); - // TODO: Should any fast-math-flags be set for the FSUB? - SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, - DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Tmp1)); - False = - DAG.getNode(ISD::XOR, dl, DstVT, False, DAG.getConstant(x, dl, DstVT)); - Result = DAG.getSelect(dl, DstVT, Tmp2, True, False); + // If the maximum float value is smaller then the signed integer range, + // the destination signmask can't be represented by the float, so we can + // just use FP_TO_SINT directly. + const fltSemantics &APFSem = DAG.EVTToAPFloatSemantics(SrcVT); + APFloat APF(APFSem, APInt::getNullValue(SrcVT.getScalarSizeInBits())); + APInt SignMask = APInt::getSignMask(DstVT.getScalarSizeInBits()); + if (APFloat::opOverflow & + APF.convertFromAPInt(SignMask, false, APFloat::rmNearestTiesToEven)) { + Result = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src); + return true; + } + + SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT); + SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT); + + bool Strict = shouldUseStrictFP_TO_INT(SrcVT, DstVT, /*IsSigned*/ false); + if (Strict) { + // Expand based on maximum range of FP_TO_SINT, if the value exceeds the + // signmask then offset (the result of which should be fully representable). + // Sel = Src < 0x8000000000000000 + // Val = select Sel, Src, Src - 0x8000000000000000 + // Ofs = select Sel, 0, 0x8000000000000000 + // Result = fp_to_sint(Val) ^ Ofs + + // TODO: Should any fast-math-flags be set for the FSUB? + SDValue Val = DAG.getSelect(dl, SrcVT, Sel, Src, + DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst)); + SDValue Ofs = DAG.getSelect(dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT), + DAG.getConstant(SignMask, dl, DstVT)); + Result = DAG.getNode(ISD::XOR, dl, DstVT, + DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Val), Ofs); + } else { + // Expand based on maximum range of FP_TO_SINT: + // True = fp_to_sint(Src) + // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000) + // Result = select (Src < 0x8000000000000000), True, False + + SDValue True = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src); + // TODO: Should any fast-math-flags be set for the FSUB? + SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, + DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst)); + False = DAG.getNode(ISD::XOR, dl, DstVT, False, + DAG.getConstant(SignMask, dl, DstVT)); + Result = DAG.getSelect(dl, DstVT, Sel, True, False); + } return true; } @@ -5062,3 +5358,55 @@ SDValue TargetLowering::getExpandedSaturationAdditionSubtraction( return DAG.getSelect(dl, ResultType, Overflow, Result, SumDiff); } } + +SDValue +TargetLowering::getExpandedFixedPointMultiplication(SDNode *Node, + SelectionDAG &DAG) const { + assert(Node->getOpcode() == ISD::SMULFIX && "Expected opcode to be SMULFIX."); + assert(Node->getNumOperands() == 3 && + "Expected signed fixed point multiplication to have 3 operands."); + + SDLoc dl(Node); + SDValue LHS = Node->getOperand(0); + SDValue RHS = Node->getOperand(1); + assert(LHS.getValueType().isScalarInteger() && + "Expected operands to be integers. Vector of int arguments should " + "already be unrolled."); + assert(RHS.getValueType().isScalarInteger() && + "Expected operands to be integers. Vector of int arguments should " + "already be unrolled."); + assert(LHS.getValueType() == RHS.getValueType() && + "Expected both operands to be the same type"); + + unsigned Scale = Node->getConstantOperandVal(2); + EVT VT = LHS.getValueType(); + assert(Scale < VT.getScalarSizeInBits() && + "Expected scale to be less than the number of bits."); + + if (!Scale) + return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); + + // Get the upper and lower bits of the result. + SDValue Lo, Hi; + if (isOperationLegalOrCustom(ISD::SMUL_LOHI, VT)) { + SDValue Result = + DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), LHS, RHS); + Lo = Result.getValue(0); + Hi = Result.getValue(1); + } else if (isOperationLegalOrCustom(ISD::MULHS, VT)) { + Lo = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); + Hi = DAG.getNode(ISD::MULHS, dl, VT, LHS, RHS); + } else { + report_fatal_error("Unable to expand signed fixed point multiplication."); + } + + // The result will need to be shifted right by the scale since both operands + // are scaled. The result is given to us in 2 halves, so we only want part of + // both in the result. + EVT ShiftTy = getShiftAmountTy(VT, DAG.getDataLayout()); + Lo = DAG.getNode(ISD::SRL, dl, VT, Lo, DAG.getConstant(Scale, dl, ShiftTy)); + Hi = DAG.getNode( + ISD::SHL, dl, VT, Hi, + DAG.getConstant(VT.getScalarSizeInBits() - Scale, dl, ShiftTy)); + return DAG.getNode(ISD::OR, dl, VT, Lo, Hi); +} diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index cb12c7ce6e826f3a9735ea7773fde7d45f37cb87..dcf37ca76b204bf51c285194f470328e8244f4d0 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -199,6 +199,18 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) { return false; } +/// Search for the first call to the llvm.stackprotector intrinsic and return it +/// if present. +static const CallInst *findStackProtectorIntrinsic(Function &F) { + for (const BasicBlock &BB : F) + for (const Instruction &I : BB) + if (const CallInst *CI = dyn_cast(&I)) + if (CI->getCalledFunction() == + Intrinsic::getDeclaration(F.getParent(), Intrinsic::stackprotector)) + return CI; + return nullptr; +} + /// Check whether or not this function needs a stack protector based /// upon the stack protector level. /// @@ -215,13 +227,7 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) { bool StackProtector::RequiresStackProtector() { bool Strong = false; bool NeedsProtector = false; - for (const BasicBlock &BB : *F) - for (const Instruction &I : BB) - if (const CallInst *CI = dyn_cast(&I)) - if (CI->getCalledFunction() == - Intrinsic::getDeclaration(F->getParent(), - Intrinsic::stackprotector)) - HasPrologue = true; + HasPrologue = findStackProtectorIntrinsic(*F); if (F->hasFnAttribute(Attribute::SafeStack)) return false; @@ -379,7 +385,8 @@ bool StackProtector::InsertStackProtectors() { // protection in SDAG. bool SupportsSelectionDAGSP = TLI->useStackGuardXorFP() || - (EnableSelectionDAGSP && !TM->Options.EnableFastISel); + (EnableSelectionDAGSP && !TM->Options.EnableFastISel && + !TM->Options.EnableGlobalISel); AllocaInst *AI = nullptr; // Place on stack that stores the stack guard. for (Function::iterator I = F->begin(), E = F->end(); I != E;) { @@ -399,6 +406,14 @@ bool StackProtector::InsertStackProtectors() { if (SupportsSelectionDAGSP) break; + // Find the stack guard slot if the prologue was not created by this pass + // itself via a previous call to CreatePrologue(). + if (!AI) { + const CallInst *SPCall = findStackProtectorIntrinsic(*F); + assert(SPCall && "Call to llvm.stackprotector is missing"); + AI = cast(SPCall->getArgOperand(1)); + } + // Set HasIRCheck to true, so that SelectionDAG will not generate its own // version. SelectionDAG called 'shouldEmitSDCheck' to check whether // instrumentation has already been generated. diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 166ff18e7759efc3ff3df0397b83d3d9431aca41..e8619037564245319d06ff5f430490875ee83c7c 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -610,10 +610,13 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::UMIN, VT, Expand); setOperationAction(ISD::UMAX, VT, Expand); setOperationAction(ISD::ABS, VT, Expand); + setOperationAction(ISD::FSHL, VT, Expand); + setOperationAction(ISD::FSHR, VT, Expand); setOperationAction(ISD::SADDSAT, VT, Expand); setOperationAction(ISD::UADDSAT, VT, Expand); setOperationAction(ISD::SSUBSAT, VT, Expand); setOperationAction(ISD::USUBSAT, VT, Expand); + setOperationAction(ISD::SMULFIX, VT, Expand); // Overflow operations default to expand setOperationAction(ISD::SADDO, VT, Expand); @@ -1451,6 +1454,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case CatchPad: return 0; case CatchSwitch: return 0; case CleanupPad: return 0; + case FNeg: return ISD::FNEG; case Add: return ISD::ADD; case FAdd: return ISD::FADD; case Sub: return ISD::SUB; diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 341ab927861039d114dad9516291094003b2b287..cb2fe691d7021a7ec07907eb6168cdefec40fee4 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -795,6 +795,14 @@ const MCExpr *TargetLoweringObjectFileELF::lowerRelativeReference( MCSymbolRefExpr::create(TM.getSymbol(RHS), getContext()), getContext()); } +MCSection *TargetLoweringObjectFileELF::getSectionForCommandLines() const { + // Use ".GCC.command.line" since this feature is to support clang's + // -frecord-gcc-switches which in turn attempts to mimic GCC's switch of the + // same name. + return getContext().getELFSection(".GCC.command.line", ELF::SHT_PROGBITS, + ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, ""); +} + void TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) { UseInitArray = UseInitArray_; @@ -1100,6 +1108,22 @@ const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel( // .indirect_symbol _extfoo // .long 0 // + // The indirect symbol table (and sections of non_lazy_symbol_pointers type) + // may point to both local (same translation unit) and global (other + // translation units) symbols. Example: + // + // .section __DATA,__pointers,non_lazy_symbol_pointers + // L1: + // .indirect_symbol _myGlobal + // .long 0 + // L2: + // .indirect_symbol _myLocal + // .long _myLocal + // + // If the symbol is local, instead of the symbol's index, the assembler + // places the constant INDIRECT_SYMBOL_LOCAL into the indirect symbol table. + // Then the linker will notice the constant in the table and will look at the + // content of the symbol. MachineModuleInfoMachO &MachOMMI = MMI->getObjFileInfo(); MCContext &Ctx = getContext(); @@ -1119,9 +1143,12 @@ const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel( MCSymbol *Stub = Ctx.getOrCreateSymbol(Name); MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(Stub); - if (!StubSym.getPointer()) - StubSym = MachineModuleInfoImpl:: - StubValueTy(const_cast(Sym), true /* access indirectly */); + if (!StubSym.getPointer()) { + bool IsIndirectLocal = Sym->isDefined() && !Sym->isExternal(); + // With the assumption that IsIndirectLocal == GV->hasLocalLinkage(). + StubSym = MachineModuleInfoImpl::StubValueTy(const_cast(Sym), + !IsIndirectLocal); + } const MCExpr *BSymExpr = MCSymbolRefExpr::create(BaseSym, MCSymbolRefExpr::VK_None, Ctx); @@ -1317,10 +1344,11 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal( MCSymbol *Sym = TM.getSymbol(ComdatGV); StringRef COMDATSymName = Sym->getName(); - // Append "$symbol" to the section name when targetting mingw. The ld.bfd + // Append "$symbol" to the section name *before* IR-level mangling is + // applied when targetting mingw. This is what GCC does, and the ld.bfd // COFF linker will not properly handle comdats otherwise. if (getTargetTriple().isWindowsGNUEnvironment()) - raw_svector_ostream(Name) << '$' << COMDATSymName; + raw_svector_ostream(Name) << '$' << ComdatGV->getName(); return getContext().getCOFFSection(Name, Characteristics, Kind, COMDATSymName, Selection, UniqueID); diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index 9adacd2ed718a7170f7e9b0d822b6fe73732a682..defb165fe0f29adb628e8cae798a197ec16dc8a7 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -137,13 +137,15 @@ static cl::opt PrintMachineInstrs( "print-machineinstrs", cl::ValueOptional, cl::desc("Print machine instrs"), cl::value_desc("pass-name"), cl::init("option-unspecified"), cl::Hidden); -static cl::opt EnableGlobalISelAbort( +static cl::opt EnableGlobalISelAbort( "global-isel-abort", cl::Hidden, cl::desc("Enable abort calls when \"global\" instruction selection " - "fails to lower/select an instruction: 0 disable the abort, " - "1 enable the abort, and " - "2 disable the abort but emit a diagnostic on failure"), - cl::init(1)); + "fails to lower/select an instruction"), + cl::values( + clEnumValN(GlobalISelAbortMode::Disable, "0", "Disable the abort"), + clEnumValN(GlobalISelAbortMode::Enable, "1", "Enable the abort"), + clEnumValN(GlobalISelAbortMode::DisableWithDiag, "2", + "Disable the abort but emit a diagnostic on failure"))); // Temporary option to allow experimenting with MachineScheduler as a post-RA // scheduler. Targets can "properly" enable this with @@ -343,11 +345,39 @@ static AnalysisID getPassIDFromName(StringRef PassName) { return PI ? PI->getTypeInfo() : nullptr; } +static std::pair +getPassNameAndInstanceNum(StringRef PassName) { + StringRef Name, InstanceNumStr; + std::tie(Name, InstanceNumStr) = PassName.split(','); + + unsigned InstanceNum = 0; + if (!InstanceNumStr.empty() && InstanceNumStr.getAsInteger(10, InstanceNum)) + report_fatal_error("invalid pass instance specifier " + PassName); + + return std::make_pair(Name, InstanceNum); +} + void TargetPassConfig::setStartStopPasses() { - StartBefore = getPassIDFromName(StartBeforeOpt); - StartAfter = getPassIDFromName(StartAfterOpt); - StopBefore = getPassIDFromName(StopBeforeOpt); - StopAfter = getPassIDFromName(StopAfterOpt); + StringRef StartBeforeName; + std::tie(StartBeforeName, StartBeforeInstanceNum) = + getPassNameAndInstanceNum(StartBeforeOpt); + + StringRef StartAfterName; + std::tie(StartAfterName, StartAfterInstanceNum) = + getPassNameAndInstanceNum(StartAfterOpt); + + StringRef StopBeforeName; + std::tie(StopBeforeName, StopBeforeInstanceNum) + = getPassNameAndInstanceNum(StopBeforeOpt); + + StringRef StopAfterName; + std::tie(StopAfterName, StopAfterInstanceNum) + = getPassNameAndInstanceNum(StopAfterOpt); + + StartBefore = getPassIDFromName(StartBeforeName); + StartAfter = getPassIDFromName(StartAfterName); + StopBefore = getPassIDFromName(StopBeforeName); + StopAfter = getPassIDFromName(StopAfterName); if (StartBefore && StartAfter) report_fatal_error(Twine(StartBeforeOptName) + Twine(" and ") + Twine(StartAfterOptName) + Twine(" specified!")); @@ -384,6 +414,9 @@ TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm) if (TM.Options.EnableIPRA) setRequiresCodeGenSCCOrder(); + if (EnableGlobalISelAbort.getNumOccurrences()) + TM.Options.GlobalISelAbort = EnableGlobalISelAbort; + setStartStopPasses(); } @@ -488,9 +521,9 @@ void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) { // and shouldn't reference it. AnalysisID PassID = P->getPassID(); - if (StartBefore == PassID) + if (StartBefore == PassID && StartBeforeCount++ == StartBeforeInstanceNum) Started = true; - if (StopBefore == PassID) + if (StopBefore == PassID && StopBeforeCount++ == StopBeforeInstanceNum) Stopped = true; if (Started && !Stopped) { std::string Banner; @@ -513,9 +546,11 @@ void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) { } else { delete P; } - if (StopAfter == PassID) + + if (StopAfter == PassID && StopAfterCount++ == StopAfterInstanceNum) Stopped = true; - if (StartAfter == PassID) + + if (StartAfter == PassID && StartAfterCount++ == StartAfterInstanceNum) Started = true; if (Stopped && !Started) report_fatal_error("Cannot stop compilation after pass that is not run"); @@ -721,8 +756,11 @@ bool TargetPassConfig::addCoreISelPasses() { // Enable FastISel with -fast-isel, but allow that to be overridden. TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE); if (EnableFastISelOption == cl::BOU_TRUE || - (TM->getOptLevel() == CodeGenOpt::None && TM->getO0WantsFastISel())) + (TM->getOptLevel() == CodeGenOpt::None && TM->getO0WantsFastISel() && + !TM->Options.EnableGlobalISel)) { TM->setFastISel(true); + TM->setGlobalISel(false); + } // Ask the target for an instruction selector. // Explicitly enabling fast-isel should override implicitly enabled @@ -730,6 +768,7 @@ bool TargetPassConfig::addCoreISelPasses() { if (EnableGlobalISelOption == cl::BOU_TRUE || (EnableGlobalISelOption == cl::BOU_UNSET && TM->Options.EnableGlobalISel && EnableFastISelOption != cl::BOU_TRUE)) { + TM->setGlobalISel(true); TM->setFastISel(false); SaveAndRestore SavedAddingMachinePasses(AddingMachinePasses, true); @@ -990,7 +1029,8 @@ bool TargetPassConfig::getOptimizeRegAlloc() const { } /// RegisterRegAlloc's global Registry tracks allocator registration. -MachinePassRegistry RegisterRegAlloc::Registry; +MachinePassRegistry + RegisterRegAlloc::Registry; /// A dummy default pass factory indicates whether the register allocator is /// overridden on the command line. @@ -1164,14 +1204,9 @@ void TargetPassConfig::addBlockPlacement() { /// GlobalISel Configuration //===---------------------------------------------------------------------===// bool TargetPassConfig::isGlobalISelAbortEnabled() const { - if (EnableGlobalISelAbort.getNumOccurrences() > 0) - return EnableGlobalISelAbort == 1; - - // When no abort behaviour is specified, we don't abort if the target says - // that GISel is enabled. - return !TM->Options.EnableGlobalISel; + return TM->Options.GlobalISelAbort == GlobalISelAbortMode::Enable; } bool TargetPassConfig::reportDiagnosticWhenGlobalISelFallback() const { - return EnableGlobalISelAbort == 2; + return TM->Options.GlobalISelAbort == GlobalISelAbortMode::DisableWithDiag; } diff --git a/lib/CodeGen/WasmEHPrepare.cpp b/lib/CodeGen/WasmEHPrepare.cpp index 6f02a05f56157e33a7932e16a20b97ec1ac47158..e5002eb95346dbe6228ab2498bfd68e4cb6080a2 100644 --- a/lib/CodeGen/WasmEHPrepare.cpp +++ b/lib/CodeGen/WasmEHPrepare.cpp @@ -137,6 +137,7 @@ class WasmEHPrepare : public FunctionPass { Value *LSDAField = nullptr; // lsda field Value *SelectorField = nullptr; // selector + Function *ThrowF = nullptr; // wasm.throw() intrinsic Function *CatchF = nullptr; // wasm.catch.extract() intrinsic Function *LPadIndexF = nullptr; // wasm.landingpad.index() intrinsic Function *LSDAF = nullptr; // wasm.lsda() intrinsic @@ -145,6 +146,9 @@ class WasmEHPrepare : public FunctionPass { Function *CallPersonalityF = nullptr; // _Unwind_CallPersonality() wrapper Function *ClangCallTermF = nullptr; // __clang_call_terminate() function + bool prepareEHPads(Function &F); + bool prepareThrows(Function &F); + void prepareEHPad(BasicBlock *BB, unsigned Index); void prepareTerminateCleanupPad(BasicBlock *BB); @@ -177,7 +181,62 @@ bool WasmEHPrepare::doInitialization(Module &M) { return false; } +// Erase the specified BBs if the BB does not have any remaining predecessors, +// and also all its dead children. +template +static void eraseDeadBBsAndChildren(const Container &BBs) { + SmallVector WL(BBs.begin(), BBs.end()); + while (!WL.empty()) { + auto *BB = WL.pop_back_val(); + if (pred_begin(BB) != pred_end(BB)) + continue; + WL.append(succ_begin(BB), succ_end(BB)); + DeleteDeadBlock(BB); + } +} + bool WasmEHPrepare::runOnFunction(Function &F) { + bool Changed = false; + Changed |= prepareThrows(F); + Changed |= prepareEHPads(F); + return Changed; +} + +bool WasmEHPrepare::prepareThrows(Function &F) { + Module &M = *F.getParent(); + IRBuilder<> IRB(F.getContext()); + bool Changed = false; + + // wasm.throw() intinsic, which will be lowered to wasm 'throw' instruction. + ThrowF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_throw); + + // Insert an unreachable instruction after a call to @llvm.wasm.throw and + // delete all following instructions within the BB, and delete all the dead + // children of the BB as well. + for (User *U : ThrowF->users()) { + // A call to @llvm.wasm.throw() is only generated from + // __builtin_wasm_throw() builtin call within libcxxabi, and cannot be an + // InvokeInst. + auto *ThrowI = cast(U); + if (ThrowI->getFunction() != &F) + continue; + Changed = true; + auto *BB = ThrowI->getParent(); + SmallVector Succs(succ_begin(BB), succ_end(BB)); + auto &InstList = BB->getInstList(); + InstList.erase(std::next(BasicBlock::iterator(ThrowI)), InstList.end()); + IRB.SetInsertPoint(BB); + IRB.CreateUnreachable(); + eraseDeadBBsAndChildren(Succs); + } + + return Changed; +} + +bool WasmEHPrepare::prepareEHPads(Function &F) { + Module &M = *F.getParent(); + IRBuilder<> IRB(F.getContext()); + SmallVector CatchPads; SmallVector CleanupPads; for (BasicBlock &BB : F) { @@ -194,9 +253,6 @@ bool WasmEHPrepare::runOnFunction(Function &F) { return false; assert(F.hasPersonalityFn() && "Personality function not found"); - Module &M = *F.getParent(); - IRBuilder<> IRB(F.getContext()); - // __wasm_lpad_context global variable LPadContextGV = cast( M.getOrInsertGlobal("__wasm_lpad_context", LPadContextTy)); diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt index e1bdffd30ee799f4ff3e72b6f36aa986627130a8..1610ca469575307549c6dfb3fe1a415d2590a356 100644 --- a/lib/DebugInfo/CodeView/CMakeLists.txt +++ b/lib/DebugInfo/CodeView/CMakeLists.txt @@ -27,8 +27,9 @@ add_llvm_library(LLVMDebugInfoCodeView RecordSerialization.cpp SimpleTypeSerializer.cpp StringsAndChecksums.cpp - SymbolRecordMapping.cpp SymbolDumper.cpp + SymbolRecordHelpers.cpp + SymbolRecordMapping.cpp SymbolSerializer.cpp TypeDumpVisitor.cpp TypeIndex.cpp diff --git a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp index 44a67743169ec9c8eb4adb03d18cd5374f2ac3e9..cbcaa569282818a8e4b3bc5617792b53241d932d 100644 --- a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp @@ -75,7 +75,7 @@ Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols) { Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols, uint32_t InitialOffset) { for (auto I : Symbols) { - if (auto EC = visitSymbolRecord(I, InitialOffset)) + if (auto EC = visitSymbolRecord(I, InitialOffset + Symbols.skew())) return EC; InitialOffset += I.length(); } diff --git a/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp index ca8007411cadcd9061107eb338c1a709b43e9fa3..ddcad8c631d7484c1978824d455d8b9bd8f2618c 100644 --- a/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp +++ b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp @@ -89,6 +89,8 @@ uint32_t LazyRandomTypeCollection::getOffsetOfType(TypeIndex Index) { } CVType LazyRandomTypeCollection::getType(TypeIndex Index) { + assert(!Index.isSimple()); + auto EC = ensureTypeExists(Index); error(std::move(EC)); assert(contains(Index)); @@ -97,6 +99,9 @@ CVType LazyRandomTypeCollection::getType(TypeIndex Index) { } Optional LazyRandomTypeCollection::tryGetType(TypeIndex Index) { + if (Index.isSimple()) + return None; + if (auto EC = ensureTypeExists(Index)) { consumeError(std::move(EC)); return None; @@ -151,6 +156,7 @@ Error LazyRandomTypeCollection::ensureTypeExists(TypeIndex TI) { } void LazyRandomTypeCollection::ensureCapacityFor(TypeIndex Index) { + assert(!Index.isSimple()); uint32_t MinSize = Index.toArrayIndex() + 1; if (MinSize <= capacity()) @@ -163,6 +169,7 @@ void LazyRandomTypeCollection::ensureCapacityFor(TypeIndex Index) { } Error LazyRandomTypeCollection::visitRangeForType(TypeIndex TI) { + assert(!TI.isSimple()); if (PartialOffsets.empty()) return fullScanForType(TI); @@ -217,6 +224,7 @@ Optional LazyRandomTypeCollection::getNext(TypeIndex Prev) { } Error LazyRandomTypeCollection::fullScanForType(TypeIndex TI) { + assert(!TI.isSimple()); assert(PartialOffsets.empty()); TypeIndex CurrentTI = TypeIndex::fromArrayIndex(0); diff --git a/lib/DebugInfo/CodeView/SymbolRecordHelpers.cpp b/lib/DebugInfo/CodeView/SymbolRecordHelpers.cpp new file mode 100644 index 0000000000000000000000000000000000000000..01746138ad1f2df923ff1b1e721f0b140b3352c2 --- /dev/null +++ b/lib/DebugInfo/CodeView/SymbolRecordHelpers.cpp @@ -0,0 +1,94 @@ +//===- SymbolRecordHelpers.cpp ----------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/SymbolRecordHelpers.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" + +using namespace llvm; +using namespace llvm::codeview; + +template RecordT createRecord(const CVSymbol &sym) { + RecordT record(static_cast(sym.kind())); + cantFail(SymbolDeserializer::deserializeAs(sym, record)); + return record; +} + +uint32_t llvm::codeview::getScopeEndOffset(const CVSymbol &Sym) { + assert(symbolOpensScope(Sym.kind())); + switch (Sym.kind()) { + case SymbolKind::S_GPROC32: + case SymbolKind::S_LPROC32: + case SymbolKind::S_GPROC32_ID: + case SymbolKind::S_LPROC32_ID: + case SymbolKind::S_LPROC32_DPC: + case SymbolKind::S_LPROC32_DPC_ID: { + ProcSym Proc = createRecord(Sym); + return Proc.End; + } + case SymbolKind::S_BLOCK32: { + BlockSym Block = createRecord(Sym); + return Block.End; + } + case SymbolKind::S_THUNK32: { + Thunk32Sym Thunk = createRecord(Sym); + return Thunk.End; + } + case SymbolKind::S_INLINESITE: { + InlineSiteSym Site = createRecord(Sym); + return Site.End; + } + default: + assert(false && "Unknown record type"); + return 0; + } +} + +uint32_t +llvm::codeview::getScopeParentOffset(const llvm::codeview::CVSymbol &Sym) { + assert(symbolOpensScope(Sym.kind())); + switch (Sym.kind()) { + case SymbolKind::S_GPROC32: + case SymbolKind::S_LPROC32: + case SymbolKind::S_GPROC32_ID: + case SymbolKind::S_LPROC32_ID: + case SymbolKind::S_LPROC32_DPC: + case SymbolKind::S_LPROC32_DPC_ID: { + ProcSym Proc = createRecord(Sym); + return Proc.Parent; + } + case SymbolKind::S_BLOCK32: { + BlockSym Block = createRecord(Sym); + return Block.Parent; + } + case SymbolKind::S_THUNK32: { + Thunk32Sym Thunk = createRecord(Sym); + return Thunk.Parent; + } + case SymbolKind::S_INLINESITE: { + InlineSiteSym Site = createRecord(Sym); + return Site.Parent; + } + default: + assert(false && "Unknown record type"); + return 0; + } +} + +CVSymbolArray +llvm::codeview::limitSymbolArrayToScope(const CVSymbolArray &Symbols, + uint32_t ScopeBegin) { + CVSymbol Opener = *Symbols.at(ScopeBegin); + assert(symbolOpensScope(Opener.kind())); + uint32_t EndOffset = getScopeEndOffset(Opener); + CVSymbol Closer = *Symbols.at(EndOffset); + EndOffset += Closer.RecordData.size(); + return Symbols.substream(ScopeBegin, EndOffset); +} diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 7c68c9167c9820825ccb5ad05310b59fc7ff540c..f5d3bea43a14799e1ddc3c15e53914dd956667a1 100644 --- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp +++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -361,7 +361,6 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) { Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { printTypeIndex("PointeeType", Ptr.getReferentType()); - W->printHex("PointerAttributes", uint32_t(Ptr.getOptions())); W->printEnum("PtrType", unsigned(Ptr.getPointerKind()), makeArrayRef(PtrKindNames)); W->printEnum("PtrMode", unsigned(Ptr.getMode()), makeArrayRef(PtrModeNames)); @@ -371,6 +370,8 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { W->printNumber("IsVolatile", Ptr.isVolatile()); W->printNumber("IsUnaligned", Ptr.isUnaligned()); W->printNumber("IsRestrict", Ptr.isRestrict()); + W->printNumber("IsThisPtr&", Ptr.isLValueReferenceThisPtr()); + W->printNumber("IsThisPtr&&", Ptr.isRValueReferenceThisPtr()); W->printNumber("SizeOf", Ptr.getSize()); if (Ptr.isPointerToMember()) { diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 7ab54de6bc49e3ad604dc2d5e575381acf1ba403..e6620ee3dd1d16a342b64ba0b2e525fdb6d3a0a4 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -317,7 +317,6 @@ void DWARFContext::dump( raw_ostream &OS, DIDumpOptions DumpOpts, std::array, DIDT_ID_Count> DumpOffsets) { - Optional DumpOffset; uint64_t DumpType = DumpOpts.DumpType; StringRef Extension = sys::path::extension(DObj->getFileName()); @@ -334,13 +333,13 @@ void DWARFContext::dump( bool Explicit = DumpType != DIDT_All && !IsDWO; bool ExplicitDWO = Explicit && IsDWO; auto shouldDump = [&](bool Explicit, const char *Name, unsigned ID, - StringRef Section) { - DumpOffset = DumpOffsets[ID]; + StringRef Section) -> Optional * { unsigned Mask = 1U << ID; bool Should = (DumpType & Mask) && (Explicit || !Section.empty()); - if (Should) - OS << "\n" << Name << " contents:\n"; - return Should; + if (!Should) + return nullptr; + OS << "\n" << Name << " contents:\n"; + return &DumpOffsets[ID]; }; // Dump individual sections. @@ -353,7 +352,7 @@ void DWARFContext::dump( auto dumpDebugInfo = [&](const char *Name, unit_iterator_range Units) { OS << '\n' << Name << " contents:\n"; - if ((DumpOffset = DumpOffsets[DIDT_ID_DebugInfo])) + if (auto DumpOffset = DumpOffsets[DIDT_ID_DebugInfo]) for (const auto &U : Units) U->getDIEForOffset(DumpOffset.getValue()) .dump(OS, 0, DumpOpts.noImplicitRecursion()); @@ -370,9 +369,8 @@ void DWARFContext::dump( auto dumpDebugType = [&](const char *Name, unit_iterator_range Units) { OS << '\n' << Name << " contents:\n"; - DumpOffset = DumpOffsets[DIDT_ID_DebugTypes]; for (const auto &U : Units) - if (DumpOffset) + if (auto DumpOffset = DumpOffsets[DIDT_ID_DebugTypes]) U->getDIEForOffset(*DumpOffset) .dump(OS, 0, DumpOpts.noImplicitRecursion()); else @@ -385,28 +383,30 @@ void DWARFContext::dump( dumpDebugType(".debug_types.dwo", dwo_types_section_units()); } - if (shouldDump(Explicit, ".debug_loc", DIDT_ID_DebugLoc, - DObj->getLocSection().Data)) { - getDebugLoc()->dump(OS, getRegisterInfo(), DumpOffset); + if (const auto *Off = shouldDump(Explicit, ".debug_loc", DIDT_ID_DebugLoc, + DObj->getLocSection().Data)) { + getDebugLoc()->dump(OS, getRegisterInfo(), *Off); } - if (shouldDump(Explicit, ".debug_loclists", DIDT_ID_DebugLoclists, - DObj->getLoclistsSection().Data)) { + if (const auto *Off = + shouldDump(Explicit, ".debug_loclists", DIDT_ID_DebugLoclists, + DObj->getLoclistsSection().Data)) { DWARFDataExtractor Data(*DObj, DObj->getLoclistsSection(), isLittleEndian(), 0); - dumpLoclistsSection(OS, DumpOpts, Data, getRegisterInfo(), DumpOffset); + dumpLoclistsSection(OS, DumpOpts, Data, getRegisterInfo(), *Off); } - if (shouldDump(ExplicitDWO, ".debug_loc.dwo", DIDT_ID_DebugLoc, - DObj->getLocDWOSection().Data)) { - getDebugLocDWO()->dump(OS, 0, getRegisterInfo(), DumpOffset); + if (const auto *Off = + shouldDump(ExplicitDWO, ".debug_loc.dwo", DIDT_ID_DebugLoc, + DObj->getLocDWOSection().Data)) { + getDebugLocDWO()->dump(OS, 0, getRegisterInfo(), *Off); } - if (shouldDump(Explicit, ".debug_frame", DIDT_ID_DebugFrame, - DObj->getDebugFrameSection())) - getDebugFrame()->dump(OS, getRegisterInfo(), DumpOffset); + if (const auto *Off = shouldDump(Explicit, ".debug_frame", DIDT_ID_DebugFrame, + DObj->getDebugFrameSection())) + getDebugFrame()->dump(OS, getRegisterInfo(), *Off); - if (shouldDump(Explicit, ".eh_frame", DIDT_ID_DebugFrame, - DObj->getEHFrameSection())) - getEHFrame()->dump(OS, getRegisterInfo(), DumpOffset); + if (const auto *Off = shouldDump(Explicit, ".eh_frame", DIDT_ID_DebugFrame, + DObj->getEHFrameSection())) + getEHFrame()->dump(OS, getRegisterInfo(), *Off); if (DumpType & DIDT_DebugMacro) { if (Explicit || !getDebugMacro()->empty()) { @@ -425,7 +425,8 @@ void DWARFContext::dump( } auto DumpLineSection = [&](DWARFDebugLine::SectionParser Parser, - DIDumpOptions DumpOpts) { + DIDumpOptions DumpOpts, + Optional DumpOffset) { while (!Parser.done()) { if (DumpOffset && Parser.getOffset() != *DumpOffset) { Parser.skip(dumpWarning); @@ -442,22 +443,23 @@ void DWARFContext::dump( } }; - if (shouldDump(Explicit, ".debug_line", DIDT_ID_DebugLine, - DObj->getLineSection().Data)) { + if (const auto *Off = shouldDump(Explicit, ".debug_line", DIDT_ID_DebugLine, + DObj->getLineSection().Data)) { DWARFDataExtractor LineData(*DObj, DObj->getLineSection(), isLittleEndian(), 0); DWARFDebugLine::SectionParser Parser(LineData, *this, compile_units(), type_units()); - DumpLineSection(Parser, DumpOpts); + DumpLineSection(Parser, DumpOpts, *Off); } - if (shouldDump(ExplicitDWO, ".debug_line.dwo", DIDT_ID_DebugLine, - DObj->getLineDWOSection().Data)) { + if (const auto *Off = + shouldDump(ExplicitDWO, ".debug_line.dwo", DIDT_ID_DebugLine, + DObj->getLineDWOSection().Data)) { DWARFDataExtractor LineData(*DObj, DObj->getLineDWOSection(), isLittleEndian(), 0); DWARFDebugLine::SectionParser Parser(LineData, *this, dwo_compile_units(), dwo_type_units()); - DumpLineSection(Parser, DumpOpts); + DumpLineSection(Parser, DumpOpts, *Off); } if (shouldDump(Explicit, ".debug_cu_index", DIDT_ID_DebugCUIndex, @@ -549,24 +551,24 @@ void DWARFContext::dump( } if (shouldDump(Explicit, ".debug_pubnames", DIDT_ID_DebugPubnames, - DObj->getPubNamesSection())) - DWARFDebugPubTable(DObj->getPubNamesSection(), isLittleEndian(), false) + DObj->getPubNamesSection().Data)) + DWARFDebugPubTable(*DObj, DObj->getPubNamesSection(), isLittleEndian(), false) .dump(OS); if (shouldDump(Explicit, ".debug_pubtypes", DIDT_ID_DebugPubtypes, - DObj->getPubTypesSection())) - DWARFDebugPubTable(DObj->getPubTypesSection(), isLittleEndian(), false) + DObj->getPubTypesSection().Data)) + DWARFDebugPubTable(*DObj, DObj->getPubTypesSection(), isLittleEndian(), false) .dump(OS); if (shouldDump(Explicit, ".debug_gnu_pubnames", DIDT_ID_DebugGnuPubnames, - DObj->getGnuPubNamesSection())) - DWARFDebugPubTable(DObj->getGnuPubNamesSection(), isLittleEndian(), + DObj->getGnuPubNamesSection().Data)) + DWARFDebugPubTable(*DObj, DObj->getGnuPubNamesSection(), isLittleEndian(), true /* GnuStyle */) .dump(OS); if (shouldDump(Explicit, ".debug_gnu_pubtypes", DIDT_ID_DebugGnuPubtypes, - DObj->getGnuPubTypesSection())) - DWARFDebugPubTable(DObj->getGnuPubTypesSection(), isLittleEndian(), + DObj->getGnuPubTypesSection().Data)) + DWARFDebugPubTable(*DObj, DObj->getGnuPubTypesSection(), isLittleEndian(), true /* GnuStyle */) .dump(OS); @@ -765,7 +767,7 @@ const DWARFDebugFrame *DWARFContext::getDebugFrame() { // http://lists.dwarfstd.org/htdig.cgi/dwarf-discuss-dwarfstd.org/2011-December/001173.html DWARFDataExtractor debugFrameData(DObj->getDebugFrameSection(), isLittleEndian(), DObj->getAddressSize()); - DebugFrame.reset(new DWARFDebugFrame(false /* IsEH */)); + DebugFrame.reset(new DWARFDebugFrame(getArch(), false /* IsEH */)); DebugFrame->parse(debugFrameData); return DebugFrame.get(); } @@ -776,7 +778,7 @@ const DWARFDebugFrame *DWARFContext::getEHFrame() { DWARFDataExtractor debugFrameData(DObj->getEHFrameSection(), isLittleEndian(), DObj->getAddressSize()); - DebugFrame.reset(new DWARFDebugFrame(true /* IsEH */)); + DebugFrame.reset(new DWARFDebugFrame(getArch(), true /* IsEH */)); DebugFrame->parse(debugFrameData); return DebugFrame.get(); } @@ -1265,6 +1267,10 @@ class DWARFObjInMemory final : public DWARFObject { DWARFSectionMap AppleNamespacesSection; DWARFSectionMap AppleObjCSection; DWARFSectionMap DebugNamesSection; + DWARFSectionMap PubNamesSection; + DWARFSectionMap PubTypesSection; + DWARFSectionMap GnuPubNamesSection; + DWARFSectionMap GnuPubTypesSection; DWARFSectionMap *mapNameToDWARFSection(StringRef Name) { return StringSwitch(Name) @@ -1281,6 +1287,10 @@ class DWARFObjInMemory final : public DWARFObject { .Case("debug_str_offsets.dwo", &StringOffsetDWOSection) .Case("debug_addr", &AddrSection) .Case("apple_names", &AppleNamesSection) + .Case("debug_pubnames", &PubNamesSection) + .Case("debug_pubtypes", &PubTypesSection) + .Case("debug_gnu_pubnames", &GnuPubNamesSection) + .Case("debug_gnu_pubtypes", &GnuPubTypesSection) .Case("apple_types", &AppleTypesSection) .Case("apple_namespaces", &AppleNamespacesSection) .Case("apple_namespac", &AppleNamespacesSection) @@ -1294,12 +1304,8 @@ class DWARFObjInMemory final : public DWARFObject { StringRef EHFrameSection; StringRef StringSection; StringRef MacinfoSection; - StringRef PubNamesSection; - StringRef PubTypesSection; - StringRef GnuPubNamesSection; StringRef AbbrevDWOSection; StringRef StringDWOSection; - StringRef GnuPubTypesSection; StringRef CUIndexSection; StringRef GdbIndexSection; StringRef TUIndexSection; @@ -1319,10 +1325,6 @@ class DWARFObjInMemory final : public DWARFObject { .Case("eh_frame", &EHFrameSection) .Case("debug_str", &StringSection) .Case("debug_macinfo", &MacinfoSection) - .Case("debug_pubnames", &PubNamesSection) - .Case("debug_pubtypes", &PubTypesSection) - .Case("debug_gnu_pubnames", &GnuPubNamesSection) - .Case("debug_gnu_pubtypes", &GnuPubTypesSection) .Case("debug_abbrev.dwo", &AbbrevDWOSection) .Case("debug_str.dwo", &StringDWOSection) .Case("debug_cu_index", &CUIndexSection) @@ -1598,12 +1600,12 @@ public: return RnglistsSection; } StringRef getMacinfoSection() const override { return MacinfoSection; } - StringRef getPubNamesSection() const override { return PubNamesSection; } - StringRef getPubTypesSection() const override { return PubTypesSection; } - StringRef getGnuPubNamesSection() const override { + const DWARFSection &getPubNamesSection() const override { return PubNamesSection; } + const DWARFSection &getPubTypesSection() const override { return PubTypesSection; } + const DWARFSection &getGnuPubNamesSection() const override { return GnuPubNamesSection; } - StringRef getGnuPubTypesSection() const override { + const DWARFSection &getGnuPubTypesSection() const override { return GnuPubTypesSection; } const DWARFSection &getAppleNamesSection() const override { diff --git a/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp b/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp index 7085ca067ba621d17cbe7ff9e4d14a6c48741d70..22759bfac26c859d3b3b5f329f4685de95bb5dc7 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp @@ -148,7 +148,7 @@ void DWARFDebugAddrTable::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const { HeaderData.Length, HeaderData.Version, HeaderData.AddrSize, HeaderData.SegSize); - static const char *Fmt32 = "0x%8.8" PRIx32; + static const char *Fmt32 = "0x%8.8" PRIx64; static const char *Fmt64 = "0x%16.16" PRIx64; std::string AddrFmt = "\n"; std::string AddrFmtVerbose = " => "; diff --git a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp index d725255fe4c53e9bc40e2dd0e06425860be94af5..6349a6efb8c21ee43398d63dfdc9a1f998b3d455 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp @@ -225,7 +225,7 @@ void CFIProgram::printOperand(raw_ostream &OS, const MCRegisterInfo *MRI, switch (Type) { case OT_Unset: { OS << " Unsupported " << (OperandIdx ? "second" : "first") << " operand to"; - auto OpcodeName = CallFrameString(Opcode); + auto OpcodeName = CallFrameString(Opcode, Arch); if (!OpcodeName.empty()) OS << " " << OpcodeName; else @@ -279,7 +279,7 @@ void CFIProgram::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH, if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK) Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK; OS.indent(2 * IndentLevel); - OS << CallFrameString(Opcode) << ":"; + OS << CallFrameString(Opcode, Arch) << ":"; for (unsigned i = 0; i < Instr.Ops.size(); ++i) printOperand(OS, MRI, IsEH, Instr, i, Instr.Ops[i]); OS << '\n'; @@ -325,8 +325,9 @@ void FDE::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH) const { OS << "\n"; } -DWARFDebugFrame::DWARFDebugFrame(bool IsEH, uint64_t EHFrameAddress) - : IsEH(IsEH), EHFrameAddress(EHFrameAddress) {} +DWARFDebugFrame::DWARFDebugFrame(Triple::ArchType Arch, + bool IsEH, uint64_t EHFrameAddress) + : Arch(Arch), IsEH(IsEH), EHFrameAddress(EHFrameAddress) {} DWARFDebugFrame::~DWARFDebugFrame() = default; @@ -396,7 +397,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { uint8_t SegmentDescriptorSize = Version < 4 ? 0 : Data.getU8(&Offset); uint64_t CodeAlignmentFactor = Data.getULEB128(&Offset); int64_t DataAlignmentFactor = Data.getSLEB128(&Offset); - uint64_t ReturnAddressRegister = Data.getULEB128(&Offset); + uint64_t ReturnAddressRegister = + Version == 1 ? Data.getU8(&Offset) : Data.getULEB128(&Offset); // Parse the augmentation data for EH CIEs StringRef AugmentationData(""); @@ -460,7 +462,7 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { StartOffset, Length, Version, AugmentationString, AddressSize, SegmentDescriptorSize, CodeAlignmentFactor, DataAlignmentFactor, ReturnAddressRegister, AugmentationData, FDEPointerEncoding, - LSDAPointerEncoding, Personality, PersonalityEncoding); + LSDAPointerEncoding, Personality, PersonalityEncoding, Arch); CIEs[StartOffset] = Cie.get(); Entries.emplace_back(std::move(Cie)); } else { @@ -512,7 +514,7 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { Entries.emplace_back(new FDE(StartOffset, Length, CIEPointer, InitialLocation, AddressRange, - Cie, LSDAAddress)); + Cie, LSDAAddress, Arch)); } if (Error E = diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp index 9146b457a5ddae1bfdae098e337b3053b094fa93..f8b5ff6ec8fb60344cc9d07142fdb361b9587e32 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp @@ -124,7 +124,7 @@ DWARFDebugLoc::parseOneLocationList(DWARFDataExtractor Data, unsigned *Offset) { StringRef str = Data.getData().substr(*Offset, Bytes); *Offset += Bytes; E.Loc.reserve(str.size()); - std::copy(str.begin(), str.end(), std::back_inserter(E.Loc)); + llvm::copy(str, std::back_inserter(E.Loc)); LL.Entries.push_back(std::move(E)); } } @@ -189,7 +189,7 @@ DWARFDebugLoclists::parseOneLocationList(DataExtractor Data, unsigned *Offset, StringRef str = Data.getData().substr(*Offset, Bytes); *Offset += Bytes; E.Loc.resize(str.size()); - std::copy(str.begin(), str.end(), E.Loc.begin()); + llvm::copy(str, E.Loc.begin()); } LL.Entries.push_back(std::move(E)); @@ -235,14 +235,14 @@ void DWARFDebugLoclists::LocationList::dump(raw_ostream &OS, uint64_t BaseAddr, case dwarf::DW_LLE_start_length: OS << '\n'; OS.indent(Indent); - OS << format("[0x%*.*" PRIx64 ", 0x%*.*x): ", AddressSize * 2, + OS << format("[0x%*.*" PRIx64 ", 0x%*.*" PRIx64 "): ", AddressSize * 2, AddressSize * 2, E.Value0, AddressSize * 2, AddressSize * 2, E.Value0 + E.Value1); break; case dwarf::DW_LLE_offset_pair: OS << '\n'; OS.indent(Indent); - OS << format("[0x%*.*" PRIx64 ", 0x%*.*x): ", AddressSize * 2, + OS << format("[0x%*.*" PRIx64 ", 0x%*.*" PRIx64 "): ", AddressSize * 2, AddressSize * 2, BaseAddr + E.Value0, AddressSize * 2, AddressSize * 2, BaseAddr + E.Value1); break; diff --git a/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp b/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp index de8b6e543fab20fa150226c819ca433e4f0daf80..abd1ad59a9c18d4d1e9f22ae7f38fe275279d12c 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h" +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/Support/DataExtractor.h" @@ -18,10 +19,11 @@ using namespace llvm; using namespace dwarf; -DWARFDebugPubTable::DWARFDebugPubTable(StringRef Data, bool LittleEndian, - bool GnuStyle) +DWARFDebugPubTable::DWARFDebugPubTable(const DWARFObject &Obj, + const DWARFSection &Sec, + bool LittleEndian, bool GnuStyle) : GnuStyle(GnuStyle) { - DataExtractor PubNames(Data, LittleEndian, 0); + DWARFDataExtractor PubNames(Obj, Sec, LittleEndian, 0); uint32_t Offset = 0; while (PubNames.isValidOffset(Offset)) { Sets.push_back({}); @@ -29,10 +31,10 @@ DWARFDebugPubTable::DWARFDebugPubTable(StringRef Data, bool LittleEndian, SetData.Length = PubNames.getU32(&Offset); SetData.Version = PubNames.getU16(&Offset); - SetData.Offset = PubNames.getU32(&Offset); + SetData.Offset = PubNames.getRelocatedValue(4, &Offset); SetData.Size = PubNames.getU32(&Offset); - while (Offset < Data.size()) { + while (Offset < Sec.Data.size()) { uint32_t DieRef = PubNames.getU32(&Offset); if (DieRef == 0) break; diff --git a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp index cb5fb0d49dabaea6de5b58f15ec970a7ed47feae..60c6eb30857fa303e1b62e382f6868d39b7c394f 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp @@ -123,7 +123,7 @@ DWARFDebugRnglist::getAbsoluteRanges(llvm::Optional BaseAddr, if (RLE.EntryKind == dwarf::DW_RLE_base_addressx) { BaseAddr = U.getAddrOffsetSectionItem(RLE.Value0); if (!BaseAddr) - BaseAddr = {RLE.Value0, 0}; + BaseAddr = {RLE.Value0, -1ULL}; continue; } if (RLE.EntryKind == dwarf::DW_RLE_base_address) { @@ -156,7 +156,7 @@ DWARFDebugRnglist::getAbsoluteRanges(llvm::Optional BaseAddr, case dwarf::DW_RLE_startx_length: { auto Start = U.getAddrOffsetSectionItem(RLE.Value0); if (!Start) - Start = {0, 0}; + Start = {0, -1ULL}; E.SectionIndex = Start->SectionIndex; E.LowPC = Start->Address; E.HighPC = E.LowPC + RLE.Value1; diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index 31c4cd5e472a5efefae5ccf4b8918eb52cdbfafc..551e292fb0323d35ecd11296a01bbe837edf048b 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -155,9 +155,7 @@ static void dumpTypeTagName(raw_ostream &OS, dwarf::Tag T) { } /// Recursively dump the DIE type name when applicable. -static void dumpTypeName(raw_ostream &OS, const DWARFDie &Die) { - DWARFDie D = Die.getAttributeValueAsReferencedDie(DW_AT_type); - +static void dumpTypeName(raw_ostream &OS, const DWARFDie &D) { if (!D.isValid()) return; @@ -175,22 +173,63 @@ static void dumpTypeName(raw_ostream &OS, const DWARFDie &Die) { case DW_TAG_ptr_to_member_type: case DW_TAG_reference_type: case DW_TAG_rvalue_reference_type: + case DW_TAG_subroutine_type: break; default: dumpTypeTagName(OS, T); } // Follow the DW_AT_type if possible. - dumpTypeName(OS, D); + DWARFDie TypeDie = D.getAttributeValueAsReferencedDie(DW_AT_type); + dumpTypeName(OS, TypeDie); switch (T) { - case DW_TAG_array_type: - OS << "[]"; + case DW_TAG_subroutine_type: { + if (!TypeDie) + OS << "void"; + OS << '('; + bool First = true; + for (const DWARFDie &C : D.children()) { + if (C.getTag() == DW_TAG_formal_parameter) { + if (!First) + OS << ", "; + First = false; + dumpTypeName(OS, C.getAttributeValueAsReferencedDie(DW_AT_type)); + } + } + OS << ')'; + break; + } + case DW_TAG_array_type: { + Optional Bound; + for (const DWARFDie &C : D.children()) + if (C.getTag() == DW_TAG_subrange_type) { + OS << '['; + uint64_t LowerBound = 0; + if (Optional L = C.find(DW_AT_lower_bound)) + if (Optional LB = L->getAsUnsignedConstant()) { + LowerBound = *LB; + OS << LowerBound << '-'; + } + if (Optional CountV = C.find(DW_AT_count)) { + if (Optional C = CountV->getAsUnsignedConstant()) + OS << (*C + LowerBound); + } else if (Optional UpperV = C.find(DW_AT_upper_bound)) + if (Optional U = UpperV->getAsUnsignedConstant()) + OS << *U; + OS << ']'; + } break; + } case DW_TAG_pointer_type: OS << '*'; break; case DW_TAG_ptr_to_member_type: + if (DWARFDie Cont = + D.getAttributeValueAsReferencedDie(DW_AT_containing_type)) { + dumpTypeName(OS << ' ', Cont); + OS << "::"; + } OS << '*'; break; case DW_TAG_reference_type: @@ -270,12 +309,13 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, // having both the raw value and the pretty-printed value is // interesting. These attributes are handled below. if (Attr == DW_AT_specification || Attr == DW_AT_abstract_origin) { - if (const char *Name = Die.getAttributeValueAsReferencedDie(Attr).getName( - DINameKind::LinkageName)) + if (const char *Name = + Die.getAttributeValueAsReferencedDie(formValue).getName( + DINameKind::LinkageName)) OS << Space << "\"" << Name << '\"'; } else if (Attr == DW_AT_type) { OS << Space << "\""; - dumpTypeName(OS, Die); + dumpTypeName(OS, Die.getAttributeValueAsReferencedDie(formValue)); OS << '"'; } else if (Attr == DW_AT_APPLE_property_attribute) { if (Optional OptVal = formValue.getAsUnsignedConstant()) @@ -371,7 +411,14 @@ DWARFDie::findRecursively(ArrayRef Attrs) const { DWARFDie DWARFDie::getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const { - if (auto SpecRef = toReference(find(Attr))) { + if (Optional F = find(Attr)) + return getAttributeValueAsReferencedDie(*F); + return DWARFDie(); +} + +DWARFDie +DWARFDie::getAttributeValueAsReferencedDie(const DWARFFormValue &V) const { + if (auto SpecRef = toReference(V)) { if (auto SpecUnit = U->getUnitVector().getUnitForOffset(*SpecRef)) return SpecUnit->getDIEForOffset(*SpecRef); } diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index 1caaa249bef94f7c4174b71f982dfc8942d8c006..48900e4b7a28947f85d42488029d0b97ad7b8abb 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -66,9 +66,11 @@ void DWARFUnitVector::addUnitsImpl( DWARFDataExtractor Data(Obj, Section, LE, 0); // Lazy initialization of Parser, now that we have all section info. if (!Parser) { - Parser = [=, &Context, &Obj, &Section, &SOS, &LS]( - uint32_t Offset, DWARFSectionKind SectionKind, - const DWARFSection *CurSection) -> std::unique_ptr { + Parser = [=, &Context, &Obj, &Section, &SOS, + &LS](uint32_t Offset, DWARFSectionKind SectionKind, + const DWARFSection *CurSection, + const DWARFUnitIndex::Entry *IndexEntry) + -> std::unique_ptr { const DWARFSection &InfoSection = CurSection ? *CurSection : Section; DWARFDataExtractor Data(Obj, InfoSection, LE, 0); if (!Data.isValidOffset(Offset)) @@ -77,7 +79,8 @@ void DWARFUnitVector::addUnitsImpl( if (IsDWO) Index = &getDWARFUnitIndex(Context, SectionKind); DWARFUnitHeader Header; - if (!Header.extract(Context, Data, &Offset, SectionKind, Index)) + if (!Header.extract(Context, Data, &Offset, SectionKind, Index, + IndexEntry)) return nullptr; std::unique_ptr U; if (Header.isTypeUnit()) @@ -106,7 +109,7 @@ void DWARFUnitVector::addUnitsImpl( ++I; continue; } - auto U = Parser(Offset, SectionKind, &Section); + auto U = Parser(Offset, SectionKind, &Section, nullptr); // If parsing failed, we're done with this section. if (!U) break; @@ -156,7 +159,7 @@ DWARFUnitVector::getUnitForIndexEntry(const DWARFUnitIndex::Entry &E) { if (!Parser) return nullptr; - auto U = Parser(Offset, DW_SECT_INFO, nullptr); + auto U = Parser(Offset, DW_SECT_INFO, nullptr, &E); if (!U) U = nullptr; @@ -233,9 +236,12 @@ bool DWARFUnitHeader::extract(DWARFContext &Context, const DWARFDataExtractor &debug_info, uint32_t *offset_ptr, DWARFSectionKind SectionKind, - const DWARFUnitIndex *Index) { + const DWARFUnitIndex *Index, + const DWARFUnitIndex::Entry *Entry) { Offset = *offset_ptr; - IndexEntry = Index ? Index->getFromOffset(*offset_ptr) : nullptr; + IndexEntry = Entry; + if (!IndexEntry && Index) + IndexEntry = Index->getFromOffset(*offset_ptr); Length = debug_info.getU32(offset_ptr); // FIXME: Support DWARF64. unsigned SizeOfLength = 4; diff --git a/lib/DebugInfo/PDB/GenericError.cpp b/lib/DebugInfo/PDB/GenericError.cpp index 5f5ff69fe3f829eb3d41b9b09c7ce23b395ffd2a..256952073e88a0fc6dee89a680f4c95ff5e3a2e5 100644 --- a/lib/DebugInfo/PDB/GenericError.cpp +++ b/lib/DebugInfo/PDB/GenericError.cpp @@ -34,6 +34,8 @@ public: return "The PDB file path is an invalid UTF8 sequence."; case pdb_error_code::signature_out_of_date: return "The signature does not match; the file(s) might be out of date."; + case pdb_error_code::external_cmdline_ref: + return "The path to this file must be provided on the command-line."; } llvm_unreachable("Unrecognized generic_error_code"); } diff --git a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp index b97f1e90bcf83ead6e6d58f1e43e5b78fb940e9d..ab93efc839a9a101ff68c0595228466a64601ac0 100644 --- a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp @@ -19,7 +19,6 @@ #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" -#include "llvm/Support/BinaryItemStream.h" #include "llvm/Support/BinaryStreamWriter.h" using namespace llvm; @@ -66,12 +65,22 @@ void DbiModuleDescriptorBuilder::setFirstSectionContrib( } void DbiModuleDescriptorBuilder::addSymbol(CVSymbol Symbol) { - Symbols.push_back(Symbol); - // Symbols written to a PDB file are required to be 4 byte aligned. The same + // Defer to the bulk API. It does the same thing. + addSymbolsInBulk(Symbol.data()); +} + +void DbiModuleDescriptorBuilder::addSymbolsInBulk( + ArrayRef BulkSymbols) { + // Do nothing for empty runs of symbols. + if (BulkSymbols.empty()) + return; + + Symbols.push_back(BulkSymbols); + // Symbols written to a PDB file are required to be 4 byte aligned. The same // is not true of object files. - assert(Symbol.length() % alignOf(CodeViewContainer::Pdb) == 0 && + assert(BulkSymbols.size() % alignOf(CodeViewContainer::Pdb) == 0 && "Invalid Symbol alignment!"); - SymbolByteSize += Symbol.length(); + SymbolByteSize += BulkSymbols.size(); } void DbiModuleDescriptorBuilder::addSourceFile(StringRef Path) { @@ -145,16 +154,13 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter, if (auto EC = SymbolWriter.writeInteger(COFF::DEBUG_SECTION_MAGIC)) return EC; - BinaryItemStream Records(llvm::support::endianness::little); - Records.setItems(Symbols); - BinaryStreamRef RecordsRef(Records); - if (auto EC = SymbolWriter.writeStreamRef(RecordsRef)) - return EC; - if (auto EC = SymbolWriter.padToAlignment(4)) - return EC; - // TODO: Write C11 Line data + for (ArrayRef Syms : Symbols) { + if (auto EC = SymbolWriter.writeBytes(Syms)) + return EC; + } assert(SymbolWriter.getOffset() % alignOf(CodeViewContainer::Pdb) == 0 && "Invalid debug section alignment!"); + // TODO: Write C11 Line data for (const auto &Builder : C13Builders) { assert(Builder && "Empty C13 Fragment Builder!"); if (auto EC = Builder->commit(SymbolWriter)) diff --git a/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp index b40ae8e9007030c1cee02ed740a27738c1d70ffc..57da7003da2b82cb486bb000a22fca72575ec7fa 100644 --- a/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp @@ -9,6 +9,7 @@ #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/DebugInfo/CodeView/RecordName.h" #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" @@ -20,6 +21,7 @@ #include "llvm/DebugInfo/PDB/Native/Hash.h" #include "llvm/Support/BinaryItemStream.h" #include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/xxhash.h" #include #include @@ -29,8 +31,27 @@ using namespace llvm::pdb; using namespace llvm::codeview; struct llvm::pdb::GSIHashStreamBuilder { + struct UdtDenseMapInfo { + static inline CVSymbol getEmptyKey() { + static CVSymbol Empty; + return Empty; + } + static inline CVSymbol getTombstoneKey() { + static CVSymbol Tombstone(static_cast(-1), + ArrayRef()); + return Tombstone; + } + static unsigned getHashValue(const CVSymbol &Val) { + return xxHash64(Val.RecordData); + } + static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) { + return LHS.RecordData == RHS.RecordData; + } + }; + std::vector Records; uint32_t StreamIndex; + llvm::DenseSet UdtHashes; std::vector HashRecords; std::array HashBitmap; std::vector HashBuckets; @@ -42,10 +63,18 @@ struct llvm::pdb::GSIHashStreamBuilder { template void addSymbol(const T &Symbol, MSFBuilder &Msf) { T Copy(Symbol); - Records.push_back(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(), - CodeViewContainer::Pdb)); + addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(), + CodeViewContainer::Pdb)); + } + void addSymbol(const CVSymbol &Symbol) { + if (Symbol.kind() == S_UDT) { + auto Iter = UdtHashes.insert(Symbol); + if (!Iter.second) + return; + } + + Records.push_back(Symbol); } - void addSymbol(const CVSymbol &Symbol) { Records.push_back(Symbol); } }; uint32_t GSIHashStreamBuilder::calculateSerializedLength() const { @@ -272,10 +301,6 @@ void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) { GSH->addSymbol(Sym, Msf); } -void GSIStreamBuilder::addGlobalSymbol(const UDTSym &Sym) { - GSH->addSymbol(Sym, Msf); -} - void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Sym) { GSH->addSymbol(Sym); } diff --git a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp index 6464b85982eca60ab9f3de9a8f2356bca4dd551c..8c97f4a012f03d61fde19ae78fce6deab7006946 100644 --- a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp +++ b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp @@ -11,7 +11,9 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" +#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/DebugInfo/CodeView/SymbolRecordHelpers.h" #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/Support/BinaryStreamReader.h" @@ -47,7 +49,8 @@ Error ModuleDebugStreamRef::reload() { if (auto EC = Reader.readInteger(Signature)) return EC; - if (auto EC = Reader.readSubstream(SymbolsSubstream, SymbolSize - 4)) + Reader.setOffset(0); + if (auto EC = Reader.readSubstream(SymbolsSubstream, SymbolSize)) return EC; if (auto EC = Reader.readSubstream(C11LinesSubstream, C11Size)) return EC; @@ -55,8 +58,8 @@ Error ModuleDebugStreamRef::reload() { return EC; BinaryStreamReader SymbolReader(SymbolsSubstream.StreamData); - if (auto EC = - SymbolReader.readArray(SymbolArray, SymbolReader.bytesRemaining())) + if (auto EC = SymbolReader.readArray( + SymbolArray, SymbolReader.bytesRemaining(), sizeof(uint32_t))) return EC; BinaryStreamReader SubsectionsReader(C13LinesSubstream.StreamData); @@ -76,6 +79,11 @@ Error ModuleDebugStreamRef::reload() { return Error::success(); } +const codeview::CVSymbolArray +ModuleDebugStreamRef::getSymbolArrayForScope(uint32_t ScopeBegin) const { + return limitSymbolArrayToScope(SymbolArray, ScopeBegin); +} + BinarySubstreamRef ModuleDebugStreamRef::getSymbolsSubstream() const { return SymbolsSubstream; } @@ -98,9 +106,7 @@ ModuleDebugStreamRef::symbols(bool *HadError) const { } CVSymbol ModuleDebugStreamRef::readSymbolAtOffset(uint32_t Offset) const { - // Offsets include the size of the 4-byte magic at the beginning, but lookup - // doesn't take that into account, so subtract it here. - auto Iter = SymbolArray.at(Offset - 4); + auto Iter = SymbolArray.at(Offset); assert(Iter != SymbolArray.end()); return *Iter; } diff --git a/lib/DebugInfo/PDB/Native/SymbolCache.cpp b/lib/DebugInfo/PDB/Native/SymbolCache.cpp index 7485341a2064962f2f7c479273c68ad70df28288..5cdd628312fe5c7313c7062e2f8ef2efff3f63b2 100644 --- a/lib/DebugInfo/PDB/Native/SymbolCache.cpp +++ b/lib/DebugInfo/PDB/Native/SymbolCache.cpp @@ -68,9 +68,6 @@ SymbolCache::SymbolCache(NativeSession &Session, DbiStream *Dbi) if (Dbi) Compilands.resize(Dbi->modules().getModuleCount()); - - auto &Tpi = cantFail(Session.getPDBFile().getPDBTpiStream()); - Tpi.buildHashMap(); } std::unique_ptr diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp index 44781705bfaed221953a93df9652182e7a17ed18..f234d446e6a099246e159fb0590c007aaffe3ea2 100644 --- a/lib/DebugInfo/PDB/Native/TpiStream.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp @@ -160,6 +160,9 @@ void TpiStream::buildHashMap() { } std::vector TpiStream::findRecordsByName(StringRef Name) const { + if (!supportsTypeLookup()) + const_cast(this)->buildHashMap(); + uint32_t Bucket = hashStringV1(Name) % Header->NumHashBuckets; if (Bucket > HashMap.size()) return {}; @@ -177,6 +180,9 @@ bool TpiStream::supportsTypeLookup() const { return !HashMap.empty(); } Expected TpiStream::findFullDeclForForwardRef(TypeIndex ForwardRefTI) const { + if (!supportsTypeLookup()) + const_cast(this)->buildHashMap(); + CVType F = Types->getType(ForwardRefTI); if (!isUdtForwardRef(F)) return ForwardRefTI; @@ -215,6 +221,7 @@ TpiStream::findFullDeclForForwardRef(TypeIndex ForwardRefTI) const { } codeview::CVType TpiStream::getType(codeview::TypeIndex Index) { + assert(!Index.isSimple()); return Types->getType(Index); } diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp index b6b11dbddf26b5d841331e6a6d9d750318da47ea..b2de0be2b70c5f2a5e418bd5502f0291d94070e3 100644 --- a/lib/Demangle/ItaniumDemangle.cpp +++ b/lib/Demangle/ItaniumDemangle.cpp @@ -340,7 +340,7 @@ char *llvm::itaniumDemangle(const char *MangledName, char *Buf, if (AST == nullptr) InternalStatus = demangle_invalid_mangled_name; - else if (initializeOutputStream(Buf, N, S, 1024)) + else if (!initializeOutputStream(Buf, N, S, 1024)) InternalStatus = demangle_memory_alloc_failure; else { assert(Parser.ForwardTemplateRefs.empty()); @@ -356,15 +356,6 @@ char *llvm::itaniumDemangle(const char *MangledName, char *Buf, return InternalStatus == demangle_success ? Buf : nullptr; } -bool llvm::itaniumFindTypesInMangledName(const char *MangledName, void *Ctx, - void (*Callback)(void *, - const char *)) { - Demangler Parser(MangledName, MangledName + std::strlen(MangledName)); - Parser.TypeCallback = Callback; - Parser.TypeCallbackContext = Ctx; - return Parser.parse() == nullptr; -} - ItaniumPartialDemangler::ItaniumPartialDemangler() : RootNode(nullptr), Context(new Demangler{nullptr, nullptr}) {} @@ -396,7 +387,7 @@ bool ItaniumPartialDemangler::partialDemangle(const char *MangledName) { static char *printNode(const Node *RootNode, char *Buf, size_t *N) { OutputStream S; - if (initializeOutputStream(Buf, N, S, 128)) + if (!initializeOutputStream(Buf, N, S, 128)) return nullptr; RootNode->print(S); S += '\0'; @@ -441,7 +432,7 @@ char *ItaniumPartialDemangler::getFunctionDeclContextName(char *Buf, const Node *Name = static_cast(RootNode)->getName(); OutputStream S; - if (initializeOutputStream(Buf, N, S, 128)) + if (!initializeOutputStream(Buf, N, S, 128)) return nullptr; KeepGoingLocalFunction: @@ -494,7 +485,7 @@ char *ItaniumPartialDemangler::getFunctionParameters(char *Buf, NodeArray Params = static_cast(RootNode)->getParams(); OutputStream S; - if (initializeOutputStream(Buf, N, S, 128)) + if (!initializeOutputStream(Buf, N, S, 128)) return nullptr; S += '('; @@ -512,7 +503,7 @@ char *ItaniumPartialDemangler::getFunctionReturnType( return nullptr; OutputStream S; - if (initializeOutputStream(Buf, N, S, 128)) + if (!initializeOutputStream(Buf, N, S, 128)) return nullptr; if (const Node *Ret = diff --git a/lib/Demangle/MicrosoftDemangle.cpp b/lib/Demangle/MicrosoftDemangle.cpp index 882e4a578455dbcac5189167b0839d522d3ba43f..cca7bf95a7e4afdb9115fe52334b874461ac3d70 100644 --- a/lib/Demangle/MicrosoftDemangle.cpp +++ b/lib/Demangle/MicrosoftDemangle.cpp @@ -40,7 +40,8 @@ struct NodeList { NodeList *Next = nullptr; }; -static bool isMemberPointer(StringView MangledName) { +static bool isMemberPointer(StringView MangledName, bool &Error) { + Error = false; switch (MangledName.popFront()) { case '$': // This is probably an rvalue reference (e.g. $$Q), and you cannot have an @@ -58,7 +59,8 @@ static bool isMemberPointer(StringView MangledName) { // what. break; default: - assert(false && "Ty is not a pointer type!"); + Error = true; + return false; } // If it starts with a number, then 6 indicates a non-member function @@ -89,9 +91,9 @@ static bool isMemberPointer(StringView MangledName) { case 'T': return true; default: - assert(false); + Error = true; + return false; } - return false; } static SpecialIntrinsicKind @@ -874,7 +876,7 @@ void Demangler::memorizeIdentifier(IdentifierNode *Identifier) { // Render this class template name into a string buffer so that we can // memorize it for the purpose of back-referencing. OutputStream OS; - if (initializeOutputStream(nullptr, nullptr, OS, 1024)) + if (!initializeOutputStream(nullptr, nullptr, OS, 1024)) // FIXME: Propagate out-of-memory as an error? std::terminate(); Identifier->output(OS, OF_Default); @@ -1207,7 +1209,7 @@ Demangler::demangleStringLiteral(StringView &MangledName) { if (MangledName.empty()) goto StringLiteralError; - if (initializeOutputStream(nullptr, nullptr, OS, 1024)) + if (!initializeOutputStream(nullptr, nullptr, OS, 1024)) // FIXME: Propagate out-of-memory as an error? std::terminate(); if (IsWcharT) { @@ -1330,7 +1332,7 @@ Demangler::demangleLocallyScopedNamePiece(StringView &MangledName) { // Render the parent symbol's name into a buffer. OutputStream OS; - if (initializeOutputStream(nullptr, nullptr, OS, 1024)) + if (!initializeOutputStream(nullptr, nullptr, OS, 1024)) // FIXME: Propagate out-of-memory as an error? std::terminate(); OS << '`'; @@ -1651,10 +1653,12 @@ TypeNode *Demangler::demangleType(StringView &MangledName, if (isTagType(MangledName)) Ty = demangleClassType(MangledName); else if (isPointerType(MangledName)) { - if (isMemberPointer(MangledName)) + if (isMemberPointer(MangledName, Error)) Ty = demangleMemberPointerType(MangledName); - else + else if (!Error) Ty = demanglePointerType(MangledName); + else + return nullptr; } else if (isArrayType(MangledName)) Ty = demangleArrayType(MangledName); else if (isFunctionType(MangledName)) { @@ -1669,10 +1673,10 @@ TypeNode *Demangler::demangleType(StringView &MangledName, Ty = demangleCustomType(MangledName); } else { Ty = demanglePrimitiveType(MangledName); - if (!Ty || Error) - return Ty; } + if (!Ty || Error) + return Ty; Ty->Quals = Qualifiers(Ty->Quals | Quals); return Ty; } @@ -1988,6 +1992,8 @@ Demangler::demangleFunctionParameterList(StringView &MangledName) { *Current = Arena.alloc(); TypeNode *TN = demangleType(MangledName, QualifierMangleMode::Drop); + if (!TN || Error) + return nullptr; (*Current)->N = TN; @@ -2156,7 +2162,7 @@ void Demangler::dumpBackReferences() { // Create an output stream so we can render each type. OutputStream OS; - if (initializeOutputStream(nullptr, nullptr, OS, 1024)) + if (!initializeOutputStream(nullptr, nullptr, OS, 1024)) std::terminate(); for (size_t I = 0; I < Backrefs.FunctionParamCount; ++I) { OS.setCurrentPosition(0); @@ -2194,7 +2200,7 @@ char *llvm::microsoftDemangle(const char *MangledName, char *Buf, size_t *N, if (D.Error) InternalStatus = demangle_invalid_mangled_name; - else if (initializeOutputStream(Buf, N, S, 1024)) + else if (!initializeOutputStream(Buf, N, S, 1024)) InternalStatus = demangle_memory_alloc_failure; else { AST->output(S, OF_Default); diff --git a/lib/Demangle/MicrosoftDemangleNodes.cpp b/lib/Demangle/MicrosoftDemangleNodes.cpp index af893b9b68e1ce1bb56e54d4b10b79eedd250d08..a588dca5b7ba338bdf5498965099833b47d886c0 100644 --- a/lib/Demangle/MicrosoftDemangleNodes.cpp +++ b/lib/Demangle/MicrosoftDemangleNodes.cpp @@ -15,6 +15,7 @@ #include "llvm/Demangle/Compiler.h" #include "llvm/Demangle/Utility.h" #include +#include using namespace llvm; using namespace ms_demangle; @@ -113,6 +114,14 @@ static void outputCallingConvention(OutputStream &OS, CallingConv CC) { } } +std::string Node::toString() const { + OutputStream OS; + initializeOutputStream(nullptr, nullptr, OS, 1024); + this->output(OS, llvm::ms_demangle::OF_Default); + OS << '\0'; + return {OS.getBuffer()}; +} + void TypeNode::outputQuals(bool SpaceBefore, bool SpaceAfter) const {} void PrimitiveTypeNode::outputPre(OutputStream &OS, OutputFlags Flags) const { @@ -161,22 +170,21 @@ void EncodedStringLiteralNode::output(OutputStream &OS, OutputFlags Flags) const { switch (Char) { case CharKind::Wchar: - OS << "const wchar_t * {L\""; + OS << "L\""; break; case CharKind::Char: - OS << "const char * {\""; + OS << "\""; break; case CharKind::Char16: - OS << "const char16_t * {u\""; + OS << "u\""; break; case CharKind::Char32: - OS << "const char32_t * {U\""; + OS << "U\""; break; } OS << DecodedString << "\""; if (IsTruncated) OS << "..."; - OS << "}"; } void IntegerLiteralNode::output(OutputStream &OS, OutputFlags Flags) const { @@ -369,16 +377,23 @@ void LiteralOperatorIdentifierNode::output(OutputStream &OS, void FunctionSignatureNode::outputPre(OutputStream &OS, OutputFlags Flags) const { + if (FunctionClass & FC_Public) + OS << "public: "; + if (FunctionClass & FC_Protected) + OS << "protected: "; + if (FunctionClass & FC_Private) + OS << "private: "; + if (!(FunctionClass & FC_Global)) { if (FunctionClass & FC_Static) OS << "static "; } - if (FunctionClass & FC_ExternC) - OS << "extern \"C\" "; - if (FunctionClass & FC_Virtual) OS << "virtual "; + if (FunctionClass & FC_ExternC) + OS << "extern \"C\" "; + if (ReturnType) { ReturnType->outputPre(OS, Flags); OS << " "; @@ -555,9 +570,13 @@ void FunctionSymbolNode::output(OutputStream &OS, OutputFlags Flags) const { void VariableSymbolNode::output(OutputStream &OS, OutputFlags Flags) const { switch (SC) { case StorageClass::PrivateStatic: + OS << "private: static "; + break; case StorageClass::PublicStatic: + OS << "public: static "; + break; case StorageClass::ProtectedStatic: - OS << "static "; + OS << "protected: static "; break; default: break; diff --git a/lib/ExecutionEngine/GDBRegistrationListener.cpp b/lib/ExecutionEngine/GDBRegistrationListener.cpp index fd4f0746f7f9e310178a56ea9b0460de48f81642..8204f5a90268bb313dc2c1500ebf0341e9db9511 100644 --- a/lib/ExecutionEngine/GDBRegistrationListener.cpp +++ b/lib/ExecutionEngine/GDBRegistrationListener.cpp @@ -76,8 +76,8 @@ struct RegisteredObjectInfo { }; // Buffer for an in-memory object file in executable memory -typedef llvm::DenseMap< const char*, RegisteredObjectInfo> - RegisteredObjectBufferMap; +typedef llvm::DenseMap + RegisteredObjectBufferMap; /// Global access point for the JIT debugging interface designed for use with a /// singleton toolbox. Handles thread-safe registration and deregistration of @@ -99,13 +99,13 @@ public: /// Creates an entry in the JIT registry for the buffer @p Object, /// which must contain an object file in executable memory with any /// debug information for the debugger. - void NotifyObjectEmitted(const ObjectFile &Object, - const RuntimeDyld::LoadedObjectInfo &L) override; + void notifyObjectLoaded(ObjectKey K, const ObjectFile &Obj, + const RuntimeDyld::LoadedObjectInfo &L) override; /// Removes the internal registration of @p Object, and /// frees associated resources. /// Returns true if @p Object was found in ObjectBufferMap. - void NotifyFreeingObject(const ObjectFile &Object) override; + void notifyFreeingObject(ObjectKey K) override; private: /// Deregister the debug info for the given object file from the debugger @@ -147,11 +147,11 @@ GDBJITRegistrationListener::~GDBJITRegistrationListener() { ObjectBufferMap.clear(); } -void GDBJITRegistrationListener::NotifyObjectEmitted( - const ObjectFile &Object, - const RuntimeDyld::LoadedObjectInfo &L) { +void GDBJITRegistrationListener::notifyObjectLoaded( + ObjectKey K, const ObjectFile &Obj, + const RuntimeDyld::LoadedObjectInfo &L) { - OwningBinary DebugObj = L.getObjectForDebug(Object); + OwningBinary DebugObj = L.getObjectForDebug(Obj); // Bail out if debug objects aren't supported. if (!DebugObj.getBinary()) @@ -160,11 +160,8 @@ void GDBJITRegistrationListener::NotifyObjectEmitted( const char *Buffer = DebugObj.getBinary()->getMemoryBufferRef().getBufferStart(); size_t Size = DebugObj.getBinary()->getMemoryBufferRef().getBufferSize(); - const char *Key = Object.getMemoryBufferRef().getBufferStart(); - - assert(Key && "Attempt to register a null object with a debugger."); llvm::MutexGuard locked(*JITDebugLock); - assert(ObjectBufferMap.find(Key) == ObjectBufferMap.end() && + assert(ObjectBufferMap.find(K) == ObjectBufferMap.end() && "Second attempt to perform debug registration."); jit_code_entry* JITCodeEntry = new jit_code_entry(); @@ -175,16 +172,15 @@ void GDBJITRegistrationListener::NotifyObjectEmitted( JITCodeEntry->symfile_addr = Buffer; JITCodeEntry->symfile_size = Size; - ObjectBufferMap[Key] = RegisteredObjectInfo(Size, JITCodeEntry, - std::move(DebugObj)); + ObjectBufferMap[K] = + RegisteredObjectInfo(Size, JITCodeEntry, std::move(DebugObj)); NotifyDebugger(JITCodeEntry); } } -void GDBJITRegistrationListener::NotifyFreeingObject(const ObjectFile& Object) { - const char *Key = Object.getMemoryBufferRef().getBufferStart(); +void GDBJITRegistrationListener::notifyFreeingObject(ObjectKey K) { llvm::MutexGuard locked(*JITDebugLock); - RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(Key); + RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(K); if (I != ObjectBufferMap.end()) { deregisterObjectInternal(I); diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp index 211f5216811f2ceb8e39236d42d85c6ddce961f9..e9051c198506ed7d12125495036466ab0f2bbb0a 100644 --- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp +++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp @@ -47,7 +47,7 @@ class IntelJITEventListener : public JITEventListener { typedef DenseMap ObjectMap; ObjectMap LoadedObjectMap; - std::map> DebugObjects; + std::map> DebugObjects; public: IntelJITEventListener(IntelJITEventsWrapper* libraryWrapper) { @@ -57,10 +57,10 @@ public: ~IntelJITEventListener() { } - void NotifyObjectEmitted(const ObjectFile &Obj, - const RuntimeDyld::LoadedObjectInfo &L) override; + void notifyObjectLoaded(ObjectKey Key, const ObjectFile &Obj, + const RuntimeDyld::LoadedObjectInfo &L) override; - void NotifyFreeingObject(const ObjectFile &Obj) override; + void notifyFreeingObject(ObjectKey Key) override; }; static LineNumberInfo DILineInfoToIntelJITFormat(uintptr_t StartAddress, @@ -96,9 +96,9 @@ static iJIT_Method_Load FunctionDescToIntelJITFormat( return Result; } -void IntelJITEventListener::NotifyObjectEmitted( - const ObjectFile &Obj, - const RuntimeDyld::LoadedObjectInfo &L) { +void IntelJITEventListener::notifyObjectLoaded( + ObjectKey Key, const ObjectFile &Obj, + const RuntimeDyld::LoadedObjectInfo &L) { OwningBinary DebugObjOwner = L.getObjectForDebug(Obj); const ObjectFile *DebugObj = DebugObjOwner.getBinary(); @@ -188,17 +188,17 @@ void IntelJITEventListener::NotifyObjectEmitted( // registered function addresses for each loaded object. We will // use the MethodIDs map to get the registered ID for each function. LoadedObjectMap[ObjData] = Functions; - DebugObjects[Obj.getData().data()] = std::move(DebugObjOwner); + DebugObjects[Key] = std::move(DebugObjOwner); } -void IntelJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) { +void IntelJITEventListener::notifyFreeingObject(ObjectKey Key) { // This object may not have been registered with the listener. If it wasn't, // bail out. - if (DebugObjects.find(Obj.getData().data()) == DebugObjects.end()) + if (DebugObjects.find(Key) == DebugObjects.end()) return; // Get the address of the object image for use as a unique identifier - const ObjectFile &DebugObj = *DebugObjects[Obj.getData().data()].getBinary(); + const ObjectFile &DebugObj = *DebugObjects[Key].getBinary(); const void* ObjData = DebugObj.getData().data(); // Get the object's function list from LoadedObjectMap @@ -223,7 +223,7 @@ void IntelJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) { // Erase the object from LoadedObjectMap LoadedObjectMap.erase(OI); - DebugObjects.erase(Obj.getData().data()); + DebugObjects.erase(Key); } } // anonymous namespace. diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index 64dca930722e61ee7eadb2600b3eafad04d7ed05..044d9b7f27a793ea235c91b163965f76e2d696cd 100644 --- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -227,7 +227,8 @@ static bool ffiInvoke(RawFunc Fn, Function *F, ArrayRef ArgVals, Type *RetTy = FTy->getReturnType(); ffi_type *rtype = ffiTypeFor(RetTy); - if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, NumArgs, rtype, &args[0]) == FFI_OK) { + if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, NumArgs, rtype, args.data()) == + FFI_OK) { SmallVector ret; if (RetTy->getTypeID() != Type::VoidTyID) ret.resize(TD.getTypeStoreSize(RetTy)); diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 25c0cb5d6eff6e1958834723a79c324d20481ae1..ffc6707e1488c517c5168b3d1a3f9a06c19af22a 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -95,7 +95,7 @@ MCJIT::~MCJIT() { for (auto &Obj : LoadedObjects) if (Obj) - NotifyFreeingObject(*Obj); + notifyFreeingObject(*Obj); Archives.clear(); } @@ -119,7 +119,7 @@ void MCJIT::addObjectFile(std::unique_ptr Obj) { if (Dyld.hasError()) report_fatal_error(Dyld.getErrorString()); - NotifyObjectEmitted(*Obj, *L); + notifyObjectLoaded(*Obj, *L); LoadedObjects.push_back(std::move(Obj)); } @@ -216,7 +216,7 @@ void MCJIT::generateCodeForModule(Module *M) { if (!LoadedObject) { std::string Buf; raw_string_ostream OS(Buf); - logAllUnhandledErrors(LoadedObject.takeError(), OS, ""); + logAllUnhandledErrors(LoadedObject.takeError(), OS); OS.flush(); report_fatal_error(Buf); } @@ -226,7 +226,7 @@ void MCJIT::generateCodeForModule(Module *M) { if (Dyld.hasError()) report_fatal_error(Dyld.getErrorString()); - NotifyObjectEmitted(*LoadedObject.get(), *L); + notifyObjectLoaded(*LoadedObject.get(), *L); Buffers.push_back(std::move(ObjectToLoad)); LoadedObjects.push_back(std::move(*LoadedObject)); @@ -648,19 +648,23 @@ void MCJIT::UnregisterJITEventListener(JITEventListener *L) { } } -void MCJIT::NotifyObjectEmitted(const object::ObjectFile& Obj, - const RuntimeDyld::LoadedObjectInfo &L) { +void MCJIT::notifyObjectLoaded(const object::ObjectFile &Obj, + const RuntimeDyld::LoadedObjectInfo &L) { + uint64_t Key = + static_cast(reinterpret_cast(Obj.getData().data())); MutexGuard locked(lock); MemMgr->notifyObjectLoaded(this, Obj); for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) { - EventListeners[I]->NotifyObjectEmitted(Obj, L); + EventListeners[I]->notifyObjectLoaded(Key, Obj, L); } } -void MCJIT::NotifyFreeingObject(const object::ObjectFile& Obj) { +void MCJIT::notifyFreeingObject(const object::ObjectFile &Obj) { + uint64_t Key = + static_cast(reinterpret_cast(Obj.getData().data())); MutexGuard locked(lock); for (JITEventListener *L : EventListeners) - L->NotifyFreeingObject(Obj); + L->notifyFreeingObject(Key); } JITSymbol diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h index 943b14942a0f55b90d7ca314a4bd6b3bf1d86454..1119e138720f43138ee206649a452672445aeaee 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.h +++ b/lib/ExecutionEngine/MCJIT/MCJIT.h @@ -331,9 +331,9 @@ protected: /// the future. std::unique_ptr emitObject(Module *M); - void NotifyObjectEmitted(const object::ObjectFile& Obj, - const RuntimeDyld::LoadedObjectInfo &L); - void NotifyFreeingObject(const object::ObjectFile& Obj); + void notifyObjectLoaded(const object::ObjectFile &Obj, + const RuntimeDyld::LoadedObjectInfo &L); + void notifyFreeingObject(const object::ObjectFile &Obj); JITSymbol findExistingSymbol(const std::string &Name); Module *findModuleForSymbol(const std::string &Name, bool CheckFunctionsOnly); diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp index 6f0825fb38da891604d6a923aa6e3008388b15ef..21af6b585c41e7d55a2ebab94767815e7634291f 100644 --- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp +++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp @@ -40,7 +40,7 @@ class OProfileJITEventListener : public JITEventListener { std::unique_ptr Wrapper; void initialize(); - std::map> DebugObjects; + std::map> DebugObjects; public: OProfileJITEventListener(std::unique_ptr LibraryWrapper) @@ -50,10 +50,10 @@ public: ~OProfileJITEventListener(); - void NotifyObjectEmitted(const ObjectFile &Obj, - const RuntimeDyld::LoadedObjectInfo &L) override; + void notifyObjectLoaded(ObjectKey Key, const ObjectFile &Obj, + const RuntimeDyld::LoadedObjectInfo &L) override; - void NotifyFreeingObject(const ObjectFile &Obj) override; + void notifyFreeingObject(ObjectKey Key) override; }; void OProfileJITEventListener::initialize() { @@ -78,9 +78,9 @@ OProfileJITEventListener::~OProfileJITEventListener() { } } -void OProfileJITEventListener::NotifyObjectEmitted( - const ObjectFile &Obj, - const RuntimeDyld::LoadedObjectInfo &L) { +void OProfileJITEventListener::notifyObjectLoaded( + ObjectKey Key, const ObjectFile &Obj, + const RuntimeDyld::LoadedObjectInfo &L) { if (!Wrapper->isAgentAvailable()) { return; } @@ -137,18 +137,18 @@ void OProfileJITEventListener::NotifyObjectEmitted( } } - DebugObjects[Obj.getData().data()] = std::move(DebugObjOwner); + DebugObjects[Key] = std::move(DebugObjOwner); } -void OProfileJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) { +void OProfileJITEventListener::notifyFreeingObject(ObjectKey Key) { if (Wrapper->isAgentAvailable()) { // If there was no agent registered when the original object was loaded then // we won't have created a debug object for it, so bail out. - if (DebugObjects.find(Obj.getData().data()) == DebugObjects.end()) + if (DebugObjects.find(Key) == DebugObjects.end()) return; - const ObjectFile &DebugObj = *DebugObjects[Obj.getData().data()].getBinary(); + const ObjectFile &DebugObj = *DebugObjects[Key].getBinary(); // Use symbol info to iterate functions in the object. for (symbol_iterator I = DebugObj.symbol_begin(), @@ -171,7 +171,7 @@ void OProfileJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) { } } - DebugObjects.erase(Obj.getData().data()); + DebugObjects.erase(Key); } } // anonymous namespace. diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp index f99cbec6d3b60a0a50199498e11fa9f220215a19..73c0bcdf7d28dab061dfc670cea5574ca2cfef0b 100644 --- a/lib/ExecutionEngine/Orc/Core.cpp +++ b/lib/ExecutionEngine/Orc/Core.cpp @@ -1947,7 +1947,8 @@ ExecutionSession::lookup(ArrayRef SearchOrder, SymbolStringPtr Name) { SymbolNameSet Names({Name}); - JITDylibSearchList FullSearchOrder(SearchOrder.size()); + JITDylibSearchList FullSearchOrder; + FullSearchOrder.reserve(SearchOrder.size()); for (auto *JD : SearchOrder) FullSearchOrder.push_back({JD, false}); diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h index deddfcb10e12e4d05ee4873615de0f08fa29e160..817a4b89bfb0517e99a30c70589f94035124b56b 100644 --- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h +++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h @@ -492,13 +492,17 @@ private: void notifyFinalized(orc::VModuleKey K, const object::ObjectFile &Obj, const RuntimeDyld::LoadedObjectInfo &LoadedObjInfo) { + uint64_t Key = static_cast( + reinterpret_cast(Obj.getData().data())); for (auto &Listener : EventListeners) - Listener->NotifyObjectEmitted(Obj, LoadedObjInfo); + Listener->notifyObjectLoaded(Key, Obj, LoadedObjInfo); } void notifyFreed(orc::VModuleKey K, const object::ObjectFile &Obj) { + uint64_t Key = static_cast( + reinterpret_cast(Obj.getData().data())); for (auto &Listener : EventListeners) - Listener->NotifyFreeingObject(Obj); + Listener->notifyFreeingObject(Key); } orc::ExecutionSession ES; diff --git a/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp b/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp index 7bf8120d23dfdda91970bff25a40f41b1065f2bd..f195d028299865aeea3850bab1cc0e38a7f2e5f6 100644 --- a/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp +++ b/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp @@ -66,9 +66,9 @@ public: CloseMarker(); } - void NotifyObjectEmitted(const ObjectFile &Obj, - const RuntimeDyld::LoadedObjectInfo &L) override; - void NotifyFreeingObject(const ObjectFile &Obj) override; + void notifyObjectLoaded(ObjectKey K, const ObjectFile &Obj, + const RuntimeDyld::LoadedObjectInfo &L) override; + void notifyFreeingObject(ObjectKey K) override; private: bool InitDebuggingDir(); @@ -227,8 +227,9 @@ PerfJITEventListener::PerfJITEventListener() : Pid(::getpid()) { SuccessfullyInitialized = true; } -void PerfJITEventListener::NotifyObjectEmitted( - const ObjectFile &Obj, const RuntimeDyld::LoadedObjectInfo &L) { +void PerfJITEventListener::notifyObjectLoaded( + ObjectKey K, const ObjectFile &Obj, + const RuntimeDyld::LoadedObjectInfo &L) { if (!SuccessfullyInitialized) return; @@ -280,7 +281,7 @@ void PerfJITEventListener::NotifyObjectEmitted( Dumpstream->flush(); } -void PerfJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) { +void PerfJITEventListener::notifyFreeingObject(ObjectKey K) { // perf currently doesn't have an interface for unloading. But munmap()ing the // code section does, so that's ok. } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp index 1c54ad6fb03f8d248515ef74dfcba00b4bab1e6b..340ddaab186da07273def89ffa7d17caa3bf5ed8 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp @@ -66,7 +66,7 @@ RuntimeDyldCOFF::loadObject(const object::ObjectFile &O) { } else { HasError = true; raw_string_ostream ErrStream(ErrorStr); - logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream, ""); + logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream); return nullptr; } } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index f9a81c7bd1b00538323ac4fe24c75dc060e5b838..226ee715e18bc768a1d48e572265195c9f94e2b4 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -255,7 +255,7 @@ RuntimeDyldELF::loadObject(const object::ObjectFile &O) { else { HasError = true; raw_string_ostream ErrStream(ErrorStr); - logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream, ""); + logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream); return nullptr; } } @@ -1130,7 +1130,7 @@ RuntimeDyldELF::processRelocationRef( if (!SymTypeOrErr) { std::string Buf; raw_string_ostream OS(Buf); - logAllUnhandledErrors(SymTypeOrErr.takeError(), OS, ""); + logAllUnhandledErrors(SymTypeOrErr.takeError(), OS); OS.flush(); report_fatal_error(Buf); } @@ -1151,7 +1151,7 @@ RuntimeDyldELF::processRelocationRef( if (!SectionOrErr) { std::string Buf; raw_string_ostream OS(Buf); - logAllUnhandledErrors(SectionOrErr.takeError(), OS, ""); + logAllUnhandledErrors(SectionOrErr.takeError(), OS); OS.flush(); report_fatal_error(Buf); } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index c5a215c833315039b643bf685ba79af1429518e5..d47fcd45be8859fa67e164d4a46c491f83f9e2ad 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -370,7 +370,7 @@ RuntimeDyldMachO::loadObject(const object::ObjectFile &O) { else { HasError = true; raw_string_ostream ErrStream(ErrorStr); - logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream, ""); + logAllUnhandledErrors(ObjSectionToIDOrErr.takeError(), ErrStream); return nullptr; } } diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h index 729ea1ec48a4e9d65b0a8d214811817b4463a29f..8723dd0fd0ea5140fb8ba80d48dcb9d3f915cadb 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h @@ -28,7 +28,7 @@ static bool isThumbFunc(symbol_iterator Symbol, const ObjectFile &Obj, if (!SymTypeOrErr) { std::string Buf; raw_string_ostream OS(Buf); - logAllUnhandledErrors(SymTypeOrErr.takeError(), OS, ""); + logAllUnhandledErrors(SymTypeOrErr.takeError(), OS); OS.flush(); report_fatal_error(Buf); } diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h index 39bdc4b69217adced55de954fda2ff46d2fb8eaa..aee5f6dc374693c86b35c93a6135804ca554c118 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h @@ -128,6 +128,13 @@ public: break; } + case COFF::IMAGE_REL_AMD64_SECREL: { + assert(static_cast(RE.Addend) <= INT32_MAX && "Relocation overflow"); + assert(static_cast(RE.Addend) >= INT32_MIN && "Relocation underflow"); + writeBytesUnaligned(RE.Addend, Target, 4); + break; + } + default: llvm_unreachable("Relocation type not implemented yet!"); break; diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index 3b575739263dc1c716c7f41f73356a594429f715..36f4f3aa8764b9b350ea45fbdf7c9f6080b47f31 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -1607,6 +1607,7 @@ struct MDFieldPrinter { void printInt(StringRef Name, IntTy Int, bool ShouldSkipZero = true); void printBool(StringRef Name, bool Value, Optional Default = None); void printDIFlags(StringRef Name, DINode::DIFlags Flags); + void printDISPFlags(StringRef Name, DISubprogram::DISPFlags Flags); template void printDwarfEnum(StringRef Name, IntTy Value, Stringifier toString, bool ShouldSkipZero = true); @@ -1705,6 +1706,30 @@ void MDFieldPrinter::printDIFlags(StringRef Name, DINode::DIFlags Flags) { Out << FlagsFS << Extra; } +void MDFieldPrinter::printDISPFlags(StringRef Name, + DISubprogram::DISPFlags Flags) { + // Always print this field, because no flags in the IR at all will be + // interpreted as old-style isDefinition: true. + Out << FS << Name << ": "; + + if (!Flags) { + Out << 0; + return; + } + + SmallVector SplitFlags; + auto Extra = DISubprogram::splitFlags(Flags, SplitFlags); + + FieldSeparator FlagsFS(" | "); + for (auto F : SplitFlags) { + auto StringF = DISubprogram::getFlagString(F); + assert(!StringF.empty() && "Expected valid flag"); + Out << FlagsFS << StringF; + } + if (Extra || SplitFlags.empty()) + Out << FlagsFS << Extra; +} + void MDFieldPrinter::printEmissionKind(StringRef Name, DICompileUnit::DebugEmissionKind EK) { Out << FS << Name << ": " << DICompileUnit::emissionKindString(EK); @@ -1910,6 +1935,7 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N, Printer.printBool("debugInfoForProfiling", N->getDebugInfoForProfiling(), false); Printer.printNameTableKind("nameTableKind", N->getNameTableKind()); + Printer.printBool("rangesBaseAddress", N->getRangesBaseAddress(), false); Out << ")"; } @@ -1924,18 +1950,14 @@ static void writeDISubprogram(raw_ostream &Out, const DISubprogram *N, Printer.printMetadata("file", N->getRawFile()); Printer.printInt("line", N->getLine()); Printer.printMetadata("type", N->getRawType()); - Printer.printBool("isLocal", N->isLocalToUnit()); - Printer.printBool("isDefinition", N->isDefinition()); Printer.printInt("scopeLine", N->getScopeLine()); Printer.printMetadata("containingType", N->getRawContainingType()); - Printer.printDwarfEnum("virtuality", N->getVirtuality(), - dwarf::VirtualityString); if (N->getVirtuality() != dwarf::DW_VIRTUALITY_none || N->getVirtualIndex() != 0) Printer.printInt("virtualIndex", N->getVirtualIndex(), false); Printer.printInt("thisAdjustment", N->getThisAdjustment()); Printer.printDIFlags("flags", N->getFlags()); - Printer.printBool("isOptimized", N->isOptimized()); + Printer.printDISPFlags("spFlags", N->getSPFlags()); Printer.printMetadata("unit", N->getRawUnit()); Printer.printMetadata("templateParams", N->getRawTemplateParams()); Printer.printMetadata("declaration", N->getRawDeclaration()); @@ -2272,11 +2294,15 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD, Machine = MachineStorage.get(); } int Slot = Machine->getMetadataSlot(N); - if (Slot == -1) + if (Slot == -1) { + if (const DILocation *Loc = dyn_cast(N)) { + writeDILocation(Out, Loc, TypePrinter, Machine, Context); + return; + } // Give the pointer value instead of "badref", since this comes up all // the time when debugging. Out << "<" << N << ">"; - else + } else Out << '!' << Slot; return; } @@ -2820,7 +2846,7 @@ void AssemblyWriter::printAliasSummary(const AliasSummary *AS) { } void AssemblyWriter::printGlobalVarSummary(const GlobalVarSummary *GS) { - // Nothing for now + Out << ", varFlags: (readonly: " << GS->VarFlags.ReadOnly << ")"; } static std::string getLinkageName(GlobalValue::LinkageTypes LT) { @@ -3014,6 +3040,8 @@ void AssemblyWriter::printSummary(const GlobalValueSummary &Summary) { FieldSeparator FS; for (auto &Ref : RefList) { Out << FS; + if (Ref.isReadOnly()) + Out << "readonly "; Out << "^" << Machine.getGUIDSlot(Ref.getGUID()); } Out << ")"; diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index 63a0f6f22af302e0ac72eded5a1e928b0057374b..ff46debb7a9e574681eac51f06272733af1b4bcc 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -639,7 +639,7 @@ LLVM_DUMP_METHOD void AttributeSet::dump() const { AttributeSetNode::AttributeSetNode(ArrayRef Attrs) : AvailableAttrs(0), NumAttrs(Attrs.size()) { // There's memory after the node where we can store the entries in. - std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects()); + llvm::copy(Attrs, getTrailingObjects()); for (const auto I : *this) { if (!I.isStringAttribute()) { @@ -809,7 +809,7 @@ AttributeListImpl::AttributeListImpl(LLVMContext &C, assert(!Sets.empty() && "pointless AttributeListImpl"); // There's memory after the node where we can store the entries in. - std::copy(Sets.begin(), Sets.end(), getTrailingObjects()); + llvm::copy(Sets, getTrailingObjects()); // Initialize AvailableFunctionAttrs summary bitset. static_assert(Attribute::EndAttrKinds <= @@ -1685,28 +1685,32 @@ adjustCallerStackProbeSize(Function &Caller, const Function &Callee) { } /// If the inlined function defines a min legal vector width, then ensure -/// the calling function has the same or larger min legal vector width. This -/// function is called after the inlining decision has been made so we have to -/// merge the attribute this way. Heuristics that would use +/// the calling function has the same or larger min legal vector width. If the +/// caller has the attribute, but the callee doesn't, we need to remove the +/// attribute from the caller since we can't make any guarantees about the +/// caller's requirements. +/// This function is called after the inlining decision has been made so we have +/// to merge the attribute this way. Heuristics that would use /// min-legal-vector-width to determine inline compatibility would need to be /// handled as part of inline cost analysis. static void adjustMinLegalVectorWidth(Function &Caller, const Function &Callee) { - if (Callee.hasFnAttribute("min-legal-vector-width")) { - uint64_t CalleeVectorWidth; - Callee.getFnAttribute("min-legal-vector-width") - .getValueAsString() - .getAsInteger(0, CalleeVectorWidth); - if (Caller.hasFnAttribute("min-legal-vector-width")) { + if (Caller.hasFnAttribute("min-legal-vector-width")) { + if (Callee.hasFnAttribute("min-legal-vector-width")) { uint64_t CallerVectorWidth; Caller.getFnAttribute("min-legal-vector-width") .getValueAsString() .getAsInteger(0, CallerVectorWidth); - if (CallerVectorWidth < CalleeVectorWidth) { + uint64_t CalleeVectorWidth; + Callee.getFnAttribute("min-legal-vector-width") + .getValueAsString() + .getAsInteger(0, CalleeVectorWidth); + if (CallerVectorWidth < CalleeVectorWidth) Caller.addFnAttr(Callee.getFnAttribute("min-legal-vector-width")); - } } else { - Caller.addFnAttr(Callee.getFnAttribute("min-legal-vector-width")); + // If the callee doesn't have the attribute then we don't know anything + // and must drop the attribute from the caller. + Caller.removeFnAttr("min-legal-vector-width"); } } } diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index 10cb1ac698da9230cc21f3d770328186240d8618..39e29a2a093c3bec882b7503419ad24b2ac7c41b 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -65,24 +65,19 @@ static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID, return true; } -static bool UpgradeADCSBBIntrinsic(Function *F, Intrinsic::ID IID, - Function *&NewFn) { - // If this intrinsic has 3 operands, it's the new version. - if (F->getFunctionType()->getNumParams() == 3) - return false; - - rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); - return true; -} - static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { // All of the intrinsics matches below should be marked with which llvm // version started autoupgrading them. At some point in the future we would // like to use this information to remove upgrade code for some older // intrinsics. It is currently undecided how we will determine that future // point. - if (Name.startswith("sse2.paddus.") || // Added in 8.0 + if (Name == "addcarryx.u32" || // Added in 8.0 + Name == "addcarryx.u64" || // Added in 8.0 + Name == "addcarry.u32" || // Added in 8.0 + Name == "addcarry.u64" || // Added in 8.0 + Name == "subborrow.u32" || // Added in 8.0 + Name == "subborrow.u64" || // Added in 8.0 + Name.startswith("sse2.paddus.") || // Added in 8.0 Name.startswith("sse2.psubus.") || // Added in 8.0 Name.startswith("avx2.paddus.") || // Added in 8.0 Name.startswith("avx2.psubus.") || // Added in 8.0 @@ -382,20 +377,7 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, return true; } - if (Name == "addcarryx.u32") - return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_addcarryx_u32, NewFn); - if (Name == "addcarryx.u64") - return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_addcarryx_u64, NewFn); - if (Name == "addcarry.u32") - return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_addcarry_u32, NewFn); - if (Name == "addcarry.u64") - return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_addcarry_u64, NewFn); - if (Name == "subborrow.u32") - return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_subborrow_u32, NewFn); - if (Name == "subborrow.u64") - return UpgradeADCSBBIntrinsic(F, Intrinsic::x86_subborrow_u64, NewFn); - - if (Name == "rdtscp") { + if (Name == "rdtscp") { // Added in 8.0 // If this intrinsic has 0 operands, it's the new version. if (F->getFunctionType()->getNumParams() == 0) return false; @@ -3277,6 +3259,39 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); + } else if (IsX86 && (Name == "addcarryx.u32" || Name == "addcarryx.u64" || + Name == "addcarry.u32" || Name == "addcarry.u64" || + Name == "subborrow.u32" || Name == "subborrow.u64")) { + Intrinsic::ID IID; + if (Name[0] == 'a' && Name.back() == '2') + IID = Intrinsic::x86_addcarry_32; + else if (Name[0] == 'a' && Name.back() == '4') + IID = Intrinsic::x86_addcarry_64; + else if (Name[0] == 's' && Name.back() == '2') + IID = Intrinsic::x86_subborrow_32; + else if (Name[0] == 's' && Name.back() == '4') + IID = Intrinsic::x86_subborrow_64; + else + llvm_unreachable("Unexpected intrinsic"); + + // Make a call with 3 operands. + Value *Args[] = { CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + Value *NewCall = Builder.CreateCall( + Intrinsic::getDeclaration(CI->getModule(), IID), + Args); + + // Extract the second result and store it. + Value *Data = Builder.CreateExtractValue(NewCall, 1); + // Cast the pointer to the right type. + Value *Ptr = Builder.CreateBitCast(CI->getArgOperand(3), + llvm::PointerType::getUnqual(Data->getType())); + Builder.CreateAlignedStore(Data, Ptr, 1); + // Replace the original call result with the first result of the new call. + Value *CF = Builder.CreateExtractValue(NewCall, 0); + + CI->replaceAllUsesWith(CF); + Rep = nullptr; } else if (IsX86 && Name.startswith("avx512.mask.") && upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) { // Rep will be updated by the call in the condition. @@ -3478,40 +3493,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { return; } - case Intrinsic::x86_addcarryx_u32: - case Intrinsic::x86_addcarryx_u64: - case Intrinsic::x86_addcarry_u32: - case Intrinsic::x86_addcarry_u64: - case Intrinsic::x86_subborrow_u32: - case Intrinsic::x86_subborrow_u64: { - // This used to take 4 arguments. If we only have 3 arguments its already - // upgraded. - if (CI->getNumOperands() == 3) - return; - - // Make a call with 3 operands. - NewCall = Builder.CreateCall(NewFn, { CI->getArgOperand(0), - CI->getArgOperand(1), - CI->getArgOperand(2)}); - // Extract the second result and store it. - Value *Data = Builder.CreateExtractValue(NewCall, 1); - // Cast the pointer to the right type. - Value *Ptr = Builder.CreateBitCast(CI->getArgOperand(3), - llvm::PointerType::getUnqual(Data->getType())); - Builder.CreateAlignedStore(Data, Ptr, 1); - // Replace the original call result with the first result of the new call. - Value *CF = Builder.CreateExtractValue(NewCall, 0); - - std::string Name = CI->getName(); - if (!Name.empty()) { - CI->setName(Name + ".old"); - NewCall->setName(Name); - } - CI->replaceAllUsesWith(CF); - CI->eraseFromParent(); - return; - } - case Intrinsic::x86_sse41_insertps: case Intrinsic::x86_sse41_dppd: case Intrinsic::x86_sse41_dpps: diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp index 12ab2e2ace4da07ddbebe5948a37c3c09ba66889..e9eb686c1e139536819d9df21d6c7156cc4e19ea 100644 --- a/lib/IR/BasicBlock.cpp +++ b/lib/IR/BasicBlock.cpp @@ -260,6 +260,14 @@ const BasicBlock *BasicBlock::getUniquePredecessor() const { return PredBB; } +bool BasicBlock::hasNPredecessors(unsigned N) const { + return hasNItems(pred_begin(this), pred_end(this), N); +} + +bool BasicBlock::hasNPredecessorsOrMore(unsigned N) const { + return hasNItemsOrMore(pred_begin(this), pred_end(this), N); +} + const BasicBlock *BasicBlock::getSingleSuccessor() const { succ_const_iterator SI = succ_begin(this), E = succ_end(this); if (SI == E) return nullptr; // no successors diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp index 107975df1e774ff999a1bd6a8d0a9bdf7c60e9f1..826199ee054e38df2d3c060b2cef45ce7a594658 100644 --- a/lib/IR/ConstantFold.cpp +++ b/lib/IR/ConstantFold.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/ErrorHandling.h" @@ -1077,10 +1078,10 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1, isa(CE1->getOperand(0))) { GlobalValue *GV = cast(CE1->getOperand(0)); - // Functions are at least 4-byte aligned. - unsigned GVAlign = GV->getAlignment(); - if (isa(GV)) - GVAlign = std::max(GVAlign, 4U); + unsigned GVAlign = + GV->getParent() + ? GV->getPointerAlignment(GV->getParent()->getDataLayout()) + : 0; if (GVAlign > 1) { unsigned DstWidth = CI2->getType()->getBitWidth(); @@ -2052,7 +2053,7 @@ static bool isInBoundsIndices(ArrayRef Idxs) { static bool isIndexInRangeOfArrayType(uint64_t NumElements, const ConstantInt *CI) { // We cannot bounds check the index if it doesn't fit in an int64_t. - if (CI->getValue().getActiveBits() > 64) + if (CI->getValue().getMinSignedBits() > 64) return false; // A negative index or an index past the end of our sequential type is diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index 2351e7e4a38930f06bd2ef9148a8389560a5761f..df09d13d3ebaa8d6f2c5e205a42864e3f3ade446 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -184,18 +184,15 @@ bool Constant::isNotMinSignedValue() const { if (const ConstantFP *CFP = dyn_cast(this)) return !CFP->getValueAPF().bitcastToAPInt().isMinSignedValue(); - // Check for constant vectors which are splats of INT_MIN values. - if (const ConstantVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isNotMinSignedValue(); - - // Check for constant vectors which are splats of INT_MIN values. - if (const ConstantDataVector *CV = dyn_cast(this)) { - if (CV->isSplat()) { - if (CV->getElementType()->isFloatingPointTy()) - return !CV->getElementAsAPFloat(0).bitcastToAPInt().isMinSignedValue(); - return !CV->getElementAsAPInt(0).isMinSignedValue(); + // Check that vectors don't contain INT_MIN + if (this->getType()->isVectorTy()) { + unsigned NumElts = this->getType()->getVectorNumElements(); + for (unsigned i = 0; i != NumElts; ++i) { + Constant *Elt = this->getAggregateElement(i); + if (!Elt || !Elt->isNotMinSignedValue()) + return false; } + return true; } // It *may* contain INT_MIN, we can't tell. @@ -353,8 +350,12 @@ Constant *Constant::getAggregateElement(unsigned Elt) const { Constant *Constant::getAggregateElement(Constant *Elt) const { assert(isa(Elt->getType()) && "Index must be an integer"); - if (ConstantInt *CI = dyn_cast(Elt)) + if (ConstantInt *CI = dyn_cast(Elt)) { + // Check if the constant fits into an uint64_t. + if (CI->getValue().getActiveBits() > 64) + return nullptr; return getAggregateElement(CI->getZExtValue()); + } return nullptr; } @@ -722,9 +723,9 @@ Constant *ConstantFP::get(Type *Ty, StringRef Str) { return C; } -Constant *ConstantFP::getNaN(Type *Ty, bool Negative, unsigned Type) { +Constant *ConstantFP::getNaN(Type *Ty, bool Negative, uint64_t Payload) { const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType()); - APFloat NaN = APFloat::getNaN(Semantics, Negative, Type); + APFloat NaN = APFloat::getNaN(Semantics, Negative, Payload); Constant *C = get(Ty->getContext(), NaN); if (VectorType *VTy = dyn_cast(Ty)) @@ -733,6 +734,28 @@ Constant *ConstantFP::getNaN(Type *Ty, bool Negative, unsigned Type) { return C; } +Constant *ConstantFP::getQNaN(Type *Ty, bool Negative, APInt *Payload) { + const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType()); + APFloat NaN = APFloat::getQNaN(Semantics, Negative, Payload); + Constant *C = get(Ty->getContext(), NaN); + + if (VectorType *VTy = dyn_cast(Ty)) + return ConstantVector::getSplat(VTy->getNumElements(), C); + + return C; +} + +Constant *ConstantFP::getSNaN(Type *Ty, bool Negative, APInt *Payload) { + const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType()); + APFloat NaN = APFloat::getSNaN(Semantics, Negative, Payload); + Constant *C = get(Ty->getContext(), NaN); + + if (VectorType *VTy = dyn_cast(Ty)) + return ConstantVector::getSplat(VTy->getNumElements(), C); + + return C; +} + Constant *ConstantFP::getNegativeZero(Type *Ty) { const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType()); APFloat NegZero = APFloat::getZero(Semantics, /*Negative=*/true); @@ -940,7 +963,7 @@ ConstantAggregate::ConstantAggregate(CompositeType *T, ValueTy VT, ArrayRef V) : Constant(T, VT, OperandTraits::op_end(this) - V.size(), V.size()) { - std::copy(V.begin(), V.end(), op_begin()); + llvm::copy(V, op_begin()); // Check that types match, unless this is an opaque struct. if (auto *ST = dyn_cast(T)) @@ -1780,6 +1803,36 @@ Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy, return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy, OnlyIfReduced); } +Constant *ConstantExpr::get(unsigned Opcode, Constant *C, unsigned Flags, + Type *OnlyIfReducedTy) { + // Check the operands for consistency first. + assert(Instruction::isUnaryOp(Opcode) && + "Invalid opcode in unary constant expression"); + +#ifndef NDEBUG + switch (Opcode) { + case Instruction::FNeg: + assert(C->getType()->isFPOrFPVectorTy() && + "Tried to create a floating-point operation on a " + "non-floating-point type!"); + break; + default: + break; + } +#endif + + // TODO: Try to constant fold operation. + + if (OnlyIfReducedTy == C->getType()) + return nullptr; + + Constant *ArgVec[] = { C }; + ConstantExprKeyType Key(Opcode, ArgVec, 0, Flags); + + LLVMContextImpl *pImpl = C->getContext().pImpl; + return pImpl->ExprConstants.getOrCreate(C->getType(), Key); +} + Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2, unsigned Flags, Type *OnlyIfReducedTy) { // Check the operands for consistency first. diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h index e9f31e4ded68dd8d9cb391b4fff3baa38cbde65c..eac17139708471dc357b1ef54d0e20eb04ac88ca 100644 --- a/lib/IR/ConstantsContext.h +++ b/lib/IR/ConstantsContext.h @@ -529,7 +529,9 @@ struct ConstantExprKeyType { ConstantExpr *create(TypeClass *Ty) const { switch (Opcode) { default: - if (Instruction::isCast(Opcode)) + if (Instruction::isCast(Opcode) || + (Opcode >= Instruction::UnaryOpsBegin && + Opcode < Instruction::UnaryOpsEnd)) return new UnaryConstantExpr(Opcode, Ops[0], Ty); if ((Opcode >= Instruction::BinaryOpsBegin && Opcode < Instruction::BinaryOpsEnd)) diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index df3578ea87f3d3520c8e780fe1be3fa72f811548..d29759f567918a8a5eca38e7a5df44aa3a4da6b4 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -140,7 +140,7 @@ DICompileUnit *DIBuilder::createCompileUnit( StringRef Flags, unsigned RunTimeVer, StringRef SplitName, DICompileUnit::DebugEmissionKind Kind, uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling, - DICompileUnit::DebugNameTableKind NameTableKind) { + DICompileUnit::DebugNameTableKind NameTableKind, bool RangesBaseAddress) { assert(((Lang <= dwarf::DW_LANG_Fortran08 && Lang >= dwarf::DW_LANG_C89) || (Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) && @@ -150,7 +150,8 @@ DICompileUnit *DIBuilder::createCompileUnit( CUNode = DICompileUnit::getDistinct( VMContext, Lang, File, Producer, isOptimized, Flags, RunTimeVer, SplitName, Kind, nullptr, nullptr, nullptr, nullptr, nullptr, DWOId, - SplitDebugInlining, DebugInfoForProfiling, NameTableKind); + SplitDebugInlining, DebugInfoForProfiling, NameTableKind, + RangesBaseAddress); // Create a named metadata so that it is easier to find cu in a module. NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu"); @@ -750,18 +751,18 @@ static DISubprogram *getSubprogram(bool IsDistinct, Ts &&... Args) { DISubprogram *DIBuilder::createFunction( DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File, - unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, - bool isDefinition, unsigned ScopeLine, DINode::DIFlags Flags, - bool isOptimized, DITemplateParameterArray TParams, DISubprogram *Decl, + unsigned LineNo, DISubroutineType *Ty, unsigned ScopeLine, + DINode::DIFlags Flags, DISubprogram::DISPFlags SPFlags, + DITemplateParameterArray TParams, DISubprogram *Decl, DITypeArray ThrownTypes) { + bool IsDefinition = SPFlags & DISubprogram::SPFlagDefinition; auto *Node = getSubprogram( - /* IsDistinct = */ isDefinition, VMContext, - getNonCompileUnitScope(Context), Name, LinkageName, File, LineNo, Ty, - isLocalToUnit, isDefinition, ScopeLine, nullptr, 0, 0, 0, Flags, - isOptimized, isDefinition ? CUNode : nullptr, TParams, Decl, + /*IsDistinct=*/IsDefinition, VMContext, getNonCompileUnitScope(Context), + Name, LinkageName, File, LineNo, Ty, ScopeLine, nullptr, 0, 0, Flags, + SPFlags, IsDefinition ? CUNode : nullptr, TParams, Decl, MDTuple::getTemporary(VMContext, None).release(), ThrownTypes); - if (isDefinition) + if (IsDefinition) AllSubprograms.push_back(Node); trackIfUnresolved(Node); return Node; @@ -769,35 +770,37 @@ DISubprogram *DIBuilder::createFunction( DISubprogram *DIBuilder::createTempFunctionFwdDecl( DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File, - unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, - bool isDefinition, unsigned ScopeLine, DINode::DIFlags Flags, - bool isOptimized, DITemplateParameterArray TParams, DISubprogram *Decl, + unsigned LineNo, DISubroutineType *Ty, unsigned ScopeLine, + DINode::DIFlags Flags, DISubprogram::DISPFlags SPFlags, + DITemplateParameterArray TParams, DISubprogram *Decl, DITypeArray ThrownTypes) { - return DISubprogram::getTemporary( - VMContext, getNonCompileUnitScope(Context), Name, LinkageName, - File, LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine, nullptr, - 0, 0, 0, Flags, isOptimized, isDefinition ? CUNode : nullptr, - TParams, Decl, nullptr, ThrownTypes) + bool IsDefinition = SPFlags & DISubprogram::SPFlagDefinition; + return DISubprogram::getTemporary(VMContext, getNonCompileUnitScope(Context), + Name, LinkageName, File, LineNo, Ty, + ScopeLine, nullptr, 0, 0, Flags, SPFlags, + IsDefinition ? CUNode : nullptr, TParams, + Decl, nullptr, ThrownTypes) .release(); } DISubprogram *DIBuilder::createMethod( DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *F, - unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, - bool isDefinition, unsigned VK, unsigned VIndex, int ThisAdjustment, - DIType *VTableHolder, DINode::DIFlags Flags, bool isOptimized, - DITemplateParameterArray TParams, DITypeArray ThrownTypes) { + unsigned LineNo, DISubroutineType *Ty, unsigned VIndex, int ThisAdjustment, + DIType *VTableHolder, DINode::DIFlags Flags, + DISubprogram::DISPFlags SPFlags, DITemplateParameterArray TParams, + DITypeArray ThrownTypes) { assert(getNonCompileUnitScope(Context) && "Methods should have both a Context and a context that isn't " "the compile unit."); // FIXME: Do we want to use different scope/lines? + bool IsDefinition = SPFlags & DISubprogram::SPFlagDefinition; auto *SP = getSubprogram( - /* IsDistinct = */ isDefinition, VMContext, cast(Context), Name, - LinkageName, F, LineNo, Ty, isLocalToUnit, isDefinition, LineNo, - VTableHolder, VK, VIndex, ThisAdjustment, Flags, isOptimized, - isDefinition ? CUNode : nullptr, TParams, nullptr, nullptr, ThrownTypes); + /*IsDistinct=*/IsDefinition, VMContext, cast(Context), Name, + LinkageName, F, LineNo, Ty, LineNo, VTableHolder, VIndex, ThisAdjustment, + Flags, SPFlags, IsDefinition ? CUNode : nullptr, TParams, nullptr, + nullptr, ThrownTypes); - if (isDefinition) + if (IsDefinition) AllSubprograms.push_back(SP); trackIfUnresolved(SP); return SP; diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp index 02b7953cb5bbed5bcd972443c7241b9cbefddf52..d1ff54571782d10d89b0129e1593fc7bc144cd0f 100644 --- a/lib/IR/DebugInfo.cpp +++ b/lib/IR/DebugInfo.cpp @@ -438,11 +438,10 @@ private: auto distinctMDSubprogram = [&]() { return DISubprogram::getDistinct( MDS->getContext(), FileAndScope, MDS->getName(), LinkageName, - FileAndScope, MDS->getLine(), Type, MDS->isLocalToUnit(), - MDS->isDefinition(), MDS->getScopeLine(), ContainingType, - MDS->getVirtuality(), MDS->getVirtualIndex(), - MDS->getThisAdjustment(), MDS->getFlags(), MDS->isOptimized(), Unit, - TemplateParams, Declaration, Variables); + FileAndScope, MDS->getLine(), Type, MDS->getScopeLine(), + ContainingType, MDS->getVirtualIndex(), MDS->getThisAdjustment(), + MDS->getFlags(), MDS->getSPFlags(), Unit, TemplateParams, Declaration, + Variables); }; if (MDS->isDistinct()) @@ -450,11 +449,9 @@ private: auto *NewMDS = DISubprogram::get( MDS->getContext(), FileAndScope, MDS->getName(), LinkageName, - FileAndScope, MDS->getLine(), Type, MDS->isLocalToUnit(), - MDS->isDefinition(), MDS->getScopeLine(), ContainingType, - MDS->getVirtuality(), MDS->getVirtualIndex(), MDS->getThisAdjustment(), - MDS->getFlags(), MDS->isOptimized(), Unit, TemplateParams, Declaration, - Variables); + FileAndScope, MDS->getLine(), Type, MDS->getScopeLine(), ContainingType, + MDS->getVirtualIndex(), MDS->getThisAdjustment(), MDS->getFlags(), + MDS->getSPFlags(), Unit, TemplateParams, Declaration, Variables); StringRef OldLinkageName = MDS->getLinkageName(); @@ -491,7 +488,8 @@ private: CU->getSplitDebugFilename(), DICompileUnit::LineTablesOnly, EnumTypes, RetainedTypes, GlobalVariables, ImportedEntities, CU->getMacros(), CU->getDWOId(), CU->getSplitDebugInlining(), - CU->getDebugInfoForProfiling(), CU->getNameTableKind()); + CU->getDebugInfoForProfiling(), CU->getNameTableKind(), + CU->getRangesBaseAddress()); } DILocation *getReplacementMDLocation(DILocation *MLD) { @@ -719,6 +717,11 @@ static LLVMDIFlags map_to_llvmDIFlags(DINode::DIFlags Flags) { return static_cast(Flags); } +static DISubprogram::DISPFlags +pack_into_DISPFlags(bool IsLocalToUnit, bool IsDefinition, bool IsOptimized) { + return DISubprogram::toSPFlags(IsLocalToUnit, IsDefinition, IsOptimized); +} + unsigned LLVMDebugMetadataVersion() { return DEBUG_METADATA_VERSION; } @@ -802,9 +805,10 @@ LLVMMetadataRef LLVMDIBuilderCreateFunction( unsigned ScopeLine, LLVMDIFlags Flags, LLVMBool IsOptimized) { return wrap(unwrap(Builder)->createFunction( unwrapDI(Scope), {Name, NameLen}, {LinkageName, LinkageNameLen}, - unwrapDI(File), LineNo, unwrapDI(Ty), - IsLocalToUnit, IsDefinition, ScopeLine, map_from_llvmDIFlags(Flags), - IsOptimized, nullptr, nullptr, nullptr)); + unwrapDI(File), LineNo, unwrapDI(Ty), ScopeLine, + map_from_llvmDIFlags(Flags), + pack_into_DISPFlags(IsLocalToUnit, IsDefinition, IsOptimized), nullptr, + nullptr, nullptr)); } diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp index bb49cb2109a5dee8733d04d52a1576247ccabd85..3b702ce4798b99ddf10e077565c30079480a9f9e 100644 --- a/lib/IR/DebugInfoMetadata.cpp +++ b/lib/IR/DebugInfoMetadata.cpp @@ -470,7 +470,8 @@ DICompileUnit *DICompileUnit::getImpl( unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes, Metadata *GlobalVariables, Metadata *ImportedEntities, Metadata *Macros, uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling, - unsigned NameTableKind, StorageType Storage, bool ShouldCreate) { + unsigned NameTableKind, bool RangesBaseAddress, StorageType Storage, + bool ShouldCreate) { assert(Storage != Uniqued && "Cannot unique DICompileUnit"); assert(isCanonical(Producer) && "Expected canonical MDString"); assert(isCanonical(Flags) && "Expected canonical MDString"); @@ -483,7 +484,8 @@ DICompileUnit *DICompileUnit::getImpl( return storeImpl(new (array_lengthof(Ops)) DICompileUnit( Context, Storage, SourceLanguage, IsOptimized, RuntimeVersion, EmissionKind, DWOId, SplitDebugInlining, - DebugInfoForProfiling, NameTableKind, Ops), + DebugInfoForProfiling, NameTableKind, RangesBaseAddress, + Ops), Storage); } @@ -540,21 +542,55 @@ DILocalScope *DILocalScope::getNonLexicalBlockFileScope() const { return const_cast(this); } +DISubprogram::DISPFlags DISubprogram::getFlag(StringRef Flag) { + return StringSwitch(Flag) +#define HANDLE_DISP_FLAG(ID, NAME) .Case("DISPFlag" #NAME, SPFlag##NAME) +#include "llvm/IR/DebugInfoFlags.def" + .Default(SPFlagZero); +} + +StringRef DISubprogram::getFlagString(DISPFlags Flag) { + switch (Flag) { + // Appease a warning. + case SPFlagVirtuality: + return ""; +#define HANDLE_DISP_FLAG(ID, NAME) \ + case SPFlag##NAME: \ + return "DISPFlag" #NAME; +#include "llvm/IR/DebugInfoFlags.def" + } + return ""; +} + +DISubprogram::DISPFlags +DISubprogram::splitFlags(DISPFlags Flags, + SmallVectorImpl &SplitFlags) { + // Multi-bit fields can require special handling. In our case, however, the + // only multi-bit field is virtuality, and all its values happen to be + // single-bit values, so the right behavior just falls out. +#define HANDLE_DISP_FLAG(ID, NAME) \ + if (DISPFlags Bit = Flags & SPFlag##NAME) { \ + SplitFlags.push_back(Bit); \ + Flags &= ~Bit; \ + } +#include "llvm/IR/DebugInfoFlags.def" + return Flags; +} + DISubprogram *DISubprogram::getImpl( LLVMContext &Context, Metadata *Scope, MDString *Name, MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type, - bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine, - Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex, - int ThisAdjustment, DIFlags Flags, bool IsOptimized, Metadata *Unit, + unsigned ScopeLine, Metadata *ContainingType, unsigned VirtualIndex, + int ThisAdjustment, DIFlags Flags, DISPFlags SPFlags, Metadata *Unit, Metadata *TemplateParams, Metadata *Declaration, Metadata *RetainedNodes, Metadata *ThrownTypes, StorageType Storage, bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); assert(isCanonical(LinkageName) && "Expected canonical MDString"); - DEFINE_GETIMPL_LOOKUP( - DISubprogram, (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, - IsDefinition, ScopeLine, ContainingType, Virtuality, - VirtualIndex, ThisAdjustment, Flags, IsOptimized, Unit, - TemplateParams, Declaration, RetainedNodes, ThrownTypes)); + DEFINE_GETIMPL_LOOKUP(DISubprogram, + (Scope, Name, LinkageName, File, Line, Type, ScopeLine, + ContainingType, VirtualIndex, ThisAdjustment, Flags, + SPFlags, Unit, TemplateParams, Declaration, + RetainedNodes, ThrownTypes)); SmallVector Ops = { File, Scope, Name, LinkageName, Type, Unit, Declaration, RetainedNodes, ContainingType, TemplateParams, ThrownTypes}; @@ -566,11 +602,10 @@ DISubprogram *DISubprogram::getImpl( Ops.pop_back(); } } - DEFINE_GETIMPL_STORE_N(DISubprogram, - (Line, ScopeLine, Virtuality, VirtualIndex, - ThisAdjustment, Flags, IsLocalToUnit, IsDefinition, - IsOptimized), - Ops, Ops.size()); + DEFINE_GETIMPL_STORE_N( + DISubprogram, + (Line, ScopeLine, VirtualIndex, ThisAdjustment, Flags, SPFlags), Ops, + Ops.size()); } bool DISubprogram::describes(const Function *F) const { diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp index 5ddb1196b07226005575a9294943f9d81e5d2528..3c6665bbcb7eac8e2c4689516ab706cc5a41a879 100644 --- a/lib/IR/DiagnosticInfo.cpp +++ b/lib/IR/DiagnosticInfo.cpp @@ -33,9 +33,10 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Path.h" #include "llvm/Support/Regex.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -106,7 +107,7 @@ void DiagnosticInfoPGOProfile::print(DiagnosticPrinter &DP) const { DiagnosticLocation::DiagnosticLocation(const DebugLoc &DL) { if (!DL) return; - Filename = DL->getFilename(); + File = DL->getFile(); Line = DL->getLine(); Column = DL->getColumn(); } @@ -114,17 +115,36 @@ DiagnosticLocation::DiagnosticLocation(const DebugLoc &DL) { DiagnosticLocation::DiagnosticLocation(const DISubprogram *SP) { if (!SP) return; - Filename = SP->getFilename(); + + File = SP->getFile(); Line = SP->getScopeLine(); Column = 0; } -void DiagnosticInfoWithLocationBase::getLocation(StringRef *Filename, - unsigned *Line, - unsigned *Column) const { - *Filename = Loc.getFilename(); - *Line = Loc.getLine(); - *Column = Loc.getColumn(); +StringRef DiagnosticLocation::getRelativePath() const { + return File->getFilename(); +} + +std::string DiagnosticLocation::getAbsolutePath() const { + StringRef Name = File->getFilename(); + if (sys::path::is_absolute(Name)) + return Name; + + SmallString<128> Path; + sys::path::append(Path, File->getDirectory(), Name); + return sys::path::remove_leading_dotslash(Path).str(); +} + +std::string DiagnosticInfoWithLocationBase::getAbsolutePath() const { + return Loc.getAbsolutePath(); +} + +void DiagnosticInfoWithLocationBase::getLocation(StringRef &RelativePath, + unsigned &Line, + unsigned &Column) const { + RelativePath = Loc.getRelativePath(); + Line = Loc.getLine(); + Column = Loc.getColumn(); } const std::string DiagnosticInfoWithLocationBase::getLocationStr() const { @@ -132,7 +152,7 @@ const std::string DiagnosticInfoWithLocationBase::getLocationStr() const { unsigned Line = 0; unsigned Column = 0; if (isLocationAvailable()) - getLocation(&Filename, &Line, &Column); + getLocation(Filename, Line, Column); return (Filename + ":" + Twine(Line) + ":" + Twine(Column)).str(); } @@ -399,7 +419,7 @@ template <> struct MappingTraits { static void mapping(IO &io, DiagnosticLocation &DL) { assert(io.outputting() && "input not yet implemented"); - StringRef File = DL.getFilename(); + StringRef File = DL.getRelativePath(); unsigned Line = DL.getLine(); unsigned Col = DL.getColumn(); diff --git a/lib/IR/IRPrintingPasses.cpp b/lib/IR/IRPrintingPasses.cpp index befe1d9ffb1cda24d0e7f98f55932fdb3a639d88..43010220b9f3cac88bfbf46d4552a4d82eb97b44 100644 --- a/lib/IR/IRPrintingPasses.cpp +++ b/lib/IR/IRPrintingPasses.cpp @@ -27,7 +27,8 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &) { - OS << Banner; + if (!Banner.empty()) + OS << Banner << "\n"; if (llvm::isFunctionInPrintList("*")) M.print(OS, nullptr, ShouldPreserveUseListOrder); else { diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp index 75e7413a47d0ffca7785f7ac47807f54d74004f3..f077957969fa5019998401f986556c245ac89781 100644 --- a/lib/IR/Instruction.cpp +++ b/lib/IR/Instruction.cpp @@ -303,6 +303,9 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case CatchPad: return "catchpad"; case CatchSwitch: return "catchswitch"; + // Standard unary operators... + case FNeg: return "fneg"; + // Standard binary operators... case Add: return "add"; case FAdd: return "fadd"; @@ -602,6 +605,13 @@ const Instruction *Instruction::getNextNonDebugInstruction() const { return nullptr; } +const Instruction *Instruction::getPrevNonDebugInstruction() const { + for (const Instruction *I = getPrevNode(); I; I = I->getPrevNode()) + if (!isa(I)) + return I; + return nullptr; +} + bool Instruction::isAssociative() const { unsigned Opcode = getOpcode(); if (isAssociative(Opcode)) diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index 7d4b6df18d93911840a0683e0fd51dd53a370c28..8405bbde3220d19540963f1678a477d10ed6bd0a 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -65,10 +65,7 @@ AllocaInst::getAllocationSizeInBits(const DataLayout &DL) const { //===----------------------------------------------------------------------===// User::op_iterator CallSite::getCallee() const { - Instruction *II(getInstruction()); - return isCall() - ? cast(II)->op_end() - 1 // Skip Callee - : cast(II)->op_end() - 3; // Skip BB, BB, Callee + return cast(getInstruction())->op_end() - 1; } //===----------------------------------------------------------------------===// @@ -253,6 +250,82 @@ void LandingPadInst::addClause(Constant *Val) { getOperandList()[OpNo] = Val; } +//===----------------------------------------------------------------------===// +// CallBase Implementation +//===----------------------------------------------------------------------===// + +Value *CallBase::getReturnedArgOperand() const { + unsigned Index; + + if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index) + return getArgOperand(Index - AttributeList::FirstArgIndex); + if (const Function *F = getCalledFunction()) + if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) && + Index) + return getArgOperand(Index - AttributeList::FirstArgIndex); + + return nullptr; +} + +bool CallBase::hasRetAttr(Attribute::AttrKind Kind) const { + if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind)) + return true; + + // Look at the callee, if available. + if (const Function *F = getCalledFunction()) + return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind); + return false; +} + +/// Determine whether the argument or parameter has the given attribute. +bool CallBase::paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const { + assert(ArgNo < getNumArgOperands() && "Param index out of bounds!"); + + if (Attrs.hasParamAttribute(ArgNo, Kind)) + return true; + if (const Function *F = getCalledFunction()) + return F->getAttributes().hasParamAttribute(ArgNo, Kind); + return false; +} + +bool CallBase::hasFnAttrOnCalledFunction(Attribute::AttrKind Kind) const { + if (const Function *F = getCalledFunction()) + return F->getAttributes().hasAttribute(AttributeList::FunctionIndex, Kind); + return false; +} + +bool CallBase::hasFnAttrOnCalledFunction(StringRef Kind) const { + if (const Function *F = getCalledFunction()) + return F->getAttributes().hasAttribute(AttributeList::FunctionIndex, Kind); + return false; +} + +CallBase::op_iterator +CallBase::populateBundleOperandInfos(ArrayRef Bundles, + const unsigned BeginIndex) { + auto It = op_begin() + BeginIndex; + for (auto &B : Bundles) + It = std::copy(B.input_begin(), B.input_end(), It); + + auto *ContextImpl = getContext().pImpl; + auto BI = Bundles.begin(); + unsigned CurrentIndex = BeginIndex; + + for (auto &BOI : bundle_op_infos()) { + assert(BI != Bundles.end() && "Incorrect allocation?"); + + BOI.Tag = ContextImpl->getOrInsertBundleTag(BI->getTag()); + BOI.Begin = CurrentIndex; + BOI.End = CurrentIndex + BI->input_size(); + CurrentIndex = BOI.End; + BI++; + } + + assert(BI == Bundles.end() && "Incorrect allocation?"); + + return It; +} + //===----------------------------------------------------------------------===// // CallInst Implementation //===----------------------------------------------------------------------===// @@ -262,7 +335,7 @@ void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef Args, this->FTy = FTy; assert(getNumOperands() == Args.size() + CountBundleInputs(Bundles) + 1 && "NumOperands not set up?"); - Op<-1>() = Func; + setCalledOperand(Func); #ifndef NDEBUG assert((Args.size() == FTy->getNumParams() || @@ -275,7 +348,7 @@ void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef Args, "Calling a function with a bad signature!"); #endif - std::copy(Args.begin(), Args.end(), op_begin()); + llvm::copy(Args, op_begin()); auto It = populateBundleOperandInfos(Bundles, Args.size()); (void)It; @@ -288,7 +361,7 @@ void CallInst::init(Value *Func, const Twine &NameStr) { FTy = cast(cast(Func->getType())->getElementType()); assert(getNumOperands() == 1 && "NumOperands not set up?"); - Op<-1>() = Func; + setCalledOperand(Func); assert(FTy->getNumParams() == 0 && "Calling a function with bad signature"); @@ -296,31 +369,27 @@ void CallInst::init(Value *Func, const Twine &NameStr) { } CallInst::CallInst(Value *Func, const Twine &Name, Instruction *InsertBefore) - : CallBase( - cast( - cast(Func->getType())->getElementType()) - ->getReturnType(), - Instruction::Call, - OperandTraits>::op_end(this) - 1, 1, - InsertBefore) { + : CallBase(cast( + cast(Func->getType())->getElementType()) + ->getReturnType(), + Instruction::Call, OperandTraits::op_end(this) - 1, 1, + InsertBefore) { init(Func, Name); } CallInst::CallInst(Value *Func, const Twine &Name, BasicBlock *InsertAtEnd) - : CallBase( - cast( - cast(Func->getType())->getElementType()) - ->getReturnType(), - Instruction::Call, - OperandTraits>::op_end(this) - 1, 1, InsertAtEnd) { + : CallBase(cast( + cast(Func->getType())->getElementType()) + ->getReturnType(), + Instruction::Call, OperandTraits::op_end(this) - 1, 1, + InsertAtEnd) { init(Func, Name); } CallInst::CallInst(const CallInst &CI) - : CallBase(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call, - OperandTraits>::op_end(this) - - CI.getNumOperands(), - CI.getNumOperands()) { + : CallBase(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call, + OperandTraits::op_end(this) - CI.getNumOperands(), + CI.getNumOperands()) { setTailCallKind(CI.getTailCallKind()); setCallingConv(CI.getCallingConv()); @@ -560,11 +629,12 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal, const Twine &NameStr) { this->FTy = FTy; - assert(getNumOperands() == 3 + Args.size() + CountBundleInputs(Bundles) && + assert((int)getNumOperands() == + ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)) && "NumOperands not set up?"); - Op<-3>() = Fn; - Op<-2>() = IfNormal; - Op<-1>() = IfException; + setNormalDest(IfNormal); + setUnwindDest(IfException); + setCalledOperand(Fn); #ifndef NDEBUG assert(((Args.size() == FTy->getNumParams()) || @@ -577,7 +647,7 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal, "Invoking a function with a bad signature!"); #endif - std::copy(Args.begin(), Args.end(), op_begin()); + llvm::copy(Args, op_begin()); auto It = populateBundleOperandInfos(Bundles, Args.size()); (void)It; @@ -587,10 +657,9 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal, } InvokeInst::InvokeInst(const InvokeInst &II) - : CallBase(II.Attrs, II.FTy, II.getType(), Instruction::Invoke, - OperandTraits>::op_end(this) - - II.getNumOperands(), - II.getNumOperands()) { + : CallBase(II.Attrs, II.FTy, II.getType(), Instruction::Invoke, + OperandTraits::op_end(this) - II.getNumOperands(), + II.getNumOperands()) { setCallingConv(II.getCallingConv()); std::copy(II.op_begin(), II.op_end(), op_begin()); std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(), @@ -834,7 +903,7 @@ void CatchSwitchInst::removeHandler(handler_iterator HI) { void FuncletPadInst::init(Value *ParentPad, ArrayRef Args, const Twine &NameStr) { assert(getNumOperands() == 1 + Args.size() && "NumOperands not set up?"); - std::copy(Args.begin(), Args.end(), op_begin()); + llvm::copy(Args, op_begin()); setParentPad(ParentPad); setName(NameStr); } @@ -1390,7 +1459,7 @@ void GetElementPtrInst::init(Value *Ptr, ArrayRef IdxList, assert(getNumOperands() == 1 + IdxList.size() && "NumOperands not initialized?"); Op<0>() = Ptr; - std::copy(IdxList.begin(), IdxList.end(), op_begin() + 1); + llvm::copy(IdxList, op_begin() + 1); setName(Name); } @@ -1798,6 +1867,35 @@ bool ShuffleVectorInst::isTransposeMask(ArrayRef Mask) { return true; } +bool ShuffleVectorInst::isExtractSubvectorMask(ArrayRef Mask, + int NumSrcElts, int &Index) { + // Must extract from a single source. + if (!isSingleSourceMaskImpl(Mask, NumSrcElts)) + return false; + + // Must be smaller (else this is an Identity shuffle). + if (NumSrcElts <= (int)Mask.size()) + return false; + + // Find start of extraction, accounting that we may start with an UNDEF. + int SubIndex = -1; + for (int i = 0, e = Mask.size(); i != e; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + int Offset = (M % NumSrcElts) - i; + if (0 <= SubIndex && SubIndex != Offset) + return false; + SubIndex = Offset; + } + + if (0 <= SubIndex) { + Index = SubIndex; + return true; + } + return false; +} + bool ShuffleVectorInst::isIdentityWithPadding() const { int NumOpElts = Op<0>()->getType()->getVectorNumElements(); int NumMaskElts = getType()->getVectorNumElements(); @@ -1927,6 +2025,59 @@ Type *ExtractValueInst::getIndexedType(Type *Agg, return const_cast(Agg); } +//===----------------------------------------------------------------------===// +// UnaryOperator Class +//===----------------------------------------------------------------------===// + +UnaryOperator::UnaryOperator(UnaryOps iType, Value *S, + Type *Ty, const Twine &Name, + Instruction *InsertBefore) + : UnaryInstruction(Ty, iType, S, InsertBefore) { + Op<0>() = S; + setName(Name); + AssertOK(); +} + +UnaryOperator::UnaryOperator(UnaryOps iType, Value *S, + Type *Ty, const Twine &Name, + BasicBlock *InsertAtEnd) + : UnaryInstruction(Ty, iType, S, InsertAtEnd) { + Op<0>() = S; + setName(Name); + AssertOK(); +} + +UnaryOperator *UnaryOperator::Create(UnaryOps Op, Value *S, + const Twine &Name, + Instruction *InsertBefore) { + return new UnaryOperator(Op, S, S->getType(), Name, InsertBefore); +} + +UnaryOperator *UnaryOperator::Create(UnaryOps Op, Value *S, + const Twine &Name, + BasicBlock *InsertAtEnd) { + UnaryOperator *Res = Create(Op, S, Name); + InsertAtEnd->getInstList().push_back(Res); + return Res; +} + +void UnaryOperator::AssertOK() { + Value *LHS = getOperand(0); + (void)LHS; // Silence warnings. +#ifndef NDEBUG + switch (getOpcode()) { + case FNeg: + assert(getType() == LHS->getType() && + "Unary operation should return same type as operand!"); + assert(getType()->isFPOrFPVectorTy() && + "Tried to create a floating-point operation on a " + "non-floating-point type!"); + break; + default: llvm_unreachable("Invalid opcode provided"); + } +#endif +} + //===----------------------------------------------------------------------===// // BinaryOperator Class //===----------------------------------------------------------------------===// @@ -3668,6 +3819,10 @@ GetElementPtrInst *GetElementPtrInst::cloneImpl() const { return new (getNumOperands()) GetElementPtrInst(*this); } +UnaryOperator *UnaryOperator::cloneImpl() const { + return Create(getOpcode(), Op<0>()); +} + BinaryOperator *BinaryOperator::cloneImpl() const { return Create(getOpcode(), Op<0>(), Op<1>()); } diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h index 4da61e1772319909dd127b9100d9e6e02f9c9949..2d120869860a0eb02500ab5f036d02262b909d93 100644 --- a/lib/IR/LLVMContextImpl.h +++ b/lib/IR/LLVMContextImpl.h @@ -612,15 +612,12 @@ template <> struct MDNodeKeyImpl { Metadata *File; unsigned Line; Metadata *Type; - bool IsLocalToUnit; - bool IsDefinition; unsigned ScopeLine; Metadata *ContainingType; - unsigned Virtuality; unsigned VirtualIndex; int ThisAdjustment; unsigned Flags; - bool IsOptimized; + unsigned SPFlags; Metadata *Unit; Metadata *TemplateParams; Metadata *Declaration; @@ -629,45 +626,39 @@ template <> struct MDNodeKeyImpl { MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type, - bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine, - Metadata *ContainingType, unsigned Virtuality, + unsigned ScopeLine, Metadata *ContainingType, unsigned VirtualIndex, int ThisAdjustment, unsigned Flags, - bool IsOptimized, Metadata *Unit, Metadata *TemplateParams, + unsigned SPFlags, Metadata *Unit, Metadata *TemplateParams, Metadata *Declaration, Metadata *RetainedNodes, Metadata *ThrownTypes) : Scope(Scope), Name(Name), LinkageName(LinkageName), File(File), - Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit), - IsDefinition(IsDefinition), ScopeLine(ScopeLine), - ContainingType(ContainingType), Virtuality(Virtuality), - VirtualIndex(VirtualIndex), ThisAdjustment(ThisAdjustment), - Flags(Flags), IsOptimized(IsOptimized), Unit(Unit), - TemplateParams(TemplateParams), Declaration(Declaration), + Line(Line), Type(Type), ScopeLine(ScopeLine), + ContainingType(ContainingType), VirtualIndex(VirtualIndex), + ThisAdjustment(ThisAdjustment), Flags(Flags), SPFlags(SPFlags), + Unit(Unit), TemplateParams(TemplateParams), Declaration(Declaration), RetainedNodes(RetainedNodes), ThrownTypes(ThrownTypes) {} MDNodeKeyImpl(const DISubprogram *N) : Scope(N->getRawScope()), Name(N->getRawName()), LinkageName(N->getRawLinkageName()), File(N->getRawFile()), - Line(N->getLine()), Type(N->getRawType()), - IsLocalToUnit(N->isLocalToUnit()), IsDefinition(N->isDefinition()), - ScopeLine(N->getScopeLine()), ContainingType(N->getRawContainingType()), - Virtuality(N->getVirtuality()), VirtualIndex(N->getVirtualIndex()), + Line(N->getLine()), Type(N->getRawType()), ScopeLine(N->getScopeLine()), + ContainingType(N->getRawContainingType()), + VirtualIndex(N->getVirtualIndex()), ThisAdjustment(N->getThisAdjustment()), Flags(N->getFlags()), - IsOptimized(N->isOptimized()), Unit(N->getRawUnit()), + SPFlags(N->getSPFlags()), Unit(N->getRawUnit()), TemplateParams(N->getRawTemplateParams()), - Declaration(N->getRawDeclaration()), RetainedNodes(N->getRawRetainedNodes()), + Declaration(N->getRawDeclaration()), + RetainedNodes(N->getRawRetainedNodes()), ThrownTypes(N->getRawThrownTypes()) {} bool isKeyOf(const DISubprogram *RHS) const { return Scope == RHS->getRawScope() && Name == RHS->getRawName() && LinkageName == RHS->getRawLinkageName() && File == RHS->getRawFile() && Line == RHS->getLine() && - Type == RHS->getRawType() && IsLocalToUnit == RHS->isLocalToUnit() && - IsDefinition == RHS->isDefinition() && - ScopeLine == RHS->getScopeLine() && + Type == RHS->getRawType() && ScopeLine == RHS->getScopeLine() && ContainingType == RHS->getRawContainingType() && - Virtuality == RHS->getVirtuality() && VirtualIndex == RHS->getVirtualIndex() && ThisAdjustment == RHS->getThisAdjustment() && - Flags == RHS->getFlags() && IsOptimized == RHS->isOptimized() && + Flags == RHS->getFlags() && SPFlags == RHS->getSPFlags() && Unit == RHS->getUnit() && TemplateParams == RHS->getRawTemplateParams() && Declaration == RHS->getRawDeclaration() && @@ -675,11 +666,13 @@ template <> struct MDNodeKeyImpl { ThrownTypes == RHS->getRawThrownTypes(); } + bool isDefinition() const { return SPFlags & DISubprogram::SPFlagDefinition; } + unsigned getHashValue() const { // If this is a declaration inside an ODR type, only hash the type and the // name. Otherwise the hash will be stronger than // MDNodeSubsetEqualImpl::isDeclarationOfODRMember(). - if (!IsDefinition && LinkageName) + if (!isDefinition() && LinkageName) if (auto *CT = dyn_cast_or_null(Scope)) if (CT->getRawIdentifier()) return hash_combine(LinkageName, Scope); @@ -696,7 +689,7 @@ template <> struct MDNodeSubsetEqualImpl { using KeyTy = MDNodeKeyImpl; static bool isSubsetEqual(const KeyTy &LHS, const DISubprogram *RHS) { - return isDeclarationOfODRMember(LHS.IsDefinition, LHS.Scope, + return isDeclarationOfODRMember(LHS.isDefinition(), LHS.Scope, LHS.LinkageName, LHS.TemplateParams, RHS); } diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp index 204fecde32c98320e7b5bac1159756519189371e..5536c2497f1e986859933cf9fd5a90f5367d93a8 100644 --- a/lib/IR/Metadata.cpp +++ b/lib/IR/Metadata.cpp @@ -1484,7 +1484,7 @@ void GlobalObject::copyMetadata(const GlobalObject *Other, unsigned Offset) { std::vector Elements(OrigElements.size() + 2); Elements[0] = dwarf::DW_OP_plus_uconst; Elements[1] = Offset; - std::copy(OrigElements.begin(), OrigElements.end(), Elements.begin() + 2); + llvm::copy(OrigElements, Elements.begin() + 2); E = DIExpression::get(getContext(), Elements); Attachment = DIGlobalVariableExpression::get(getContext(), GV, E); } diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index 7d02a3956ff90768f382e812a79b1b3c9fd4836c..70a16cbbf24210b45a2c11e9fa4aaebdd173fd8f 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -46,6 +46,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/RandomNumberGenerator.h" +#include "llvm/Support/VersionTuple.h" #include #include #include @@ -547,6 +548,45 @@ void Module::setRtLibUseGOT() { addModuleFlag(ModFlagBehavior::Max, "RtLibUseGOT", 1); } +void Module::setSDKVersion(const VersionTuple &V) { + SmallVector Entries; + Entries.push_back(V.getMajor()); + if (auto Minor = V.getMinor()) { + Entries.push_back(*Minor); + if (auto Subminor = V.getSubminor()) + Entries.push_back(*Subminor); + // Ignore the 'build' component as it can't be represented in the object + // file. + } + addModuleFlag(ModFlagBehavior::Warning, "SDK Version", + ConstantDataArray::get(Context, Entries)); +} + +VersionTuple Module::getSDKVersion() const { + auto *CM = dyn_cast_or_null(getModuleFlag("SDK Version")); + if (!CM) + return {}; + auto *Arr = dyn_cast_or_null(CM->getValue()); + if (!Arr) + return {}; + auto getVersionComponent = [&](unsigned Index) -> Optional { + if (Index >= Arr->getNumElements()) + return None; + return (unsigned)Arr->getElementAsInteger(Index); + }; + auto Major = getVersionComponent(0); + if (!Major) + return {}; + VersionTuple Result = VersionTuple(*Major); + if (auto Minor = getVersionComponent(1)) { + Result = VersionTuple(*Major, *Minor); + if (auto Subminor = getVersionComponent(2)) { + Result = VersionTuple(*Major, *Minor, *Subminor); + } + } + return Result; +} + GlobalVariable *llvm::collectUsedGlobalVariables( const Module &M, SmallPtrSetImpl &Set, bool CompilerUsed) { const char *Name = CompilerUsed ? "llvm.compiler.used" : "llvm.used"; diff --git a/lib/IR/ModuleSummaryIndex.cpp b/lib/IR/ModuleSummaryIndex.cpp index 8d85f7901b080476119d4c6d979858ac1ce41921..46b88cd31779a2097a8d6ddb846568b271ab9f8c 100644 --- a/lib/IR/ModuleSummaryIndex.cpp +++ b/lib/IR/ModuleSummaryIndex.cpp @@ -14,11 +14,17 @@ #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "module-summary-index" + +STATISTIC(ReadOnlyLiveGVars, + "Number of live global variables marked read only"); + FunctionSummary FunctionSummary::ExternalNode = FunctionSummary::makeDummyFunctionSummary({}); bool ValueInfo::isDSOLocal() const { @@ -30,6 +36,17 @@ bool ValueInfo::isDSOLocal() const { }); } +// Gets the number of immutable refs in RefEdgeList +unsigned FunctionSummary::immutableRefCount() const { + // Here we take advantage of having all readonly references + // located in the end of the RefEdgeList. + auto Refs = refs(); + unsigned ImmutableRefCnt = 0; + for (int I = Refs.size() - 1; I >= 0 && Refs[I].isReadOnly(); --I) + ImmutableRefCnt++; + return ImmutableRefCnt; +} + // Collect for the given module the list of function it defines // (GUID -> Summary). void ModuleSummaryIndex::collectDefinedFunctionsForModule( @@ -84,6 +101,80 @@ bool ModuleSummaryIndex::isGUIDLive(GlobalValue::GUID GUID) const { return false; } +static void propagateConstantsToRefs(GlobalValueSummary *S) { + // If reference is not readonly then referenced summary is not + // readonly either. Note that: + // - All references from GlobalVarSummary are conservatively considered as + // not readonly. Tracking them properly requires more complex analysis + // then we have now. + // + // - AliasSummary objects have no refs at all so this function is a no-op + // for them. + for (auto &VI : S->refs()) { + if (VI.isReadOnly()) { + // We only mark refs as readonly when computing function summaries on + // analysis phase. + assert(isa(S)); + continue; + } + for (auto &Ref : VI.getSummaryList()) + // If references to alias is not readonly then aliasee is not readonly + if (auto *GVS = dyn_cast(Ref->getBaseObject())) + GVS->setReadOnly(false); + } +} + +// Do the constant propagation in combined index. +// The goal of constant propagation is internalization of readonly +// variables. To determine which variables are readonly and which +// are not we take following steps: +// - During analysis we speculatively assign readonly attribute to +// all variables which can be internalized. When computing function +// summary we also assign readonly attribute to a reference if +// function doesn't modify referenced variable. +// +// - After computing dead symbols in combined index we do the constant +// propagation. During this step we clear readonly attribute from +// all variables which: +// a. are preserved or can't be imported +// b. referenced by any global variable initializer +// c. referenced by a function and reference is not readonly +// +// Internalization itself happens in the backend after import is finished +// See internalizeImmutableGVs. +void ModuleSummaryIndex::propagateConstants( + const DenseSet &GUIDPreservedSymbols) { + for (auto &P : *this) + for (auto &S : P.second.SummaryList) { + if (!isGlobalValueLive(S.get())) + // We don't examine references from dead objects + continue; + + // Global variable can't be marked read only if it is not eligible + // to import since we need to ensure that all external references + // get a local (imported) copy. It also can't be marked read only + // if it or any alias (since alias points to the same memory) are + // preserved or notEligibleToImport, since either of those means + // there could be writes that are not visible (because preserved + // means it could have external to DSO writes, and notEligibleToImport + // means it could have writes via inline assembly leading it to be + // in the @llvm.*used). + if (auto *GVS = dyn_cast(S->getBaseObject())) + // Here we intentionally pass S.get() not GVS, because S could be + // an alias. + if (!canImportGlobalVar(S.get()) || GUIDPreservedSymbols.count(P.first)) + GVS->setReadOnly(false); + propagateConstantsToRefs(S.get()); + } + if (llvm::AreStatisticsEnabled()) + for (auto &P : *this) + if (P.second.SummaryList.size()) + if (auto *GVS = dyn_cast( + P.second.SummaryList[0]->getBaseObject())) + if (isGlobalValueLive(GVS) && GVS->isReadOnly()) + ReadOnlyLiveGVars++; +} + // TODO: write a graphviz dumper for SCCs (see ModuleSummaryIndex::exportToDot) // then delete this function and update its tests LLVM_DUMP_METHOD @@ -108,6 +199,7 @@ namespace { struct Attributes { void add(const Twine &Name, const Twine &Value, const Twine &Comment = Twine()); + void addComment(const Twine &Comment); std::string getAsString() const; std::vector Attrs; @@ -129,6 +221,10 @@ void Attributes::add(const Twine &Name, const Twine &Value, A += Value.str(); A += "\""; Attrs.push_back(A); + addComment(Comment); +} + +void Attributes::addComment(const Twine &Comment) { if (!Comment.isTriviallyEmpty()) { if (Comments.empty()) Comments = " // "; @@ -237,6 +333,12 @@ static void defineExternalNode(raw_ostream &OS, const char *Pfx, OS << "\"]; // defined externally\n"; } +static bool hasReadOnlyFlag(const GlobalValueSummary *S) { + if (auto *GVS = dyn_cast(S)) + return GVS->isReadOnly(); + return false; +} + void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const { std::vector CrossModuleEdges; DenseMap> NodeMap; @@ -252,13 +354,17 @@ void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const { }; auto DrawEdge = [&](const char *Pfx, uint64_t SrcMod, GlobalValue::GUID SrcId, - uint64_t DstMod, GlobalValue::GUID DstId, int TypeOrHotness) { - // 0 corresponds to alias edge, 1 to ref edge, 2 to call with unknown - // hotness, ... - TypeOrHotness += 2; + uint64_t DstMod, GlobalValue::GUID DstId, + int TypeOrHotness) { + // 0 - alias + // 1 - reference + // 2 - constant reference + // Other value: (hotness - 3). + TypeOrHotness += 3; static const char *EdgeAttrs[] = { " [style=dotted]; // alias", " [style=dashed]; // ref", + " [style=dashed,color=forestgreen]; // const-ref", " // call (hotness : Unknown)", " [color=blue]; // call (hotness : Cold)", " // call (hotness : None)", @@ -301,6 +407,8 @@ void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const { A.add("shape", "box"); } else { A.add("shape", "Mrecord", "variable"); + if (Flags.Live && hasReadOnlyFlag(SummaryIt.second)) + A.addComment("immutable"); } auto VI = getValueInfo(SummaryIt.first); @@ -318,7 +426,7 @@ void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const { for (auto &SummaryIt : GVSMap) { auto *GVS = SummaryIt.second; for (auto &R : GVS->refs()) - Draw(SummaryIt.first, R.getGUID(), -1); + Draw(SummaryIt.first, R.getGUID(), R.isReadOnly() ? -1 : -2); if (auto *AS = dyn_cast_or_null(SummaryIt.second)) { GlobalValue::GUID AliaseeId; @@ -331,7 +439,7 @@ void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const { AliaseeId = AliaseeOrigId; } - Draw(SummaryIt.first, AliaseeId, -2); + Draw(SummaryIt.first, AliaseeId, -3); continue; } diff --git a/lib/IR/PassTimingInfo.cpp b/lib/IR/PassTimingInfo.cpp index 8165704a9ec0c52e4d202798867c5ccffb77c172..40b3977ecbd9b92dbaea277803af952d4c591e2f 100644 --- a/lib/IR/PassTimingInfo.cpp +++ b/lib/IR/PassTimingInfo.cpp @@ -226,7 +226,7 @@ static bool matchPassManager(StringRef PassID) { Prefix.endswith("AnalysisManagerProxy"); } -bool TimePassesHandler::runBeforePass(StringRef PassID, Any IR) { +bool TimePassesHandler::runBeforePass(StringRef PassID) { if (matchPassManager(PassID)) return true; @@ -239,7 +239,7 @@ bool TimePassesHandler::runBeforePass(StringRef PassID, Any IR) { return true; } -void TimePassesHandler::runAfterPass(StringRef PassID, Any IR) { +void TimePassesHandler::runAfterPass(StringRef PassID) { if (matchPassManager(PassID)) return; @@ -254,13 +254,15 @@ void TimePassesHandler::registerCallbacks(PassInstrumentationCallbacks &PIC) { return; PIC.registerBeforePassCallback( - [this](StringRef P, Any IR) { return this->runBeforePass(P, IR); }); + [this](StringRef P, Any) { return this->runBeforePass(P); }); PIC.registerAfterPassCallback( - [this](StringRef P, Any IR) { this->runAfterPass(P, IR); }); + [this](StringRef P, Any) { this->runAfterPass(P); }); + PIC.registerAfterPassInvalidatedCallback( + [this](StringRef P) { this->runAfterPass(P); }); PIC.registerBeforeAnalysisCallback( - [this](StringRef P, Any IR) { this->runBeforePass(P, IR); }); + [this](StringRef P, Any) { this->runBeforePass(P); }); PIC.registerAfterAnalysisCallback( - [this](StringRef P, Any IR) { this->runAfterPass(P, IR); }); + [this](StringRef P, Any) { this->runAfterPass(P); }); } } // namespace llvm diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp index 6dc48ad97cc64294575bed1fdec28fa2e211181d..dc8af6b68e7bf4deb2a9f945af79ba8ee5138dd5 100644 --- a/lib/IR/Value.cpp +++ b/lib/IR/Value.cpp @@ -130,20 +130,11 @@ void Value::destroyValueName() { } bool Value::hasNUses(unsigned N) const { - const_use_iterator UI = use_begin(), E = use_end(); - - for (; N; --N, ++UI) - if (UI == E) return false; // Too few. - return UI == E; + return hasNItems(use_begin(), use_end(), N); } bool Value::hasNUsesOrMore(unsigned N) const { - const_use_iterator UI = use_begin(), E = use_end(); - - for (; N; --N, ++UI) - if (UI == E) return false; // Too few. - - return true; + return hasNItemsOrMore(use_begin(), use_end(), N); } bool Value::isUsedInBasicBlock(const BasicBlock *BB) const { diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 4d0135d8338e92825462a3c69b834eb022fe4bd1..7fd6df3e6e001f304205ef3db1d7ef774691e817 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -281,6 +281,9 @@ class Verifier : public InstVisitor, VerifierSupport { /// Whether the current function has a DISubprogram attached to it. bool HasDebugInfo = false; + /// Whether source was present on the first DIFile encountered in each CU. + DenseMap HasSourceDebugInfo; + /// Stores the count of how many objects were passed to llvm.localescape for a /// given function and the largest index passed to llvm.localrecover. DenseMap> FrameEscapeInfo; @@ -383,6 +386,7 @@ public: visitModuleFlags(M); visitModuleIdents(M); + visitModuleCommandLines(M); verifyCompileUnits(); @@ -405,6 +409,7 @@ private: void visitValueAsMetadata(const ValueAsMetadata &MD, Function *F); void visitComdat(const Comdat &C); void visitModuleIdents(const Module &M); + void visitModuleCommandLines(const Module &M); void visitModuleFlags(const Module &M); void visitModuleFlag(const MDNode *Op, DenseMap &SeenIDs, @@ -443,6 +448,7 @@ private: void visitBitCastInst(BitCastInst &I); void visitAddrSpaceCastInst(AddrSpaceCastInst &I); void visitPHINode(PHINode &PN); + void visitUnaryOperator(UnaryOperator &U); void visitBinaryOperator(BinaryOperator &B); void visitICmpInst(ICmpInst &IC); void visitFCmpInst(FCmpInst &FC); @@ -518,6 +524,9 @@ private: /// Module-level verification that all @llvm.experimental.deoptimize /// declarations share the same calling convention. void verifyDeoptimizeCallingConvs(); + + /// Verify all-or-nothing property of DIFile source attribute within a CU. + void verifySourceDebugInfo(const DICompileUnit &U, const DIFile &F); }; } // end anonymous namespace @@ -1031,6 +1040,8 @@ void Verifier::visitDICompileUnit(const DICompileUnit &N) { AssertDI(!N.getFile()->getFilename().empty(), "invalid filename", &N, N.getFile()); + verifySourceDebugInfo(N, *N.getFile()); + AssertDI((N.getEmissionKind() <= DICompileUnit::LastEmissionKind), "invalid emission kind", &N); @@ -1108,6 +1119,8 @@ void Verifier::visitDISubprogram(const DISubprogram &N) { AssertDI(N.isDistinct(), "subprogram definitions must be distinct", &N); AssertDI(Unit, "subprogram definitions must have a compile unit", &N); AssertDI(isa(Unit), "invalid unit type", &N, Unit); + if (N.getFile()) + verifySourceDebugInfo(*N.getUnit(), *N.getFile()); } else { // Subprogram declarations (part of the type hierarchy). AssertDI(!Unit, "subprogram declarations must not have a compile unit", &N); @@ -1302,6 +1315,24 @@ void Verifier::visitModuleIdents(const Module &M) { } } +void Verifier::visitModuleCommandLines(const Module &M) { + const NamedMDNode *CommandLines = M.getNamedMetadata("llvm.commandline"); + if (!CommandLines) + return; + + // llvm.commandline takes a list of metadata entry. Each entry has only one + // string. Scan each llvm.commandline entry and make sure that this + // requirement is met. + for (const MDNode *N : CommandLines->operands()) { + Assert(N->getNumOperands() == 1, + "incorrect number of operands in llvm.commandline metadata", N); + Assert(dyn_cast_or_null(N->getOperand(0)), + ("invalid value for llvm.commandline metadata entry operand" + "(the operand should be a string)"), + N->getOperand(0)); + } +} + void Verifier::visitModuleFlags(const Module &M) { const NamedMDNode *Flags = M.getModuleFlagsMetadata(); if (!Flags) return; @@ -1929,6 +1960,7 @@ void Verifier::verifyStatepoint(ImmutableCallSite CS) { // Verify that the types of the call parameter arguments match // the type of the wrapped callee. + AttributeList Attrs = CS.getAttributes(); for (int i = 0; i < NumParams; i++) { Type *ParamType = TargetFuncType->getParamType(i); Type *ArgType = CS.getArgument(5 + i)->getType(); @@ -1936,6 +1968,12 @@ void Verifier::verifyStatepoint(ImmutableCallSite CS) { "gc.statepoint call argument does not match wrapped " "function type", &CI); + + if (TargetFuncType->isVarArg()) { + AttributeSet ArgAttrs = Attrs.getParamAttributes(5 + i); + Assert(!ArgAttrs.hasAttribute(Attribute::StructRet), + "Attribute 'sret' cannot be used for vararg call arguments!", &CI); + } } const int EndCallArgsInx = 4 + NumCallArgs; @@ -2813,8 +2851,13 @@ void Verifier::verifyCallSite(CallSite CS) { SawReturned = true; } - Assert(!ArgAttrs.hasAttribute(Attribute::StructRet), - "Attribute 'sret' cannot be used for vararg call arguments!", I); + // Statepoint intrinsic is vararg but the wrapped function may be not. + // Allow sret here and check the wrapped function in verifyStatepoint. + if (CS.getCalledFunction() == nullptr || + CS.getCalledFunction()->getIntrinsicID() != + Intrinsic::experimental_gc_statepoint) + Assert(!ArgAttrs.hasAttribute(Attribute::StructRet), + "Attribute 'sret' cannot be used for vararg call arguments!", I); if (ArgAttrs.hasAttribute(Attribute::InAlloca)) Assert(Idx == CS.arg_size() - 1, "inalloca isn't on the last argument!", @@ -2990,6 +3033,28 @@ void Verifier::visitInvokeInst(InvokeInst &II) { visitTerminator(II); } +/// visitUnaryOperator - Check the argument to the unary operator. +/// +void Verifier::visitUnaryOperator(UnaryOperator &U) { + Assert(U.getType() == U.getOperand(0)->getType(), + "Unary operators must have same type for" + "operands and result!", + &U); + + switch (U.getOpcode()) { + // Check that floating-point arithmetic operators are only used with + // floating-point operands. + case Instruction::FNeg: + Assert(U.getType()->isFPOrFPVectorTy(), + "FNeg operator only works with float types!", &U); + break; + default: + llvm_unreachable("Unknown UnaryOperator opcode!"); + } + + visitInstruction(U); +} + /// visitBinaryOperator - Check that both arguments to the binary operator are /// of the same type! /// @@ -3887,6 +3952,10 @@ void Verifier::visitInstruction(Instruction &I) { } } + // Get a pointer to the call base of the instruction if it is some form of + // call. + const CallBase *CBI = dyn_cast(&I); + for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) { Assert(I.getOperand(i) != nullptr, "Instruction has null operand!", &I); @@ -3899,10 +3968,9 @@ void Verifier::visitInstruction(Instruction &I) { if (Function *F = dyn_cast(I.getOperand(i))) { // Check to make sure that the "address of" an intrinsic function is never // taken. - Assert( - !F->isIntrinsic() || - i == (isa(I) ? e - 1 : isa(I) ? e - 3 : 0), - "Cannot take the address of an intrinsic!", &I); + Assert(!F->isIntrinsic() || + (CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i)), + "Cannot take the address of an intrinsic!", &I); Assert( !F->isIntrinsic() || isa(I) || F->getIntrinsicID() == Intrinsic::donothing || @@ -3928,8 +3996,7 @@ void Verifier::visitInstruction(Instruction &I) { } else if (isa(I.getOperand(i))) { verifyDominatesUse(I, i); } else if (isa(I.getOperand(i))) { - Assert((i + 1 == e && isa(I)) || - (i + 3 == e && isa(I)), + Assert(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i), "Cannot take the address of an inline asm!", &I); } else if (ConstantExpr *CE = dyn_cast(I.getOperand(i))) { if (CE->getType()->isPtrOrPtrVectorTy() || @@ -4494,6 +4561,24 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { "of ints"); break; } + case Intrinsic::smul_fix: { + Value *Op1 = CS.getArgOperand(0); + Value *Op2 = CS.getArgOperand(1); + Assert(Op1->getType()->isIntOrIntVectorTy(), + "first operand of smul_fix must be an int type or vector " + "of ints"); + Assert(Op2->getType()->isIntOrIntVectorTy(), + "second operand of smul_fix must be an int type or vector " + "of ints"); + + auto *Op3 = dyn_cast(CS.getArgOperand(2)); + Assert(Op3, "third argument of smul_fix must be a constant integer"); + Assert(Op3->getType()->getBitWidth() <= 32, + "third argument of smul_fix must fit within 32 bits"); + Assert(Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(), + "the scale of smul_fix must be less than the width of the operands"); + break; + } }; } @@ -4707,6 +4792,14 @@ void Verifier::verifyDeoptimizeCallingConvs() { } } +void Verifier::verifySourceDebugInfo(const DICompileUnit &U, const DIFile &F) { + bool HasSource = F.getSource().hasValue(); + if (!HasSourceDebugInfo.count(&U)) + HasSourceDebugInfo[&U] = HasSource; + AssertDI(HasSource == HasSourceDebugInfo[&U], + "inconsistent use of embedded source"); +} + //===----------------------------------------------------------------------===// // Implement the public interfaces to this file... //===----------------------------------------------------------------------===// diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt index 0eb4bba26760079001118a33f6d105412a0241cb..d87bf797161bea615d6f2402bda9f3d9849f1f05 100644 --- a/lib/LLVMBuild.txt +++ b/lib/LLVMBuild.txt @@ -31,6 +31,7 @@ subdirectories = IRReader LTO MC + MCA Object BinaryFormat ObjectYAML @@ -40,6 +41,7 @@ subdirectories = ProfileData Support TableGen + TextAPI Target Testing ToolDrivers diff --git a/lib/LTO/CMakeLists.txt b/lib/LTO/CMakeLists.txt index 73b5662d4bc8ed10b1383f0c617db5d71d011028..1730df665d83b31a0e81edff4403d18a5eacfc2d 100644 --- a/lib/LTO/CMakeLists.txt +++ b/lib/LTO/CMakeLists.txt @@ -4,6 +4,7 @@ add_llvm_library(LLVMLTO LTOBackend.cpp LTOModule.cpp LTOCodeGenerator.cpp + SummaryBasedOptimizations.cpp UpdateCompilerUsed.cpp ThinLTOCodeGenerator.cpp diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 2726b6785eddfd174e62112b68fc64aca5f45600..08924fb92dd6a4cb4ee516d88a23b27da5f002f5 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Mangler.h" #include "llvm/IR/Metadata.h" #include "llvm/LTO/LTOBackend.h" +#include "llvm/LTO/SummaryBasedOptimizations.h" #include "llvm/Linker/IRMover.h" #include "llvm/Object/IRObjectFile.h" #include "llvm/Support/Error.h" @@ -42,6 +43,7 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" #include @@ -61,10 +63,10 @@ cl::opt EnableLTOInternalization( "enable-lto-internalization", cl::init(true), cl::Hidden, cl::desc("Enable global value internalization in LTO")); -// Returns a unique hash for the Module considering the current list of +// Computes a unique hash for the Module considering the current list of // export/import and other global analysis results. // The hash is produced in \p Key. -static void computeCacheKey( +void llvm::computeLTOCacheKey( SmallString<40> &Key, const Config &Conf, const ModuleSummaryIndex &Index, StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList, const FunctionImporter::ExportSetTy &ExportList, @@ -132,6 +134,7 @@ static void computeCacheKey( AddUnsigned(Conf.CGFileType); AddUnsigned(Conf.OptLevel); AddUnsigned(Conf.UseNewPM); + AddUnsigned(Conf.Freestanding); AddString(Conf.OptPipeline); AddString(Conf.AAPipeline); AddString(Conf.OverrideTriple); @@ -187,6 +190,8 @@ static void computeCacheKey( AddUnsigned(VI.isDSOLocal()); AddUsedCfiGlobal(VI.getGUID()); } + if (auto *GVS = dyn_cast(GS)) + AddUnsigned(GVS->isReadOnly()); if (auto *FS = dyn_cast(GS)) { for (auto &TT : FS->type_tests()) UsedTypeIds.insert(TT); @@ -218,8 +223,14 @@ static void computeCacheKey( // Imported functions may introduce new uses of type identifier resolutions, // so we need to collect their used resolutions as well. for (auto &ImpM : ImportList) - for (auto &ImpF : ImpM.second) - AddUsedThings(Index.findSummaryInModule(ImpF, ImpM.first())); + for (auto &ImpF : ImpM.second) { + GlobalValueSummary *S = Index.findSummaryInModule(ImpF, ImpM.first()); + AddUsedThings(S); + // If this is an alias, we also care about any types/etc. that the aliasee + // may reference. + if (auto *AS = dyn_cast_or_null(S)) + AddUsedThings(AS->getBaseObject()); + } auto AddTypeIdSummary = [&](StringRef TId, const TypeIdSummary &S) { AddString(TId); @@ -282,7 +293,7 @@ static void computeCacheKey( Key = toHex(Hasher.result()); } -static void thinLTOResolveWeakForLinkerGUID( +static void thinLTOResolvePrevailingGUID( GlobalValueSummaryList &GVSummaryList, GlobalValue::GUID GUID, DenseSet &GlobalInvolvedWithAlias, function_ref @@ -291,7 +302,10 @@ static void thinLTOResolveWeakForLinkerGUID( recordNewLinkage) { for (auto &S : GVSummaryList) { GlobalValue::LinkageTypes OriginalLinkage = S->linkage(); - if (!GlobalValue::isWeakForLinker(OriginalLinkage)) + // Ignore local and appending linkage values since the linker + // doesn't resolve them. + if (GlobalValue::isLocalLinkage(OriginalLinkage) || + GlobalValue::isAppendingLinkage(S->linkage())) continue; // We need to emit only one of these. The prevailing module will keep it, // but turned into a weak, while the others will drop it when possible. @@ -315,13 +329,13 @@ static void thinLTOResolveWeakForLinkerGUID( } } -// Resolve Weak and LinkOnce values in the \p Index. +/// Resolve linkage for prevailing symbols in the \p Index. // // We'd like to drop these functions if they are no longer referenced in the // current module. However there is a chance that another module is still // referencing them because of the import. We make sure we always emit at least // one copy. -void llvm::thinLTOResolveWeakForLinkerInIndex( +void llvm::thinLTOResolvePrevailingInIndex( ModuleSummaryIndex &Index, function_ref isPrevailing, @@ -337,9 +351,9 @@ void llvm::thinLTOResolveWeakForLinkerInIndex( GlobalInvolvedWithAlias.insert(&AS->getAliasee()); for (auto &I : Index) - thinLTOResolveWeakForLinkerGUID(I.second.SummaryList, I.first, - GlobalInvolvedWithAlias, isPrevailing, - recordNewLinkage); + thinLTOResolvePrevailingGUID(I.second.SummaryList, I.first, + GlobalInvolvedWithAlias, isPrevailing, + recordNewLinkage); } static void thinLTOInternalizeAndPromoteGUID( @@ -350,7 +364,13 @@ static void thinLTOInternalizeAndPromoteGUID( if (GlobalValue::isLocalLinkage(S->linkage())) S->setLinkage(GlobalValue::ExternalLinkage); } else if (EnableLTOInternalization && - !GlobalValue::isLocalLinkage(S->linkage())) + // Ignore local and appending linkage values since the linker + // doesn't resolve them. + !GlobalValue::isLocalLinkage(S->linkage()) && + S->linkage() != GlobalValue::AppendingLinkage && + // We can't internalize available_externally globals because this + // can break function pointer equality. + S->linkage() != GlobalValue::AvailableExternallyLinkage) S->setLinkage(GlobalValue::InternalLinkage); } } @@ -803,7 +823,8 @@ Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) { return PrevailingType::Unknown; return It->second; }; - computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols, isPrevailing); + computeDeadSymbolsWithConstProp(ThinLTO.CombinedIndex, GUIDPreservedSymbols, + isPrevailing, Conf.OptLevel > 0); // Setup output file to emit statistics. std::unique_ptr StatsFile = nullptr; @@ -974,9 +995,9 @@ public: SmallString<40> Key; // The module may be cached, this helps handling it. - computeCacheKey(Key, Conf, CombinedIndex, ModuleID, ImportList, ExportList, - ResolvedODR, DefinedGlobals, CfiFunctionDefs, - CfiFunctionDecls); + computeLTOCacheKey(Key, Conf, CombinedIndex, ModuleID, ImportList, + ExportList, ResolvedODR, DefinedGlobals, CfiFunctionDefs, + CfiFunctionDecls); if (AddStreamFn CacheAddStream = Cache(Task, Key)) return RunThinBackend(CacheAddStream); @@ -1151,6 +1172,9 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache) { if (!ModuleToDefinedGVSummaries.count(Mod.first)) ModuleToDefinedGVSummaries.try_emplace(Mod.first); + // Synthesize entry counts for functions in the CombinedIndex. + computeSyntheticCounts(ThinLTO.CombinedIndex); + StringMap ImportLists( ThinLTO.ModuleMap.size()); StringMap ExportLists( @@ -1205,8 +1229,8 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache) { GlobalValue::LinkageTypes NewLinkage) { ResolvedODR[ModuleIdentifier][GUID] = NewLinkage; }; - thinLTOResolveWeakForLinkerInIndex(ThinLTO.CombinedIndex, isPrevailing, - recordNewLinkage); + thinLTOResolvePrevailingInIndex(ThinLTO.CombinedIndex, isPrevailing, + recordNewLinkage); std::unique_ptr BackendProc = ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index 1f9d60a5bdff520c1eb7e5baa7476cfdacfd09cf..926c419e34a84c35a3b9470e7ed29758c420e39e 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -490,7 +490,7 @@ Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream, dropDeadSymbols(Mod, DefinedGlobals, CombinedIndex); - thinLTOResolveWeakForLinkerModule(Mod, DefinedGlobals); + thinLTOResolvePrevailingInModule(Mod, DefinedGlobals); if (Conf.PostPromoteModuleHook && !Conf.PostPromoteModuleHook(Task, Mod)) return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp index 20fc0943539f1bd50349bad9664a493c3ffc45ab..0d40d49dbe394e3281164a1ef999d269c6933162 100644 --- a/lib/LTO/LTOModule.cpp +++ b/lib/LTO/LTOModule.cpp @@ -73,7 +73,7 @@ bool LTOModule::isBitcodeFile(StringRef Path) { bool LTOModule::isThinLTO() { Expected Result = getBitcodeLTOInfo(MBRef); if (!Result) { - logAllUnhandledErrors(Result.takeError(), errs(), ""); + logAllUnhandledErrors(Result.takeError(), errs()); return false; } return Result->IsThinLTO; diff --git a/lib/LTO/SummaryBasedOptimizations.cpp b/lib/LTO/SummaryBasedOptimizations.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8b1abb78462b1ab9044bae7e72c8f31a98f4cd67 --- /dev/null +++ b/lib/LTO/SummaryBasedOptimizations.cpp @@ -0,0 +1,80 @@ +//==-SummaryBasedOptimizations.cpp - Optimizations based on ThinLTO summary-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements optimizations that are based on the module summaries. +// These optimizations are performed during the thinlink phase of the +// compilation. +// +//===----------------------------------------------------------------------===// + +#include "llvm/LTO/SummaryBasedOptimizations.h" +#include "llvm/Analysis/SyntheticCountsUtils.h" +#include "llvm/IR/ModuleSummaryIndex.h" + +using namespace llvm; + +cl::opt ThinLTOSynthesizeEntryCounts( + "thinlto-synthesize-entry-counts", cl::init(false), cl::Hidden, + cl::desc("Synthesize entry counts based on the summary")); + +extern cl::opt InitialSyntheticCount; + +static void initializeCounts(ModuleSummaryIndex &Index) { + auto Root = Index.calculateCallGraphRoot(); + // Root is a fake node. All its successors are the actual roots of the + // callgraph. + // FIXME: This initializes the entry counts of only the root nodes. This makes + // sense when compiling a binary with ThinLTO, but for libraries any of the + // non-root nodes could be called from outside. + for (auto &C : Root.calls()) { + auto &V = C.first; + for (auto &GVS : V.getSummaryList()) { + auto S = GVS.get()->getBaseObject(); + auto *F = cast(S); + F->setEntryCount(InitialSyntheticCount); + } + } +} + +void llvm::computeSyntheticCounts(ModuleSummaryIndex &Index) { + if (!ThinLTOSynthesizeEntryCounts) + return; + + using Scaled64 = ScaledNumber; + initializeCounts(Index); + auto GetCallSiteRelFreq = [](FunctionSummary::EdgeTy &Edge) { + return Scaled64(Edge.second.RelBlockFreq, -CalleeInfo::ScaleShift); + }; + auto GetEntryCount = [](ValueInfo V) { + if (V.getSummaryList().size()) { + auto S = V.getSummaryList().front().get()->getBaseObject(); + auto *F = cast(S); + return F->entryCount(); + } else { + return UINT64_C(0); + } + }; + auto AddToEntryCount = [](ValueInfo V, uint64_t New) { + if (!V.getSummaryList().size()) + return; + for (auto &GVS : V.getSummaryList()) { + auto S = GVS.get()->getBaseObject(); + auto *F = cast(S); + F->setEntryCount(SaturatingAdd(F->entryCount(), New)); + } + }; + + // After initializing the counts in initializeCounts above, the counts have to + // be propagated across the combined callgraph. + // SyntheticCountsUtils::propagate takes care of this propagation on any + // callgraph that specialized GraphTraits. + SyntheticCountsUtils::propagate( + &Index, GetCallSiteRelFreq, GetEntryCount, AddToEntryCount); + Index.setHasSyntheticEntryCounts(); +} diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index 9500b2ded70ea8de4d05c969ede91d1e1800ff65..d9ec68fe3eb50a3234d16e2e40d9715df5e9730f 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" #include "llvm/LTO/LTO.h" +#include "llvm/LTO/SummaryBasedOptimizations.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Object/IRObjectFile.h" #include "llvm/Support/CachePruning.h" @@ -298,8 +299,7 @@ public: const FunctionImporter::ImportMapTy &ImportList, const FunctionImporter::ExportSetTy &ExportList, const std::map &ResolvedODR, - const GVSummaryMapTy &DefinedFunctions, - const DenseSet &PreservedSymbols, unsigned OptLevel, + const GVSummaryMapTy &DefinedGVSummaries, unsigned OptLevel, bool Freestanding, const TargetMachineBuilder &TMBuilder) { if (CachePath.empty()) return; @@ -308,87 +308,26 @@ public: // The module does not have an entry, it can't have a hash at all return; - // Compute the unique hash for this entry - // This is based on the current compiler version, the module itself, the - // export list, the hash for every single module in the import list, the - // list of ResolvedODR for the module, and the list of preserved symbols. - - // Include the hash for the current module - auto ModHash = Index.getModuleHash(ModuleID); - - if (all_of(ModHash, [](uint32_t V) { return V == 0; })) + if (all_of(Index.getModuleHash(ModuleID), + [](uint32_t V) { return V == 0; })) // No hash entry, no caching! return; - SHA1 Hasher; - - // Include the parts of the LTO configuration that affect code generation. - auto AddString = [&](StringRef Str) { - Hasher.update(Str); - Hasher.update(ArrayRef{0}); - }; - auto AddUnsigned = [&](unsigned I) { - uint8_t Data[4]; - Data[0] = I; - Data[1] = I >> 8; - Data[2] = I >> 16; - Data[3] = I >> 24; - Hasher.update(ArrayRef{Data, 4}); - }; - - // Start with the compiler revision - Hasher.update(LLVM_VERSION_STRING); -#ifdef LLVM_REVISION - Hasher.update(LLVM_REVISION); -#endif - - // Hash the optimization level and the target machine settings. - AddString(TMBuilder.MCpu); - // FIXME: Hash more of Options. For now all clients initialize Options from - // command-line flags (which is unsupported in production), but may set - // RelaxELFRelocations. The clang driver can also pass FunctionSections, - // DataSections and DebuggerTuning via command line flags. - AddUnsigned(TMBuilder.Options.RelaxELFRelocations); - AddUnsigned(TMBuilder.Options.FunctionSections); - AddUnsigned(TMBuilder.Options.DataSections); - AddUnsigned((unsigned)TMBuilder.Options.DebuggerTuning); - AddString(TMBuilder.MAttr); - if (TMBuilder.RelocModel) - AddUnsigned(*TMBuilder.RelocModel); - AddUnsigned(TMBuilder.CGOptLevel); - AddUnsigned(OptLevel); - AddUnsigned(Freestanding); - - Hasher.update(ArrayRef((uint8_t *)&ModHash[0], sizeof(ModHash))); - for (auto F : ExportList) - // The export list can impact the internalization, be conservative here - Hasher.update(ArrayRef((uint8_t *)&F, sizeof(F))); - - // Include the hash for every module we import functions from - for (auto &Entry : ImportList) { - auto ModHash = Index.getModuleHash(Entry.first()); - Hasher.update(ArrayRef((uint8_t *)&ModHash[0], sizeof(ModHash))); - } - - // Include the hash for the resolved ODR. - for (auto &Entry : ResolvedODR) { - Hasher.update(ArrayRef((const uint8_t *)&Entry.first, - sizeof(GlobalValue::GUID))); - Hasher.update(ArrayRef((const uint8_t *)&Entry.second, - sizeof(GlobalValue::LinkageTypes))); - } - - // Include the hash for the preserved symbols. - for (auto &Entry : PreservedSymbols) { - if (DefinedFunctions.count(Entry)) - Hasher.update( - ArrayRef((const uint8_t *)&Entry, sizeof(GlobalValue::GUID))); - } + llvm::lto::Config Conf; + Conf.OptLevel = OptLevel; + Conf.Options = TMBuilder.Options; + Conf.CPU = TMBuilder.MCpu; + Conf.MAttrs.push_back(TMBuilder.MAttr); + Conf.RelocModel = TMBuilder.RelocModel; + Conf.CGOptLevel = TMBuilder.CGOptLevel; + Conf.Freestanding = Freestanding; + SmallString<40> Key; + computeLTOCacheKey(Key, Conf, Index, ModuleID, ImportList, ExportList, + ResolvedODR, DefinedGVSummaries); // This choice of file name allows the cache to be pruned (see pruneCache() // in include/llvm/Support/CachePruning.h). - sys::path::append(EntryPath, CachePath, - "llvmcache-" + toHex(Hasher.result())); + sys::path::append(EntryPath, CachePath, "llvmcache-" + Key); } // Access the path to this entry in the cache. @@ -457,8 +396,8 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index, if (!SingleModule) { promoteModule(TheModule, Index); - // Apply summary-based LinkOnce/Weak resolution decisions. - thinLTOResolveWeakForLinkerModule(TheModule, DefinedGlobals); + // Apply summary-based prevailing-symbol resolution decisions. + thinLTOResolvePrevailingInModule(TheModule, DefinedGlobals); // Save temps: after promotion. saveTempBitcode(TheModule, SaveTempsDir, count, ".1.promoted.bc"); @@ -500,12 +439,12 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index, return codegenModule(TheModule, TM); } -/// Resolve LinkOnce/Weak symbols. Record resolutions in the \p ResolvedODR map +/// Resolve prevailing symbols. Record resolutions in the \p ResolvedODR map /// for caching, and in the \p Index for application during the ThinLTO /// backends. This is needed for correctness for exported symbols (ensure /// at least one copy kept) and a compile-time optimization (to drop duplicate /// copies when possible). -static void resolveWeakForLinkerInIndex( +static void resolvePrevailingInIndex( ModuleSummaryIndex &Index, StringMap> &ResolvedODR) { @@ -527,7 +466,7 @@ static void resolveWeakForLinkerInIndex( ResolvedODR[ModuleIdentifier][GUID] = NewLinkage; }; - thinLTOResolveWeakForLinkerInIndex(Index, isPrevailing, recordNewLinkage); + thinLTOResolvePrevailingInIndex(Index, isPrevailing, recordNewLinkage); } // Initialize the TargetMachine builder for a given Triple @@ -646,7 +585,8 @@ static void computeDeadSymbolsInIndex( auto isPrevailing = [&](GlobalValue::GUID G) { return PrevailingType::Unknown; }; - computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing); + computeDeadSymbolsWithConstProp(Index, GUIDPreservedSymbols, isPrevailing, + /* ImportEnabled = */ true); } /** @@ -675,11 +615,11 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, ExportLists); - // Resolve LinkOnce/Weak symbols. + // Resolve prevailing symbols StringMap> ResolvedODR; - resolveWeakForLinkerInIndex(Index, ResolvedODR); + resolvePrevailingInIndex(Index, ResolvedODR); - thinLTOResolveWeakForLinkerModule( + thinLTOResolvePrevailingInModule( TheModule, ModuleToDefinedGVSummaries[ModuleIdentifier]); // Promote the exported values in the index, so that they are promoted @@ -722,37 +662,52 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule, * Compute the list of summaries needed for importing into module. */ void ThinLTOCodeGenerator::gatherImportedSummariesForModule( - StringRef ModulePath, ModuleSummaryIndex &Index, + Module &TheModule, ModuleSummaryIndex &Index, std::map &ModuleToSummariesForIndex) { auto ModuleCount = Index.modulePaths().size(); + auto ModuleIdentifier = TheModule.getModuleIdentifier(); // Collect for each module the list of function it defines (GUID -> Summary). StringMap ModuleToDefinedGVSummaries(ModuleCount); Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Convert the preserved symbols set from string to GUID + auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( + PreservedSymbols, Triple(TheModule.getTargetTriple())); + + // Compute "dead" symbols, we don't want to import/export these! + computeDeadSymbolsInIndex(Index, GUIDPreservedSymbols); + // Generate import/export list StringMap ImportLists(ModuleCount); StringMap ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, ExportLists); - llvm::gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries, - ImportLists[ModulePath], - ModuleToSummariesForIndex); + llvm::gatherImportedSummariesForModule( + ModuleIdentifier, ModuleToDefinedGVSummaries, + ImportLists[ModuleIdentifier], ModuleToSummariesForIndex); } /** * Emit the list of files needed for importing into module. */ -void ThinLTOCodeGenerator::emitImports(StringRef ModulePath, - StringRef OutputName, +void ThinLTOCodeGenerator::emitImports(Module &TheModule, StringRef OutputName, ModuleSummaryIndex &Index) { auto ModuleCount = Index.modulePaths().size(); + auto ModuleIdentifier = TheModule.getModuleIdentifier(); // Collect for each module the list of function it defines (GUID -> Summary). StringMap ModuleToDefinedGVSummaries(ModuleCount); Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Convert the preserved symbols set from string to GUID + auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( + PreservedSymbols, Triple(TheModule.getTargetTriple())); + + // Compute "dead" symbols, we don't want to import/export these! + computeDeadSymbolsInIndex(Index, GUIDPreservedSymbols); + // Generate import/export list StringMap ImportLists(ModuleCount); StringMap ExportLists(ModuleCount); @@ -760,13 +715,13 @@ void ThinLTOCodeGenerator::emitImports(StringRef ModulePath, ExportLists); std::map ModuleToSummariesForIndex; - llvm::gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries, - ImportLists[ModulePath], - ModuleToSummariesForIndex); + llvm::gatherImportedSummariesForModule( + ModuleIdentifier, ModuleToDefinedGVSummaries, + ImportLists[ModuleIdentifier], ModuleToSummariesForIndex); std::error_code EC; - if ((EC = - EmitImportsFiles(ModulePath, OutputName, ModuleToSummariesForIndex))) + if ((EC = EmitImportsFiles(ModuleIdentifier, OutputName, + ModuleToSummariesForIndex))) report_fatal_error(Twine("Failed to open ") + OutputName + " to save imports lists\n"); } @@ -929,6 +884,9 @@ void ThinLTOCodeGenerator::run() { // Compute "dead" symbols, we don't want to import/export these! computeDeadSymbolsInIndex(*Index, GUIDPreservedSymbols); + // Synthesize entry counts for functions in the combined index. + computeSyntheticCounts(*Index); + // Collect the import/export lists for all modules from the call-graph in the // combined index. StringMap ImportLists(ModuleCount); @@ -942,9 +900,9 @@ void ThinLTOCodeGenerator::run() { // on the index, and nuke this map. StringMap> ResolvedODR; - // Resolve LinkOnce/Weak symbols, this has to be computed early because it + // Resolve prevailing symbols, this has to be computed early because it // impacts the caching. - resolveWeakForLinkerInIndex(*Index, ResolvedODR); + resolvePrevailingInIndex(*Index, ResolvedODR); // Use global summary-based analysis to identify symbols that can be // internalized (because they aren't exported or preserved as per callback). @@ -983,14 +941,14 @@ void ThinLTOCodeGenerator::run() { auto ModuleIdentifier = ModuleBuffer.getBufferIdentifier(); auto &ExportList = ExportLists[ModuleIdentifier]; - auto &DefinedFunctions = ModuleToDefinedGVSummaries[ModuleIdentifier]; + auto &DefinedGVSummaries = ModuleToDefinedGVSummaries[ModuleIdentifier]; // The module may be cached, this helps handling it. ModuleCacheEntry CacheEntry(CacheOptions.Path, *Index, ModuleIdentifier, ImportLists[ModuleIdentifier], ExportList, ResolvedODR[ModuleIdentifier], - DefinedFunctions, GUIDPreservedSymbols, - OptLevel, Freestanding, TMBuilder); + DefinedGVSummaries, OptLevel, Freestanding, + TMBuilder); auto CacheEntryPath = CacheEntry.getEntryPath(); { diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp index b304bfc401aa0497e608178ab124a23b153539a7..afbc57abfcc08da06c0afe775e550353af5ce4cb 100644 --- a/lib/Linker/IRMover.cpp +++ b/lib/Linker/IRMover.cpp @@ -1062,10 +1062,15 @@ void IRLinker::prepareCompileUnitsForImport() { ValueMap.MD()[CU->getRawEnumTypes()].reset(nullptr); ValueMap.MD()[CU->getRawMacros()].reset(nullptr); ValueMap.MD()[CU->getRawRetainedTypes()].reset(nullptr); - // We import global variables only temporarily in order for instcombine - // and globalopt to perform constant folding and static constructor - // evaluation. After that elim-avail-extern will covert imported globals - // back to declarations, so we don't need debug info for them. + // The original definition (or at least its debug info - if the variable is + // internalized an optimized away) will remain in the source module, so + // there's no need to import them. + // If LLVM ever does more advanced optimizations on global variables + // (removing/localizing write operations, for instance) that can track + // through debug info, this decision may need to be revisited - but do so + // with care when it comes to debug info size. Emitting small CUs containing + // only a few imported entities into every destination module may be very + // size inefficient. ValueMap.MD()[CU->getRawGlobalVariables()].reset(nullptr); // Imported entities only need to be mapped in if they have local diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index 5b8b013ad0875a4bdc3020764950db96f6c59605..89f3b30cddd6f44087340e3ab40d84c4e51ee1a1 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -669,6 +669,20 @@ void ELFWriter::computeSymbolTable( } else { const MCSectionELF &Section = static_cast(Symbol.getSection()); + + // We may end up with a situation when section symbol is technically + // defined, but should not be. That happens because we explicitly + // pre-create few .debug_* sections to have accessors. + // And if these sections were not really defined in the code, but were + // referenced, we simply error out. + if (!Section.isRegistered()) { + assert(static_cast(Symbol).getType() == + ELF::STT_SECTION); + Ctx.reportError(SMLoc(), + "Undefined section reference: " + Symbol.getName()); + continue; + } + if (Mode == NonDwoOnly && isDwoSection(Section)) continue; MSD.SectionIndex = SectionIndexMap.lookup(&Section); diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 463e9066616d81ba35897522889dde5299cc7045..fd10c6901b732899bf4c266f66e430f6df74ab25 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -147,9 +147,9 @@ public: void EmitLinkerOptions(ArrayRef Options) override; void EmitDataRegion(MCDataRegionType Kind) override; void EmitVersionMin(MCVersionMinType Kind, unsigned Major, unsigned Minor, - unsigned Update) override; + unsigned Update, VersionTuple SDKVersion) override; void EmitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor, - unsigned Update) override; + unsigned Update, VersionTuple SDKVersion) override; void EmitThumbFunc(MCSymbol *Func) override; void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; @@ -285,6 +285,7 @@ public: void EmitCFIUndefined(int64_t Register) override; void EmitCFIRegister(int64_t Register1, int64_t Register2) override; void EmitCFIWindowSave() override; + void EmitCFINegateRAState() override; void EmitCFIReturnColumn(int64_t Register) override; void EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) override; @@ -514,11 +515,26 @@ static const char *getVersionMinDirective(MCVersionMinType Type) { llvm_unreachable("Invalid MC version min type"); } +static void EmitSDKVersionSuffix(raw_ostream &OS, + const VersionTuple &SDKVersion) { + if (SDKVersion.empty()) + return; + OS << '\t' << "sdk_version " << SDKVersion.getMajor(); + if (auto Minor = SDKVersion.getMinor()) { + OS << ", " << *Minor; + if (auto Subminor = SDKVersion.getSubminor()) { + OS << ", " << *Subminor; + } + } +} + void MCAsmStreamer::EmitVersionMin(MCVersionMinType Type, unsigned Major, - unsigned Minor, unsigned Update) { + unsigned Minor, unsigned Update, + VersionTuple SDKVersion) { OS << '\t' << getVersionMinDirective(Type) << ' ' << Major << ", " << Minor; if (Update) OS << ", " << Update; + EmitSDKVersionSuffix(OS, SDKVersion); EmitEOL(); } @@ -534,11 +550,13 @@ static const char *getPlatformName(MachO::PlatformType Type) { } void MCAsmStreamer::EmitBuildVersion(unsigned Platform, unsigned Major, - unsigned Minor, unsigned Update) { + unsigned Minor, unsigned Update, + VersionTuple SDKVersion) { const char *PlatformName = getPlatformName((MachO::PlatformType)Platform); OS << "\t.build_version " << PlatformName << ", " << Major << ", " << Minor; if (Update) OS << ", " << Update; + EmitSDKVersionSuffix(OS, SDKVersion); EmitEOL(); } @@ -1569,6 +1587,12 @@ void MCAsmStreamer::EmitCFIWindowSave() { EmitEOL(); } +void MCAsmStreamer::EmitCFINegateRAState() { + MCStreamer::EmitCFINegateRAState(); + OS << "\t.cfi_negate_ra_state"; + EmitEOL(); +} + void MCAsmStreamer::EmitCFIReturnColumn(int64_t Register) { MCStreamer::EmitCFIReturnColumn(Register); OS << "\t.cfi_return_column " << Register; diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 1e23b6d816e8a083d946cc0a351515a2316746a8..cde6a93a164729636b8671cd48634bbc028219aa 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -111,6 +111,7 @@ void MCAssembler::reset() { ELFHeaderEFlags = 0; LOHContainer.reset(); VersionInfo.Major = 0; + VersionInfo.SDKVersion = VersionTuple(); // reset objects owned by us if (getBackendPtr()) diff --git a/lib/MC/MCCodeView.cpp b/lib/MC/MCCodeView.cpp index 234b43e3d4962b31db7f4988ed448a0f23608b4f..978ac789c31e118afca58925c40b0b9115b04d13 100644 --- a/lib/MC/MCCodeView.cpp +++ b/lib/MC/MCCodeView.cpp @@ -432,13 +432,13 @@ void CodeViewContext::emitInlineLineTableForFunction(MCObjectStreamer &OS, OS.getCurrentSectionOnly()); } -void CodeViewContext::emitDefRange( +MCFragment *CodeViewContext::emitDefRange( MCObjectStreamer &OS, ArrayRef> Ranges, StringRef FixedSizePortion) { // Create and insert a fragment into the current section that will be encoded // later. - new MCCVDefRangeFragment(Ranges, FixedSizePortion, + return new MCCVDefRangeFragment(Ranges, FixedSizePortion, OS.getCurrentSectionOnly()); } diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index 7093446815f3787d93e86f4153587b41217c66b3..9791fb5724b1365219dbce1f2c3abaf419cf9558 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -1332,6 +1332,10 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) { Streamer.EmitIntValue(dwarf::DW_CFA_GNU_window_save, 1); return; + case MCCFIInstruction::OpNegateRAState: + Streamer.EmitIntValue(dwarf::DW_CFA_AARCH64_negate_ra_state, 1); + return; + case MCCFIInstruction::OpUndefined: { unsigned Reg = Instr.getRegister(); Streamer.EmitIntValue(dwarf::DW_CFA_undefined, 1); @@ -1418,7 +1422,12 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) { unsigned Reg = Instr.getRegister(); if (!IsEH) Reg = MRI->getDwarfRegNumFromDwarfEHRegNum(Reg); - Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1); + if (Reg < 64) { + Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1); + } else { + Streamer.EmitIntValue(dwarf::DW_CFA_restore_extended, 1); + Streamer.EmitULEB128IntValue(Reg); + } return; } case MCCFIInstruction::OpGnuArgsSize: diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index 38f311be7c66b49fac14355c6716b6bbfaa30abc..3c022199145fbafe128f4ce4368cc546c969486f 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -306,6 +306,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_WebAssembly_FUNCTION: return "FUNCTION"; case VK_WebAssembly_GLOBAL: return "GLOBAL"; case VK_WebAssembly_TYPEINDEX: return "TYPEINDEX"; + case VK_WebAssembly_EVENT: return "EVENT"; case VK_AMDGPU_GOTPCREL32_LO: return "gotpcrel32@lo"; case VK_AMDGPU_GOTPCREL32_HI: return "gotpcrel32@hi"; case VK_AMDGPU_REL32_LO: return "rel32@lo"; @@ -421,6 +422,7 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) { .Case("function", VK_WebAssembly_FUNCTION) .Case("global", VK_WebAssembly_GLOBAL) .Case("typeindex", VK_WebAssembly_TYPEINDEX) + .Case("event", VK_WebAssembly_EVENT) .Case("gotpcrel32@lo", VK_AMDGPU_GOTPCREL32_LO) .Case("gotpcrel32@hi", VK_AMDGPU_GOTPCREL32_HI) .Case("rel32@lo", VK_AMDGPU_REL32_LO) diff --git a/lib/MC/MCFragment.cpp b/lib/MC/MCFragment.cpp index 0ebcf21a422e3f2fae407a38357b54e270279333..d22b117972bf230c38d62ab70f110de947673203 100644 --- a/lib/MC/MCFragment.cpp +++ b/lib/MC/MCFragment.cpp @@ -237,8 +237,8 @@ MCFragment::~MCFragment() = default; MCFragment::MCFragment(FragmentType Kind, bool HasInstructions, MCSection *Parent) - : Kind(Kind), HasInstructions(HasInstructions), Parent(Parent), - Atom(nullptr), Offset(~UINT64_C(0)) { + : Kind(Kind), HasInstructions(HasInstructions), LayoutOrder(0), + Parent(Parent), Atom(nullptr), Offset(~UINT64_C(0)) { if (Parent && !isDummy()) Parent->getFragmentList().push_back(this); } diff --git a/lib/MC/MCInst.cpp b/lib/MC/MCInst.cpp index f9b71caaf91c997ba0dd4566e96b3d77c05240ef..64f111fc7114847f4a39c5a60b16b7349185d8a2 100644 --- a/lib/MC/MCInst.cpp +++ b/lib/MC/MCInst.cpp @@ -72,11 +72,17 @@ void MCInst::print(raw_ostream &OS) const { void MCInst::dump_pretty(raw_ostream &OS, const MCInstPrinter *Printer, StringRef Separator) const { + StringRef InstName = Printer ? Printer->getOpcodeName(getOpcode()) : ""; + dump_pretty(OS, InstName, Separator); +} + +void MCInst::dump_pretty(raw_ostream &OS, StringRef Name, + StringRef Separator) const { OS << "getOpcodeName(getOpcode()); + // Show the instruction opcode name if we have it. + if (!Name.empty()) + OS << ' ' << Name; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { OS << Separator; diff --git a/lib/MC/MCInstrDesc.cpp b/lib/MC/MCInstrDesc.cpp index ee55f3eff3ac739d81532714e8aa644d4b7e8749..53cba864a85d7b893b6d6406dd115e7119fc70a4 100644 --- a/lib/MC/MCInstrDesc.cpp +++ b/lib/MC/MCInstrDesc.cpp @@ -39,15 +39,6 @@ bool MCInstrDesc::mayAffectControlFlow(const MCInst &MI, return false; if (hasDefOfPhysReg(MI, PC, RI)) return true; - // A variadic instruction may define PC in the variable operand list. - // There's currently no indication of which entries in a variable - // list are defs and which are uses. While that's the case, this function - // needs to assume they're defs in order to be conservatively correct. - for (int i = NumOperands, e = MI.getNumOperands(); i != e; ++i) { - if (MI.getOperand(i).isReg() && - RI.isSubRegisterEq(PC, MI.getOperand(i).getReg())) - return true; - } return false; } @@ -66,5 +57,10 @@ bool MCInstrDesc::hasDefOfPhysReg(const MCInst &MI, unsigned Reg, if (MI.getOperand(i).isReg() && RI.isSubRegisterEq(Reg, MI.getOperand(i).getReg())) return true; + if (variadicOpsAreDefs()) + for (int i = NumOperands - 1, e = MI.getNumOperands(); i != e; ++i) + if (MI.getOperand(i).isReg() && + RI.isSubRegisterEq(Reg, MI.getOperand(i).getReg())) + return true; return hasImplicitDefOfPhysReg(Reg, &RI); } diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index 43e69605787cd3394d00c6b4302a1cf4c4b75b42..b30317e74672bb3198a1f6aa8efad4e61411fcf9 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -89,10 +89,10 @@ public: void EmitAssemblerFlag(MCAssemblerFlag Flag) override; void EmitLinkerOptions(ArrayRef Options) override; void EmitDataRegion(MCDataRegionType Kind) override; - void EmitVersionMin(MCVersionMinType Kind, unsigned Major, - unsigned Minor, unsigned Update) override; - void EmitBuildVersion(unsigned Platform, unsigned Major, - unsigned Minor, unsigned Update) override; + void EmitVersionMin(MCVersionMinType Kind, unsigned Major, unsigned Minor, + unsigned Update, VersionTuple SDKVersion) override; + void EmitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor, + unsigned Update, VersionTuple SDKVersion) override; void EmitThumbFunc(MCSymbol *Func) override; bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override; @@ -270,14 +270,16 @@ void MCMachOStreamer::EmitDataRegion(MCDataRegionType Kind) { } void MCMachOStreamer::EmitVersionMin(MCVersionMinType Kind, unsigned Major, - unsigned Minor, unsigned Update) { - getAssembler().setVersionMin(Kind, Major, Minor, Update); + unsigned Minor, unsigned Update, + VersionTuple SDKVersion) { + getAssembler().setVersionMin(Kind, Major, Minor, Update, SDKVersion); } void MCMachOStreamer::EmitBuildVersion(unsigned Platform, unsigned Major, - unsigned Minor, unsigned Update) { + unsigned Minor, unsigned Update, + VersionTuple SDKVersion) { getAssembler().setBuildVersion((MachO::PlatformType)Platform, Major, Minor, - Update); + Update, SDKVersion); } void MCMachOStreamer::EmitThumbFunc(MCSymbol *Symbol) { @@ -507,7 +509,7 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context, new MCMachOStreamer(Context, std::move(MAB), std::move(OW), std::move(CE), DWARFMustBeAtTheEnd, LabelSections); const Triple &Target = Context.getObjectFileInfo()->getTargetTriple(); - S->EmitVersionForTarget(Target); + S->EmitVersionForTarget(Target, Context.getObjectFileInfo()->getSDKVersion()); if (RelaxAll) S->getAssembler().setRelaxAll(true); return S; diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index ab8e0f31db9e7a8faa482fbea124cdef8f8738c8..9e35355d06e0a50e4825fd1f927a6e13b266012c 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -479,10 +479,10 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { } void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { - EHFrameSection = Ctx->getCOFFSection( - ".eh_frame", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getData()); + EHFrameSection = + Ctx->getCOFFSection(".eh_frame", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getData()); // Set the `IMAGE_SCN_MEM_16BIT` flag when compiling for thumb mode. This is // used to indicate to the linker that the text segment contains thumb instructions @@ -510,11 +510,7 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { ".rdata", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); - // FIXME: We're emitting LSDA info into a readonly section on COFF, even - // though it contains relocatable pointers. In PIC mode, this is probably a - // big runtime hit for C++ apps. Either the contents of the LSDA need to be - // adjusted or this should be a data section. - if (T.getArch() == Triple::x86_64) { + if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::aarch64) { // On Windows 64 with SEH, the LSDA is emitted into the .xdata section LSDASection = nullptr; } else { @@ -812,16 +808,17 @@ void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, bool PIC, } } -MCSection *MCObjectFileInfo::getDwarfTypesSection(uint64_t Hash) const { +MCSection *MCObjectFileInfo::getDwarfComdatSection(const char *Name, + uint64_t Hash) const { switch (TT.getObjectFormat()) { case Triple::ELF: - return Ctx->getELFSection(".debug_types", ELF::SHT_PROGBITS, ELF::SHF_GROUP, - 0, utostr(Hash)); + return Ctx->getELFSection(Name, ELF::SHT_PROGBITS, ELF::SHF_GROUP, 0, + utostr(Hash)); case Triple::MachO: case Triple::COFF: case Triple::Wasm: case Triple::UnknownObjectFormat: - report_fatal_error("Cannot get DWARF types section for this object file " + report_fatal_error("Cannot get DWARF comdat section for this object file " "format: not implemented."); break; } diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index 8c88db009bd480c7f8031213bd56581ea027b63e..6ec705bdddb73451aa0169e6df9847a7881b14ef 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -59,6 +59,27 @@ void MCObjectStreamer::flushPendingLabels(MCFragment *F, uint64_t FOffset) { PendingLabels.clear(); } +// When fixup's offset is a forward declared label, e.g.: +// +// .reloc 1f, R_MIPS_JALR, foo +// 1: nop +// +// postpone adding it to Fixups vector until the label is defined and its offset +// is known. +void MCObjectStreamer::resolvePendingFixups() { + for (PendingMCFixup &PendingFixup : PendingFixups) { + if (!PendingFixup.Sym || PendingFixup.Sym->isUndefined ()) { + getContext().reportError(PendingFixup.Fixup.getLoc(), + "unresolved relocation offset"); + continue; + } + flushPendingLabels(PendingFixup.DF, PendingFixup.DF->getContents().size()); + PendingFixup.Fixup.setOffset(PendingFixup.Sym->getOffset()); + PendingFixup.DF->getFixups().push_back(PendingFixup.Fixup); + } + PendingFixups.clear(); +} + // As a compile-time optimization, avoid allocating and evaluating an MCExpr // tree for (Hi - Lo) when Hi and Lo are offsets into the same fragment. static Optional @@ -476,7 +497,11 @@ void MCObjectStreamer::EmitCVInlineLinetableDirective( void MCObjectStreamer::EmitCVDefRangeDirective( ArrayRef> Ranges, StringRef FixedSizePortion) { - getContext().getCVContext().emitDefRange(*this, Ranges, FixedSizePortion); + MCFragment *Frag = + getContext().getCVContext().emitDefRange(*this, Ranges, FixedSizePortion); + // Attach labels that were pending before we created the defrange fragment to + // the beginning of the new fragment. + flushPendingLabels(Frag, 0); this->MCStreamer::EmitCVDefRangeDirective(Ranges, FixedSizePortion); } @@ -603,16 +628,6 @@ void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) { bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name, const MCExpr *Expr, SMLoc Loc, const MCSubtargetInfo &STI) { - int64_t OffsetValue; - if (!Offset.evaluateAsAbsolute(OffsetValue)) - llvm_unreachable("Offset is not absolute"); - - if (OffsetValue < 0) - llvm_unreachable("Offset is negative"); - - MCDataFragment *DF = getOrCreateDataFragment(&STI); - flushPendingLabels(DF, DF->getContents().size()); - Optional MaybeKind = Assembler->getBackend().getFixupKind(Name); if (!MaybeKind.hasValue()) return true; @@ -622,7 +637,30 @@ bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name, if (Expr == nullptr) Expr = MCSymbolRefExpr::create(getContext().createTempSymbol(), getContext()); - DF->getFixups().push_back(MCFixup::create(OffsetValue, Expr, Kind, Loc)); + + MCDataFragment *DF = getOrCreateDataFragment(&STI); + flushPendingLabels(DF, DF->getContents().size()); + + int64_t OffsetValue; + if (Offset.evaluateAsAbsolute(OffsetValue)) { + if (OffsetValue < 0) + llvm_unreachable(".reloc offset is negative"); + DF->getFixups().push_back(MCFixup::create(OffsetValue, Expr, Kind, Loc)); + return false; + } + + if (Offset.getKind() != llvm::MCExpr::SymbolRef) + llvm_unreachable(".reloc offset is not absolute nor a label"); + + const MCSymbolRefExpr &SRE = cast(Offset); + if (SRE.getSymbol().isDefined()) { + DF->getFixups().push_back(MCFixup::create(SRE.getSymbol().getOffset(), + Expr, Kind, Loc)); + return false; + } + + PendingFixups.emplace_back(&SRE.getSymbol(), DF, + MCFixup::create(-1, Expr, Kind, Loc)); return false; } @@ -689,5 +727,6 @@ void MCObjectStreamer::FinishImpl() { MCDwarfLineTable::Emit(this, getAssembler().getDWARFLinetableParams()); flushPendingLabels(); + resolvePendingFixups(); getAssembler().Finish(); } diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp index c8d48f033f65f6a958568a9e956a018d5fde4341..2b0d20f9b8e2efc10507323dc1cd80edcb7ce613 100644 --- a/lib/MC/MCParser/AsmLexer.cpp +++ b/lib/MC/MCParser/AsmLexer.cpp @@ -627,7 +627,6 @@ AsmToken AsmLexer::LexToken() { return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); - case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); @@ -646,6 +645,12 @@ AsmToken AsmLexer::LexToken() { return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); } return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); + case '-': + if (*CurPtr == '>') { + ++CurPtr; + return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2)); + } + return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); case '|': if (*CurPtr == '|') { ++CurPtr; diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 3f7b507791eca9ec957e2b7fde49db974cbd6307..aa07eee44916483c4e6b5fbb27268d1d010241c6 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -675,6 +675,7 @@ namespace llvm { extern MCAsmParserExtension *createDarwinAsmParser(); extern MCAsmParserExtension *createELFAsmParser(); extern MCAsmParserExtension *createCOFFAsmParser(); +extern MCAsmParserExtension *createWasmAsmParser(); } // end namespace llvm @@ -705,10 +706,7 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out, PlatformParser.reset(createELFAsmParser()); break; case MCObjectFileInfo::IsWasm: - // TODO: WASM will need its own MCAsmParserExtension implementation, but - // for now we can re-use the ELF one, since the directives can be the - // same for now. - PlatformParser.reset(createELFAsmParser()); + PlatformParser.reset(createWasmAsmParser()); break; } @@ -2940,20 +2938,20 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) { bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) { const MCExpr *Offset; const MCExpr *Expr = nullptr; - - SMLoc OffsetLoc = Lexer.getTok().getLoc(); int64_t OffsetValue; - // We can only deal with constant expressions at the moment. + SMLoc OffsetLoc = Lexer.getTok().getLoc(); if (parseExpression(Offset)) return true; - if (check(!Offset->evaluateAsAbsolute(OffsetValue, - getStreamer().getAssemblerPtr()), - OffsetLoc, "expression is not a constant value") || - check(OffsetValue < 0, OffsetLoc, "expression is negative") || - parseToken(AsmToken::Comma, "expected comma") || - check(getTok().isNot(AsmToken::Identifier), "expected relocation name")) + if ((Offset->evaluateAsAbsolute(OffsetValue, + getStreamer().getAssemblerPtr()) && + check(OffsetValue < 0, OffsetLoc, "expression is negative")) || + (check(Offset->getKind() != llvm::MCExpr::Constant && + Offset->getKind() != llvm::MCExpr::SymbolRef, + OffsetLoc, "expected non-negative number or a label")) || + (parseToken(AsmToken::Comma, "expected comma") || + check(getTok().isNot(AsmToken::Identifier), "expected relocation name"))) return true; SMLoc NameLoc = Lexer.getTok().getLoc(); @@ -3921,7 +3919,7 @@ bool AsmParser::parseDirectiveCFIStartProc() { parseToken(AsmToken::EndOfStatement)) return addErrorSuffix(" in '.cfi_startproc' directive"); } - + // TODO(kristina): Deal with a corner case of incorrect diagnostic context // being produced if this directive is emitted as part of preprocessor macro // expansion which can *ONLY* happen if Clang's cc1as is the API consumer. diff --git a/lib/MC/MCParser/CMakeLists.txt b/lib/MC/MCParser/CMakeLists.txt index 99fdd0167993780f81c43be5503cf260ba2f240f..0c54e8e901931897ffe4e9fa118ef07a4a9d8701 100644 --- a/lib/MC/MCParser/CMakeLists.txt +++ b/lib/MC/MCParser/CMakeLists.txt @@ -8,6 +8,7 @@ add_llvm_library(LLVMMCParser MCAsmParser.cpp MCAsmParserExtension.cpp MCTargetAsmParser.cpp + WasmAsmParser.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/MC/MCParser diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp index e6fc1fac81ba07f9a769200b65ee9832c59d6239..07926d6c1c9ab7840f17c93f8e7fcce75fed1e44 100644 --- a/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -459,7 +459,12 @@ public: bool parseBuildVersion(StringRef Directive, SMLoc Loc); bool parseVersionMin(StringRef Directive, SMLoc Loc, MCVersionMinType Type); + bool parseMajorMinorVersionComponent(unsigned *Major, unsigned *Minor, + const char *VersionName); + bool parseOptionalTrailingVersionComponent(unsigned *Component, + const char *ComponentName); bool parseVersion(unsigned *Major, unsigned *Minor, unsigned *Update); + bool parseSDKVersion(VersionTuple &SDKVersion); void checkVersion(StringRef Directive, StringRef Arg, SMLoc Loc, Triple::OSType ExpectedOS); }; @@ -1000,43 +1005,89 @@ bool DarwinAsmParser::parseDirectiveDataRegionEnd(StringRef, SMLoc) { return false; } -/// parseVersion ::= major, minor [, update] -bool DarwinAsmParser::parseVersion(unsigned *Major, unsigned *Minor, - unsigned *Update) { +static bool isSDKVersionToken(const AsmToken &Tok) { + return Tok.is(AsmToken::Identifier) && Tok.getIdentifier() == "sdk_version"; +} + +/// parseMajorMinorVersionComponent ::= major, minor +bool DarwinAsmParser::parseMajorMinorVersionComponent(unsigned *Major, + unsigned *Minor, + const char *VersionName) { // Get the major version number. if (getLexer().isNot(AsmToken::Integer)) - return TokError("invalid OS major version number, integer expected"); + return TokError(Twine("invalid ") + VersionName + + " major version number, integer expected"); int64_t MajorVal = getLexer().getTok().getIntVal(); if (MajorVal > 65535 || MajorVal <= 0) - return TokError("invalid OS major version number"); + return TokError(Twine("invalid ") + VersionName + " major version number"); *Major = (unsigned)MajorVal; Lex(); if (getLexer().isNot(AsmToken::Comma)) - return TokError("OS minor version number required, comma expected"); + return TokError(Twine(VersionName) + + " minor version number required, comma expected"); Lex(); // Get the minor version number. if (getLexer().isNot(AsmToken::Integer)) - return TokError("invalid OS minor version number, integer expected"); + return TokError(Twine("invalid ") + VersionName + + " minor version number, integer expected"); int64_t MinorVal = getLexer().getTok().getIntVal(); if (MinorVal > 255 || MinorVal < 0) - return TokError("invalid OS minor version number"); + return TokError(Twine("invalid ") + VersionName + " minor version number"); *Minor = MinorVal; Lex(); + return false; +} + +/// parseOptionalTrailingVersionComponent ::= , version_number +bool DarwinAsmParser::parseOptionalTrailingVersionComponent( + unsigned *Component, const char *ComponentName) { + assert(getLexer().is(AsmToken::Comma) && "comma expected"); + Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return TokError(Twine("invalid ") + ComponentName + + " version number, integer expected"); + int64_t Val = getLexer().getTok().getIntVal(); + if (Val > 255 || Val < 0) + return TokError(Twine("invalid ") + ComponentName + " version number"); + *Component = Val; + Lex(); + return false; +} + +/// parseVersion ::= parseMajorMinorVersionComponent +/// parseOptionalTrailingVersionComponent +bool DarwinAsmParser::parseVersion(unsigned *Major, unsigned *Minor, + unsigned *Update) { + if (parseMajorMinorVersionComponent(Major, Minor, "OS")) + return true; // Get the update level, if specified *Update = 0; - if (getLexer().is(AsmToken::EndOfStatement)) + if (getLexer().is(AsmToken::EndOfStatement) || + isSDKVersionToken(getLexer().getTok())) return false; if (getLexer().isNot(AsmToken::Comma)) return TokError("invalid OS update specifier, comma expected"); + if (parseOptionalTrailingVersionComponent(Update, "OS update")) + return true; + return false; +} + +bool DarwinAsmParser::parseSDKVersion(VersionTuple &SDKVersion) { + assert(isSDKVersionToken(getLexer().getTok()) && "expected sdk_version"); Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return TokError("invalid OS update version number, integer expected"); - int64_t UpdateVal = getLexer().getTok().getIntVal(); - if (UpdateVal > 255 || UpdateVal < 0) - return TokError("invalid OS update version number"); - *Update = UpdateVal; - Lex(); + unsigned Major, Minor; + if (parseMajorMinorVersionComponent(&Major, &Minor, "SDK")) + return true; + SDKVersion = VersionTuple(Major, Minor); + + // Get the subminor version, if specified. + if (getLexer().is(AsmToken::Comma)) { + unsigned Subminor; + if (parseOptionalTrailingVersionComponent(&Subminor, "SDK subminor")) + return true; + SDKVersion = VersionTuple(Major, Minor, Subminor); + } return false; } @@ -1066,10 +1117,10 @@ static Triple::OSType getOSTypeFromMCVM(MCVersionMinType Type) { } /// parseVersionMin -/// ::= .ios_version_min parseVersion -/// | .macosx_version_min parseVersion -/// | .tvos_version_min parseVersion -/// | .watchos_version_min parseVersion +/// ::= .ios_version_min parseVersion parseSDKVersion +/// | .macosx_version_min parseVersion parseSDKVersion +/// | .tvos_version_min parseVersion parseSDKVersion +/// | .watchos_version_min parseVersion parseSDKVersion bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc, MCVersionMinType Type) { unsigned Major; @@ -1078,13 +1129,16 @@ bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc, if (parseVersion(&Major, &Minor, &Update)) return true; + VersionTuple SDKVersion; + if (isSDKVersionToken(getLexer().getTok()) && parseSDKVersion(SDKVersion)) + return true; + if (parseToken(AsmToken::EndOfStatement)) return addErrorSuffix(Twine(" in '") + Directive + "' directive"); Triple::OSType ExpectedOS = getOSTypeFromMCVM(Type); checkVersion(Directive, StringRef(), Loc, ExpectedOS); - - getStreamer().EmitVersionMin(Type, Major, Minor, Update); + getStreamer().EmitVersionMin(Type, Major, Minor, Update, SDKVersion); return false; } @@ -1100,7 +1154,7 @@ static Triple::OSType getOSTypeFromPlatform(MachO::PlatformType Type) { } /// parseBuildVersion -/// ::= .build_version (macos|ios|tvos|watchos), parseVersion +/// ::= .build_version (macos|ios|tvos|watchos), parseVersion parseSDKVersion bool DarwinAsmParser::parseBuildVersion(StringRef Directive, SMLoc Loc) { StringRef PlatformName; SMLoc PlatformLoc = getTok().getLoc(); @@ -1126,14 +1180,17 @@ bool DarwinAsmParser::parseBuildVersion(StringRef Directive, SMLoc Loc) { if (parseVersion(&Major, &Minor, &Update)) return true; + VersionTuple SDKVersion; + if (isSDKVersionToken(getLexer().getTok()) && parseSDKVersion(SDKVersion)) + return true; + if (parseToken(AsmToken::EndOfStatement)) return addErrorSuffix(" in '.build_version' directive"); Triple::OSType ExpectedOS = getOSTypeFromPlatform((MachO::PlatformType)Platform); checkVersion(Directive, PlatformName, Loc, ExpectedOS); - - getStreamer().EmitBuildVersion(Platform, Major, Minor, Update); + getStreamer().EmitBuildVersion(Platform, Major, Minor, Update, SDKVersion); return false; } diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp index 3d9590e1f9f5f5e7503bd9223eb8c6395360ec6d..d568f7a71eeb5dca8f4e9293e2c0678be636d706 100644 --- a/lib/MC/MCParser/ELFAsmParser.cpp +++ b/lib/MC/MCParser/ELFAsmParser.cpp @@ -321,6 +321,9 @@ static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) { case 'y': flags |= ELF::SHF_ARM_PURECODE; break; + case 's': + flags |= ELF::SHF_HEX_GPREL; + break; case 'G': flags |= ELF::SHF_GROUP; break; diff --git a/lib/MC/MCParser/MCAsmLexer.cpp b/lib/MC/MCParser/MCAsmLexer.cpp index 1ed056800d4a6cd7b8a24897fde417b9d19e01b7..10960fc696334870f353a07c9a4b6e990f41c305 100644 --- a/lib/MC/MCParser/MCAsmLexer.cpp +++ b/lib/MC/MCParser/MCAsmLexer.cpp @@ -85,6 +85,7 @@ void AsmToken::dump(raw_ostream &OS) const { case AsmToken::LessGreater: OS << "LessGreater"; break; case AsmToken::LessLess: OS << "LessLess"; break; case AsmToken::Minus: OS << "Minus"; break; + case AsmToken::MinusGreater: OS << "MinusGreater"; break; case AsmToken::Percent: OS << "Percent"; break; case AsmToken::Pipe: OS << "Pipe"; break; case AsmToken::PipePipe: OS << "PipePipe"; break; diff --git a/lib/MC/MCParser/WasmAsmParser.cpp b/lib/MC/MCParser/WasmAsmParser.cpp new file mode 100644 index 0000000000000000000000000000000000000000..93bb0cb3c72e476360da73d8e054f8bb8bcc0af2 --- /dev/null +++ b/lib/MC/MCParser/WasmAsmParser.cpp @@ -0,0 +1,145 @@ +//===- WasmAsmParser.cpp - Wasm Assembly Parser -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// -- +// +// Note, this is for wasm, the binary format (analogous to ELF), not wasm, +// the instruction set (analogous to x86), for which parsing code lives in +// WebAssemblyAsmParser. +// +// This file contains processing for generic directives implemented using +// MCTargetStreamer, the ones that depend on WebAssemblyTargetStreamer are in +// WebAssemblyAsmParser. +// +//===----------------------------------------------------------------------===// + +#include "llvm/BinaryFormat/Wasm.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCAsmParserExtension.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCSymbolWasm.h" +#include "llvm/Support/MachineValueType.h" + +using namespace llvm; + +namespace { + +class WasmAsmParser : public MCAsmParserExtension { + MCAsmParser *Parser; + MCAsmLexer *Lexer; + + template + void addDirectiveHandler(StringRef Directive) { + MCAsmParser::ExtensionDirectiveHandler Handler = std::make_pair( + this, HandleDirective); + + getParser().addDirectiveHandler(Directive, Handler); + } + +public: + WasmAsmParser() : Parser(nullptr), Lexer(nullptr) { + BracketExpressionsSupported = true; + } + + void Initialize(MCAsmParser &P) override { + Parser = &P; + Lexer = &Parser->getLexer(); + // Call the base implementation. + this->MCAsmParserExtension::Initialize(*Parser); + + addDirectiveHandler<&WasmAsmParser::parseSectionDirectiveText>(".text"); + addDirectiveHandler<&WasmAsmParser::parseSectionDirective>(".section"); + addDirectiveHandler<&WasmAsmParser::parseDirectiveSize>(".size"); + addDirectiveHandler<&WasmAsmParser::parseDirectiveType>(".type"); + } + + bool Error(const StringRef &msg, const AsmToken &tok) { + return Parser->Error(tok.getLoc(), msg + tok.getString()); + } + + bool IsNext(AsmToken::TokenKind Kind) { + auto ok = Lexer->is(Kind); + if (ok) Lex(); + return ok; + } + + bool Expect(AsmToken::TokenKind Kind, const char *KindName) { + if (!IsNext(Kind)) + return Error(std::string("Expected ") + KindName + ", instead got: ", + Lexer->getTok()); + return false; + } + + bool parseSectionDirectiveText(StringRef, SMLoc) { + // FIXME: .text currently no-op. + return false; + } + + bool parseSectionDirective(StringRef, SMLoc) { + // FIXME: .section currently no-op. + while (Lexer->isNot(AsmToken::EndOfStatement)) Parser->Lex(); + return false; + } + + // TODO: This function is almost the same as ELFAsmParser::ParseDirectiveSize + // so maybe could be shared somehow. + bool parseDirectiveSize(StringRef, SMLoc) { + StringRef Name; + if (Parser->parseIdentifier(Name)) + return TokError("expected identifier in directive"); + auto Sym = getContext().getOrCreateSymbol(Name); + if (Lexer->isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + const MCExpr *Expr; + if (Parser->parseExpression(Expr)) + return true; + if (Lexer->isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + Lex(); + // MCWasmStreamer implements this. + getStreamer().emitELFSize(Sym, Expr); + return false; + } + + bool parseDirectiveType(StringRef, SMLoc) { + // This could be the start of a function, check if followed by + // "label,@function" + if (!Lexer->is(AsmToken::Identifier)) + return Error("Expected label after .type directive, got: ", + Lexer->getTok()); + auto WasmSym = cast( + getStreamer().getContext().getOrCreateSymbol( + Lexer->getTok().getString())); + Lex(); + if (!(IsNext(AsmToken::Comma) && IsNext(AsmToken::At) && + Lexer->is(AsmToken::Identifier))) + return Error("Expected label,@type declaration, got: ", Lexer->getTok()); + auto TypeName = Lexer->getTok().getString(); + if (TypeName == "function") + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + else if (TypeName == "global") + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); + else + return Error("Unknown WASM symbol type: ", Lexer->getTok()); + Lex(); + return Expect(AsmToken::EndOfStatement, "EOL"); + } +}; + +} // end anonymous namespace + +namespace llvm { + +MCAsmParserExtension *createWasmAsmParser() { + return new WasmAsmParser; +} + +} // end namespace llvm diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp index 4d77d05cc5053c06fe72f9e641c60da999673e8f..7ee1694ebbf71dd8f09be6386683a6110f91fea2 100644 --- a/lib/MC/MCSectionELF.cpp +++ b/lib/MC/MCSectionELF.cpp @@ -116,6 +116,9 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, } else if (T.isARM() || T.isThumb()) { if (Flags & ELF::SHF_ARM_PURECODE) OS << 'y'; + } else if (Arch == Triple::hexagon) { + if (Flags & ELF::SHF_HEX_GPREL) + OS << 's'; } OS << '"'; diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 3722c0ad3c818292e55c6a5fd48d75055a771835..85e69f6f663e0283a47c3d2e01a2b452d34daa7a 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -577,6 +577,15 @@ void MCStreamer::EmitCFIWindowSave() { CurFrame->Instructions.push_back(Instruction); } +void MCStreamer::EmitCFINegateRAState() { + MCSymbol *Label = EmitCFILabel(); + MCCFIInstruction Instruction = MCCFIInstruction::createNegateRAState(Label); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); + if (!CurFrame) + return; + CurFrame->Instructions.push_back(Instruction); +} + void MCStreamer::EmitCFIReturnColumn(int64_t Register) { MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); if (!CurFrame) @@ -1045,7 +1054,8 @@ MCSymbol *MCStreamer::endSection(MCSection *Section) { return Sym; } -void MCStreamer::EmitVersionForTarget(const Triple &Target) { +void MCStreamer::EmitVersionForTarget(const Triple &Target, + const VersionTuple &SDKVersion) { if (!Target.isOSBinFormatMachO() || !Target.isOSDarwin()) return; // Do we even know the version? @@ -1071,5 +1081,5 @@ void MCStreamer::EmitVersionForTarget(const Triple &Target) { Target.getiOSVersion(Major, Minor, Update); } if (Major != 0) - EmitVersionMin(VersionType, Major, Minor, Update); + EmitVersionMin(VersionType, Major, Minor, Update, SDKVersion); } diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp index 0c8d58e597271a97366d3467417c192659760209..0724b109e1a1da6ab89a43e7c4d3561e6470bc7a 100644 --- a/lib/MC/MCWin64EH.cpp +++ b/lib/MC/MCWin64EH.cpp @@ -468,10 +468,10 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { info->Symbol = Label; uint32_t FuncLength = 0x0; - FuncLength = (uint32_t)GetAbsDifference(streamer, info->FuncletOrFuncEnd, - info->Begin); - if (FuncLength) - FuncLength /= 4; + if (info->FuncletOrFuncEnd) + FuncLength = (uint32_t)GetAbsDifference(streamer, info->FuncletOrFuncEnd, + info->Begin); + FuncLength /= 4; uint32_t PrologCodeBytes = ARM64CountOfUnwindCodes(info->Instructions); uint32_t TotalCodeBytes = PrologCodeBytes; @@ -487,8 +487,8 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { // Code Words, Epilog count, E, X, Vers, Function Length uint32_t row1 = 0x0; - uint8_t CodeWords = TotalCodeBytes / 4; - uint8_t CodeWordsMod = TotalCodeBytes % 4; + uint32_t CodeWords = TotalCodeBytes / 4; + uint32_t CodeWordsMod = TotalCodeBytes % 4; if (CodeWordsMod) CodeWords++; uint32_t EpilogCount = info->EpilogMap.size(); @@ -505,6 +505,11 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { // Extended Code Words, Extended Epilog Count if (ExtensionWord) { + // FIXME: We should be able to split unwind info into multiple sections. + // FIXME: We should share epilog codes across epilogs, where possible, + // which would make this issue show up less frequently. + if (CodeWords > 0xFF || EpilogCount > 0xFFFF) + report_fatal_error("SEH unwind data splitting not yet implemented"); uint32_t row2 = 0x0; row2 |= (CodeWords & 0xFF) << 16; row2 |= (EpilogCount & 0xFFFF); diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index c2e8dc1ef6584b149d962283178989403899b41a..2fa65658ccfacfec0db3928d16586ee18f3be9e8 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -846,18 +846,27 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, // Write out the deployment target information, if it's available. if (VersionInfo.Major != 0) { - assert(VersionInfo.Update < 256 && "unencodable update target version"); - assert(VersionInfo.Minor < 256 && "unencodable minor target version"); - assert(VersionInfo.Major < 65536 && "unencodable major target version"); - uint32_t EncodedVersion = VersionInfo.Update | (VersionInfo.Minor << 8) | - (VersionInfo.Major << 16); + auto EncodeVersion = [](VersionTuple V) -> uint32_t { + assert(!V.empty() && "empty version"); + unsigned Update = V.getSubminor() ? *V.getSubminor() : 0; + unsigned Minor = V.getMinor() ? *V.getMinor() : 0; + assert(Update < 256 && "unencodable update target version"); + assert(Minor < 256 && "unencodable minor target version"); + assert(V.getMajor() < 65536 && "unencodable major target version"); + return Update | (Minor << 8) | (V.getMajor() << 16); + }; + uint32_t EncodedVersion = EncodeVersion( + VersionTuple(VersionInfo.Major, VersionInfo.Minor, VersionInfo.Update)); + uint32_t SDKVersion = !VersionInfo.SDKVersion.empty() + ? EncodeVersion(VersionInfo.SDKVersion) + : 0; if (VersionInfo.EmitBuildVersion) { // FIXME: Currently empty tools. Add clang version in the future. W.write(MachO::LC_BUILD_VERSION); W.write(sizeof(MachO::build_version_command)); W.write(VersionInfo.TypeOrPlatform.Platform); W.write(EncodedVersion); - W.write(0); // SDK version. + W.write(SDKVersion); W.write(0); // Empty tools list. } else { MachO::LoadCommandType LCType @@ -865,7 +874,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, W.write(LCType); W.write(sizeof(MachO::version_min_command)); W.write(EncodedVersion); - W.write(0); // reserved. + W.write(SDKVersion); } } diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index c1e0b7aa7ab01a526f61d4e24f9fcde4f6a8e435..8a369204cd6c85aed7061960acf04f2d4d247fe7 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -56,10 +56,10 @@ struct SectionBookkeeping { uint32_t Index; }; -// The signature of a wasm function, in a struct capable of being used as a -// DenseMap key. -// TODO: Consider using WasmSignature directly instead. -struct WasmFunctionType { +// The signature of a wasm function or event, in a struct capable of being used +// as a DenseMap key. +// TODO: Consider using wasm::WasmSignature directly instead. +struct WasmSignature { // Support empty and tombstone instances, needed by DenseMap. enum { Plain, Empty, Tombstone } State; @@ -69,36 +69,35 @@ struct WasmFunctionType { // The parameter types of the function. SmallVector Params; - WasmFunctionType() : State(Plain) {} + WasmSignature() : State(Plain) {} - bool operator==(const WasmFunctionType &Other) const { + bool operator==(const WasmSignature &Other) const { return State == Other.State && Returns == Other.Returns && Params == Other.Params; } }; -// Traits for using WasmFunctionType in a DenseMap. -struct WasmFunctionTypeDenseMapInfo { - static WasmFunctionType getEmptyKey() { - WasmFunctionType FuncTy; - FuncTy.State = WasmFunctionType::Empty; - return FuncTy; +// Traits for using WasmSignature in a DenseMap. +struct WasmSignatureDenseMapInfo { + static WasmSignature getEmptyKey() { + WasmSignature Sig; + Sig.State = WasmSignature::Empty; + return Sig; } - static WasmFunctionType getTombstoneKey() { - WasmFunctionType FuncTy; - FuncTy.State = WasmFunctionType::Tombstone; - return FuncTy; + static WasmSignature getTombstoneKey() { + WasmSignature Sig; + Sig.State = WasmSignature::Tombstone; + return Sig; } - static unsigned getHashValue(const WasmFunctionType &FuncTy) { - uintptr_t Value = FuncTy.State; - for (wasm::ValType Ret : FuncTy.Returns) + static unsigned getHashValue(const WasmSignature &Sig) { + uintptr_t Value = Sig.State; + for (wasm::ValType Ret : Sig.Returns) Value += DenseMapInfo::getHashValue(uint32_t(Ret)); - for (wasm::ValType Param : FuncTy.Params) + for (wasm::ValType Param : Sig.Params) Value += DenseMapInfo::getHashValue(uint32_t(Param)); return Value; } - static bool isEqual(const WasmFunctionType &LHS, - const WasmFunctionType &RHS) { + static bool isEqual(const WasmSignature &LHS, const WasmSignature &RHS) { return LHS == RHS; } }; @@ -118,7 +117,7 @@ struct WasmDataSegment { // A wasm function to be written into the function section. struct WasmFunction { - uint32_t Type; + uint32_t SigIndex; const MCSymbolWasm *Sym; }; @@ -216,7 +215,8 @@ class WasmObjectWriter : public MCObjectWriter { // Maps function symbols to the table element index space. Used // for TABLE_INDEX relocation types (i.e. address taken functions). DenseMap TableIndices; - // Maps function/global symbols to the function/global/section index space. + // Maps function/global symbols to the function/global/event/section index + // space. DenseMap WasmIndices; // Maps data symbols to the Wasm segment and offset/size with the segment. DenseMap DataLocations; @@ -231,13 +231,13 @@ class WasmObjectWriter : public MCObjectWriter { // Map from section to defining function symbol. DenseMap SectionFunctions; - DenseMap - FunctionTypeIndices; - SmallVector FunctionTypes; + DenseMap SignatureIndices; + SmallVector Signatures; SmallVector Globals; SmallVector DataSegments; unsigned NumFunctionImports = 0; unsigned NumGlobalImports = 0; + unsigned NumEventImports = 0; uint32_t SectionCount = 0; // TargetObjectWriter wrappers. @@ -266,8 +266,8 @@ private: TableIndices.clear(); DataLocations.clear(); CustomSectionsRelocations.clear(); - FunctionTypeIndices.clear(); - FunctionTypes.clear(); + SignatureIndices.clear(); + Signatures.clear(); Globals.clear(); DataSegments.clear(); SectionFunctions.clear(); @@ -294,7 +294,7 @@ private: void writeValueType(wasm::ValType Ty) { W.OS << static_cast(Ty); } - void writeTypeSection(ArrayRef FunctionTypes); + void writeTypeSection(ArrayRef Signatures); void writeImportSection(ArrayRef Imports, uint32_t DataSize, uint32_t NumElements); void writeFunctionSection(ArrayRef Functions); @@ -304,6 +304,7 @@ private: void writeCodeSection(const MCAssembler &Asm, const MCAsmLayout &Layout, ArrayRef Functions); void writeDataSection(); + void writeEventSection(ArrayRef Events); void writeRelocSection(uint32_t SectionIndex, StringRef Name, std::vector &Relocations); void writeLinkingMetaDataSection( @@ -322,7 +323,9 @@ private: uint32_t getRelocationIndexValue(const WasmRelocationEntry &RelEntry); uint32_t getFunctionType(const MCSymbolWasm &Symbol); - uint32_t registerFunctionType(const MCSymbolWasm &Symbol); + uint32_t getEventType(const MCSymbolWasm &Symbol); + void registerFunctionType(const MCSymbolWasm &Symbol); + void registerEventType(const MCSymbolWasm &Symbol); }; } // end anonymous namespace @@ -581,7 +584,8 @@ WasmObjectWriter::getProvisionalValue(const WasmRelocationEntry &RelEntry) { return getRelocationIndexValue(RelEntry); case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB: case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB: - // Provisional value is function/global Wasm index + case wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB: + // Provisional value is function/global/event Wasm index if (!WasmIndices.count(RelEntry.Symbol)) report_fatal_error("symbol not found in wasm index space: " + RelEntry.Symbol->getName()); @@ -678,6 +682,7 @@ void WasmObjectWriter::applyRelocations( case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB: case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB: case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB: + case wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB: WritePatchableLEB(Stream, Value, Offset); break; case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: @@ -696,23 +701,22 @@ void WasmObjectWriter::applyRelocations( } } -void WasmObjectWriter::writeTypeSection( - ArrayRef FunctionTypes) { - if (FunctionTypes.empty()) +void WasmObjectWriter::writeTypeSection(ArrayRef Signatures) { + if (Signatures.empty()) return; SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_TYPE); - encodeULEB128(FunctionTypes.size(), W.OS); + encodeULEB128(Signatures.size(), W.OS); - for (const WasmFunctionType &FuncTy : FunctionTypes) { + for (const WasmSignature &Sig : Signatures) { W.OS << char(wasm::WASM_TYPE_FUNC); - encodeULEB128(FuncTy.Params.size(), W.OS); - for (wasm::ValType Ty : FuncTy.Params) + encodeULEB128(Sig.Params.size(), W.OS); + for (wasm::ValType Ty : Sig.Params) writeValueType(Ty); - encodeULEB128(FuncTy.Returns.size(), W.OS); - for (wasm::ValType Ty : FuncTy.Returns) + encodeULEB128(Sig.Returns.size(), W.OS); + for (wasm::ValType Ty : Sig.Returns) writeValueType(Ty); } @@ -753,6 +757,10 @@ void WasmObjectWriter::writeImportSection(ArrayRef Imports, encodeULEB128(0, W.OS); // flags encodeULEB128(NumElements, W.OS); // initial break; + case wasm::WASM_EXTERNAL_EVENT: + encodeULEB128(Import.Event.Attribute, W.OS); + encodeULEB128(Import.Event.SigIndex, W.OS); + break; default: llvm_unreachable("unsupported import kind"); } @@ -770,7 +778,7 @@ void WasmObjectWriter::writeFunctionSection(ArrayRef Functions) { encodeULEB128(Functions.size(), W.OS); for (const WasmFunction &Func : Functions) - encodeULEB128(Func.Type, W.OS); + encodeULEB128(Func.SigIndex, W.OS); endSection(Section); } @@ -795,6 +803,22 @@ void WasmObjectWriter::writeGlobalSection() { endSection(Section); } +void WasmObjectWriter::writeEventSection(ArrayRef Events) { + if (Events.empty()) + return; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_EVENT); + + encodeULEB128(Events.size(), W.OS); + for (const wasm::WasmEventType &Event : Events) { + encodeULEB128(Event.Attribute, W.OS); + encodeULEB128(Event.SigIndex, W.OS); + } + + endSection(Section); +} + void WasmObjectWriter::writeExportSection(ArrayRef Exports) { if (Exports.empty()) return; @@ -956,6 +980,7 @@ void WasmObjectWriter::writeLinkingMetaDataSection( switch (Sym.Kind) { case wasm::WASM_SYMBOL_TYPE_FUNCTION: case wasm::WASM_SYMBOL_TYPE_GLOBAL: + case wasm::WASM_SYMBOL_TYPE_EVENT: encodeULEB128(Sym.ElementIndex, W.OS); if ((Sym.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0) writeString(Sym.Name); @@ -1047,26 +1072,51 @@ uint32_t WasmObjectWriter::getFunctionType(const MCSymbolWasm &Symbol) { return TypeIndices[&Symbol]; } -uint32_t WasmObjectWriter::registerFunctionType(const MCSymbolWasm &Symbol) { +uint32_t WasmObjectWriter::getEventType(const MCSymbolWasm &Symbol) { + assert(Symbol.isEvent()); + assert(TypeIndices.count(&Symbol)); + return TypeIndices[&Symbol]; +} + +void WasmObjectWriter::registerFunctionType(const MCSymbolWasm &Symbol) { assert(Symbol.isFunction()); - WasmFunctionType F; + WasmSignature S; const MCSymbolWasm *ResolvedSym = ResolveSymbol(Symbol); if (auto *Sig = ResolvedSym->getSignature()) { - F.Returns = Sig->Returns; - F.Params = Sig->Params; + S.Returns = Sig->Returns; + S.Params = Sig->Params; } - auto Pair = - FunctionTypeIndices.insert(std::make_pair(F, FunctionTypes.size())); + auto Pair = SignatureIndices.insert(std::make_pair(S, Signatures.size())); if (Pair.second) - FunctionTypes.push_back(F); + Signatures.push_back(S); TypeIndices[&Symbol] = Pair.first->second; LLVM_DEBUG(dbgs() << "registerFunctionType: " << Symbol << " new:" << Pair.second << "\n"); LLVM_DEBUG(dbgs() << " -> type index: " << Pair.first->second << "\n"); - return Pair.first->second; +} + +void WasmObjectWriter::registerEventType(const MCSymbolWasm &Symbol) { + assert(Symbol.isEvent()); + + // TODO Currently we don't generate imported exceptions, but if we do, we + // should have a way of infering types of imported exceptions. + WasmSignature S; + if (auto *Sig = Symbol.getSignature()) { + S.Returns = Sig->Returns; + S.Params = Sig->Params; + } + + auto Pair = SignatureIndices.insert(std::make_pair(S, Signatures.size())); + if (Pair.second) + Signatures.push_back(S); + TypeIndices[&Symbol] = Pair.first->second; + + LLVM_DEBUG(dbgs() << "registerEventType: " << Symbol << " new:" << Pair.second + << "\n"); + LLVM_DEBUG(dbgs() << " -> type index: " << Pair.first->second << "\n"); } static bool isInSymtab(const MCSymbolWasm &Sym) { @@ -1100,6 +1150,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, SmallVector TableElems; SmallVector Imports; SmallVector Exports; + SmallVector Events; SmallVector SymbolInfos; SmallVector, 2> InitFuncs; std::map> Comdats; @@ -1128,7 +1179,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, TableImport.Table.ElemType = wasm::WASM_TYPE_ANYFUNC; Imports.push_back(TableImport); - // Populate FunctionTypeIndices, and Imports and WasmIndices for undefined + // Populate SignatureIndices, and Imports and WasmIndices for undefined // symbols. This must be done before populating WasmIndices for defined // symbols. for (const MCSymbol &S : Asm.symbols()) { @@ -1139,6 +1190,9 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, if (WS.isFunction()) registerFunctionType(WS); + if (WS.isEvent()) + registerEventType(WS); + if (WS.isTemporary()) continue; @@ -1163,6 +1217,18 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, Import.Global = WS.getGlobalType(); Imports.push_back(Import); WasmIndices[&WS] = NumGlobalImports++; + } else if (WS.isEvent()) { + if (WS.isWeak()) + report_fatal_error("undefined event symbol cannot be weak"); + + wasm::WasmImport Import; + Import.Module = WS.getModuleName(); + Import.Field = WS.getName(); + Import.Kind = wasm::WASM_EXTERNAL_EVENT; + Import.Event.Attribute = wasm::WASM_EVENT_ATTRIBUTE_EXCEPTION; + Import.Event.SigIndex = getEventType(WS); + Imports.push_back(Import); + WasmIndices[&WS] = NumEventImports++; } } } @@ -1254,7 +1320,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, // A definition. Write out the function body. Index = NumFunctionImports + Functions.size(); WasmFunction Func; - Func.Type = getFunctionType(WS); + Func.SigIndex = getFunctionType(WS); Func.Sym = &WS; WasmIndices[&WS] = Index; Functions.push_back(Func); @@ -1270,6 +1336,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, } LLVM_DEBUG(dbgs() << " -> function index: " << Index << "\n"); + } else if (WS.isData()) { if (WS.isTemporary() && !WS.getSize()) continue; @@ -1299,6 +1366,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, static_cast(Size)}; DataLocations[&WS] = Ref; LLVM_DEBUG(dbgs() << " -> segment index: " << Ref.Segment << "\n"); + } else if (WS.isGlobal()) { // A "true" Wasm global (currently just __stack_pointer) if (WS.isDefined()) @@ -1307,6 +1375,24 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, // An import; the index was assigned above LLVM_DEBUG(dbgs() << " -> global index: " << WasmIndices.find(&WS)->second << "\n"); + + } else if (WS.isEvent()) { + // C++ exception symbol (__cpp_exception) + unsigned Index; + if (WS.isDefined()) { + Index = NumEventImports + Events.size(); + wasm::WasmEventType Event; + Event.SigIndex = getEventType(WS); + Event.Attribute = wasm::WASM_EVENT_ATTRIBUTE_EXCEPTION; + WasmIndices[&WS] = Index; + Events.push_back(Event); + } else { + // An import; the index was assigned above. + Index = WasmIndices.find(&WS)->second; + } + LLVM_DEBUG(dbgs() << " -> event index: " << WasmIndices.find(&WS)->second + << "\n"); + } else { assert(WS.isSection()); } @@ -1340,7 +1426,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, DataLocations[&WS] = Ref; LLVM_DEBUG(dbgs() << " -> index:" << Ref.Segment << "\n"); } else { - report_fatal_error("don't yet support global aliases"); + report_fatal_error("don't yet support global/event aliases"); } } @@ -1473,12 +1559,13 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, // Write out the Wasm header. writeHeader(Asm); - writeTypeSection(FunctionTypes); + writeTypeSection(Signatures); writeImportSection(Imports, DataSize, TableElems.size()); writeFunctionSection(Functions); // Skip the "table" section; we import the table instead. // Skip the "memory" section; we import the memory instead. writeGlobalSection(); + writeEventSection(Events); writeExportSection(Exports); writeElemSection(TableElems); writeCodeSection(Asm, Layout, Functions); diff --git a/tools/llvm-mca/lib/CMakeLists.txt b/lib/MCA/CMakeLists.txt similarity index 59% rename from tools/llvm-mca/lib/CMakeLists.txt rename to lib/MCA/CMakeLists.txt index 359a7d49d49d43cb6d55d7b0ef9cb1ff6c3f5568..bfd0782d1f7d5012b8f6798c8109eab278e38ba3 100644 --- a/tools/llvm-mca/lib/CMakeLists.txt +++ b/lib/MCA/CMakeLists.txt @@ -1,7 +1,4 @@ -include_directories(${LLVM_MCA_SOURCE_DIR}/include) - -add_library(LLVMMCA - STATIC +add_llvm_library(LLVMMCA Context.cpp HWEventListener.cpp HardwareUnits/HardwareUnit.cpp @@ -14,19 +11,13 @@ add_library(LLVMMCA Instruction.cpp Pipeline.cpp Stages/DispatchStage.cpp + Stages/EntryStage.cpp Stages/ExecuteStage.cpp - Stages/FetchStage.cpp Stages/InstructionTables.cpp Stages/RetireStage.cpp Stages/Stage.cpp Support.cpp - ) -llvm_update_compile_flags(LLVMMCA) -llvm_map_components_to_libnames(libs - MC - Support + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/MCA ) - -target_link_libraries(LLVMMCA ${libs}) -set_target_properties(LLVMMCA PROPERTIES FOLDER "Libraries") diff --git a/tools/llvm-mca/lib/Context.cpp b/lib/MCA/Context.cpp similarity index 76% rename from tools/llvm-mca/lib/Context.cpp rename to lib/MCA/Context.cpp index 5b6f52478ddbb60b84ef0b79f3b2b718092e5339..c1b197dfe2e6744fc53e2c9b05283e7da9406083 100644 --- a/tools/llvm-mca/lib/Context.cpp +++ b/lib/MCA/Context.cpp @@ -15,14 +15,14 @@ /// //===----------------------------------------------------------------------===// -#include "Context.h" -#include "HardwareUnits/RegisterFile.h" -#include "HardwareUnits/RetireControlUnit.h" -#include "HardwareUnits/Scheduler.h" -#include "Stages/DispatchStage.h" -#include "Stages/ExecuteStage.h" -#include "Stages/FetchStage.h" -#include "Stages/RetireStage.h" +#include "llvm/MCA/Context.h" +#include "llvm/MCA/HardwareUnits/RegisterFile.h" +#include "llvm/MCA/HardwareUnits/RetireControlUnit.h" +#include "llvm/MCA/HardwareUnits/Scheduler.h" +#include "llvm/MCA/Stages/DispatchStage.h" +#include "llvm/MCA/Stages/EntryStage.h" +#include "llvm/MCA/Stages/ExecuteStage.h" +#include "llvm/MCA/Stages/RetireStage.h" namespace llvm { namespace mca { @@ -35,12 +35,12 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB, // Create the hardware units defining the backend. auto RCU = llvm::make_unique(SM); auto PRF = llvm::make_unique(SM, MRI, Opts.RegisterFileSize); - auto LSU = llvm::make_unique(Opts.LoadQueueSize, Opts.StoreQueueSize, - Opts.AssumeNoAlias); - auto HWS = llvm::make_unique(SM, LSU.get()); + auto LSU = llvm::make_unique(SM, Opts.LoadQueueSize, + Opts.StoreQueueSize, Opts.AssumeNoAlias); + auto HWS = llvm::make_unique(SM, *LSU); // Create the pipeline stages. - auto Fetch = llvm::make_unique(SrcMgr); + auto Fetch = llvm::make_unique(SrcMgr); auto Dispatch = llvm::make_unique(STI, MRI, Opts.DispatchWidth, *RCU, *PRF); auto Execute = llvm::make_unique(*HWS); diff --git a/tools/llvm-mca/lib/HWEventListener.cpp b/lib/MCA/HWEventListener.cpp similarity index 94% rename from tools/llvm-mca/lib/HWEventListener.cpp rename to lib/MCA/HWEventListener.cpp index 3930e2555a9ba33df5c9b42496fdc1b16e711c58..4a0e5b1754dd593761557726cf1fcfaab97af9f6 100644 --- a/tools/llvm-mca/lib/HWEventListener.cpp +++ b/lib/MCA/HWEventListener.cpp @@ -12,7 +12,7 @@ /// //===----------------------------------------------------------------------===// -#include "HWEventListener.h" +#include "llvm/MCA/HWEventListener.h" namespace llvm { namespace mca { diff --git a/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp b/lib/MCA/HardwareUnits/HardwareUnit.cpp similarity index 93% rename from tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp rename to lib/MCA/HardwareUnits/HardwareUnit.cpp index 4e46ffacbd4044799c8a86a8916dd8455c882e3a..edd32b9c0c1ab9bbbb3d9e49e18145ee7058274b 100644 --- a/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp +++ b/lib/MCA/HardwareUnits/HardwareUnit.cpp @@ -13,7 +13,7 @@ /// //===----------------------------------------------------------------------===// -#include "HardwareUnits/HardwareUnit.h" +#include "llvm/MCA/HardwareUnits/HardwareUnit.h" namespace llvm { namespace mca { diff --git a/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp b/lib/MCA/HardwareUnits/LSUnit.cpp similarity index 71% rename from tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp rename to lib/MCA/HardwareUnits/LSUnit.cpp index 6923c6e0dc8ff532af346f2c9fb70f9013b038a4..8895eb392b60ae92dd17aa377f4b9f03efb077f7 100644 --- a/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp +++ b/lib/MCA/HardwareUnits/LSUnit.cpp @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#include "HardwareUnits/LSUnit.h" -#include "Instruction.h" +#include "llvm/MCA/HardwareUnits/LSUnit.h" +#include "llvm/MCA/Instruction.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -22,6 +22,23 @@ namespace llvm { namespace mca { +LSUnit::LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, + bool AssumeNoAlias) + : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) { + if (SM.hasExtraProcessorInfo()) { + const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo(); + if (!LQ_Size && EPI.LoadQueueID) { + const MCProcResourceDesc &LdQDesc = *SM.getProcResource(EPI.LoadQueueID); + LQ_Size = LdQDesc.BufferSize; + } + + if (!SQ_Size && EPI.StoreQueueID) { + const MCProcResourceDesc &StQDesc = *SM.getProcResource(EPI.StoreQueueID); + SQ_Size = StQDesc.BufferSize; + } + } +} + #ifndef NDEBUG void LSUnit::dump() const { dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n'; @@ -136,31 +153,38 @@ bool LSUnit::isReady(const InstRef &IR) const { } void LSUnit::onInstructionExecuted(const InstRef &IR) { + const InstrDesc &Desc = IR.getInstruction()->getDesc(); const unsigned Index = IR.getSourceIndex(); - std::set::iterator it = LoadQueue.find(Index); - if (it != LoadQueue.end()) { - LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index - << " has been removed from the load queue.\n"); - LoadQueue.erase(it); - } + bool IsALoad = Desc.MayLoad; + bool IsAStore = Desc.MayStore; - it = StoreQueue.find(Index); - if (it != StoreQueue.end()) { - LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index - << " has been removed from the store queue.\n"); - StoreQueue.erase(it); + if (IsALoad) { + if (LoadQueue.erase(Index)) { + LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index + << " has been removed from the load queue.\n"); + } + if (!LoadBarriers.empty() && Index == *LoadBarriers.begin()) { + LLVM_DEBUG( + dbgs() << "[LSUnit]: Instruction idx=" << Index + << " has been removed from the set of load barriers.\n"); + LoadBarriers.erase(Index); + } } - if (!StoreBarriers.empty() && Index == *StoreBarriers.begin()) { - LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index - << " has been removed from the set of store barriers.\n"); - StoreBarriers.erase(StoreBarriers.begin()); - } - if (!LoadBarriers.empty() && Index == *LoadBarriers.begin()) { - LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index - << " has been removed from the set of load barriers.\n"); - LoadBarriers.erase(LoadBarriers.begin()); + if (IsAStore) { + if (StoreQueue.erase(Index)) { + LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index + << " has been removed from the store queue.\n"); + } + + if (!StoreBarriers.empty() && Index == *StoreBarriers.begin()) { + LLVM_DEBUG( + dbgs() << "[LSUnit]: Instruction idx=" << Index + << " has been removed from the set of store barriers.\n"); + StoreBarriers.erase(Index); + } } } + } // namespace mca } // namespace llvm diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/lib/MCA/HardwareUnits/RegisterFile.cpp similarity index 98% rename from tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp rename to lib/MCA/HardwareUnits/RegisterFile.cpp index 6bc63a0db5000787beaf91458aa84ce75d5bccfe..22977e5ded65b054ce6be3c4207ac05d1e98a6b2 100644 --- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp +++ b/lib/MCA/HardwareUnits/RegisterFile.cpp @@ -14,8 +14,8 @@ /// //===----------------------------------------------------------------------===// -#include "HardwareUnits/RegisterFile.h" -#include "Instruction.h" +#include "llvm/MCA/HardwareUnits/RegisterFile.h" +#include "llvm/MCA/Instruction.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "llvm-mca" @@ -185,11 +185,11 @@ void RegisterFile::addRegisterWrite(WriteRef Write, // register is allocated. ShouldAllocatePhysRegs = false; - if (OtherWrite.getWriteState() && - (OtherWrite.getSourceIndex() != Write.getSourceIndex())) { + WriteState *OtherWS = OtherWrite.getWriteState(); + if (OtherWS && (OtherWrite.getSourceIndex() != Write.getSourceIndex())) { // This partial write has a false dependency on RenameAs. assert(!IsEliminated && "Unexpected partial update!"); - WS.setDependentWrite(OtherWrite.getWriteState()); + OtherWS->addUser(&WS); } } } diff --git a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp b/lib/MCA/HardwareUnits/ResourceManager.cpp similarity index 86% rename from tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp rename to lib/MCA/HardwareUnits/ResourceManager.cpp index e371f50ed4892aa9a88bf84af0a15b2e4e75ad6b..e76543c53e14560dc71a9f6ce8cb5db594d7bdf0 100644 --- a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp +++ b/lib/MCA/HardwareUnits/ResourceManager.cpp @@ -13,8 +13,8 @@ /// //===----------------------------------------------------------------------===// -#include "HardwareUnits/ResourceManager.h" -#include "Support.h" +#include "llvm/MCA/HardwareUnits/ResourceManager.h" +#include "llvm/MCA/Support.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -24,21 +24,28 @@ namespace mca { #define DEBUG_TYPE "llvm-mca" ResourceStrategy::~ResourceStrategy() = default; -void DefaultResourceStrategy::skipMask(uint64_t Mask) { - NextInSequenceMask &= (~Mask); - if (!NextInSequenceMask) { - NextInSequenceMask = ResourceUnitMask ^ RemovedFromNextInSequence; - RemovedFromNextInSequence = 0; - } -} - uint64_t DefaultResourceStrategy::select(uint64_t ReadyMask) { // This method assumes that ReadyMask cannot be zero. - uint64_t CandidateMask = PowerOf2Floor(NextInSequenceMask); - while (!(ReadyMask & CandidateMask)) { - skipMask(CandidateMask); - CandidateMask = PowerOf2Floor(NextInSequenceMask); + uint64_t CandidateMask = ReadyMask & NextInSequenceMask; + if (CandidateMask) { + CandidateMask = PowerOf2Floor(CandidateMask); + NextInSequenceMask &= (CandidateMask | (CandidateMask - 1)); + return CandidateMask; + } + + NextInSequenceMask = ResourceUnitMask ^ RemovedFromNextInSequence; + RemovedFromNextInSequence = 0; + CandidateMask = ReadyMask & NextInSequenceMask; + + if (CandidateMask) { + CandidateMask = PowerOf2Floor(CandidateMask); + NextInSequenceMask &= (CandidateMask | (CandidateMask - 1)); + return CandidateMask; } + + NextInSequenceMask = ResourceUnitMask; + CandidateMask = PowerOf2Floor(ReadyMask & NextInSequenceMask); + NextInSequenceMask &= (CandidateMask | (CandidateMask - 1)); return CandidateMask; } @@ -47,14 +54,20 @@ void DefaultResourceStrategy::used(uint64_t Mask) { RemovedFromNextInSequence |= Mask; return; } - skipMask(Mask); + + NextInSequenceMask &= (~Mask); + if (NextInSequenceMask) + return; + + NextInSequenceMask = ResourceUnitMask ^ RemovedFromNextInSequence; + RemovedFromNextInSequence = 0; } ResourceState::ResourceState(const MCProcResourceDesc &Desc, unsigned Index, uint64_t Mask) : ProcResourceDescIndex(Index), ResourceMask(Mask), - BufferSize(Desc.BufferSize) { - if (countPopulation(ResourceMask) > 1) + BufferSize(Desc.BufferSize), IsAGroup(countPopulation(ResourceMask)>1) { + if (IsAGroup) ResourceSizeMask = ResourceMask ^ PowerOf2Floor(ResourceMask); else ResourceSizeMask = (1ULL << Desc.NumUnits) - 1; @@ -78,8 +91,10 @@ ResourceStateEvent ResourceState::isBufferAvailable() const { #ifndef NDEBUG void ResourceState::dump() const { - dbgs() << "MASK: " << ResourceMask << ", SIZE_MASK: " << ResourceSizeMask - << ", RDYMASK: " << ReadyMask << ", BufferSize=" << BufferSize + dbgs() << "MASK=" << format_hex(ResourceMask, 8) + << ", SZMASK=" << format_hex(ResourceSizeMask, 8) + << ", RDYMASK=" << format_hex(ReadyMask, 8) + << ", BufferSize=" << BufferSize << ", AvailableSlots=" << AvailableSlots << ", Reserved=" << Unavailable << '\n'; } @@ -147,8 +162,14 @@ ResourceRef ResourceManager::selectPipe(uint64_t ResourceID) { void ResourceManager::use(const ResourceRef &RR) { // Mark the sub-resource referenced by RR as used. - ResourceState &RS = *Resources[getResourceStateIndex(RR.first)]; + unsigned RSID = getResourceStateIndex(RR.first); + ResourceState &RS = *Resources[RSID]; RS.markSubResourceAsUsed(RR.second); + // Remember to update the resource strategy for non-group resources with + // multiple units. + if (RS.getNumUnits() > 1) + Strategies[RSID]->used(RR.second); + // If there are still available units in RR.first, // then we are done. if (RS.isReady()) diff --git a/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp b/lib/MCA/HardwareUnits/RetireControlUnit.cpp similarity index 94% rename from tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp rename to lib/MCA/HardwareUnits/RetireControlUnit.cpp index 0456e1d7a5bf9fb8fded20f42323e906ab9d93c6..de9f24552c38be4190bd364fc3f7b17868ae53c8 100644 --- a/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp +++ b/lib/MCA/HardwareUnits/RetireControlUnit.cpp @@ -12,7 +12,7 @@ /// //===----------------------------------------------------------------------===// -#include "HardwareUnits/RetireControlUnit.h" +#include "llvm/MCA/HardwareUnits/RetireControlUnit.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "llvm-mca" @@ -60,9 +60,10 @@ const RetireControlUnit::RUToken &RetireControlUnit::peekCurrentToken() const { } void RetireControlUnit::consumeCurrentToken() { - const RetireControlUnit::RUToken &Current = peekCurrentToken(); + RetireControlUnit::RUToken &Current = Queue[CurrentInstructionSlotIdx]; assert(Current.NumSlots && "Reserved zero slots?"); assert(Current.IR && "Invalid RUToken in the RCU queue."); + Current.IR.getInstruction()->retire(); // Update the slot index to be the next item in the circular queue. CurrentInstructionSlotIdx += Current.NumSlots; diff --git a/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp b/lib/MCA/HardwareUnits/Scheduler.cpp similarity index 97% rename from tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp rename to lib/MCA/HardwareUnits/Scheduler.cpp index b1ac8d99b865b83b3af292c5382b60a7e96baf8d..3924ac599105aa3e803f7361568613c9c0d3f81a 100644 --- a/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp +++ b/lib/MCA/HardwareUnits/Scheduler.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "HardwareUnits/Scheduler.h" +#include "llvm/MCA/HardwareUnits/Scheduler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -51,7 +51,7 @@ Scheduler::Status Scheduler::isAvailable(const InstRef &IR) const { } // Give lower priority to LSUnit stall events. - switch (LSU->isAvailable(IR)) { + switch (LSU.isAvailable(IR)) { case LSUnit::LSU_LQUEUE_FULL: return Scheduler::SC_LOAD_QUEUE_FULL; case LSUnit::LSU_SQUEUE_FULL: @@ -80,7 +80,7 @@ void Scheduler::issueInstructionImpl( if (IS->isExecuting()) IssuedSet.emplace_back(IR); else if (IS->isExecuted()) - LSU->onInstructionExecuted(IR); + LSU.onInstructionExecuted(IR); } // Release the buffered resources and issue the instruction. @@ -170,7 +170,7 @@ void Scheduler::updateIssuedSet(SmallVectorImpl &Executed) { } // Instruction IR has completed execution. - LSU->onInstructionExecuted(IR); + LSU.onInstructionExecuted(IR); Executed.emplace_back(IR); ++RemovedElements; IR.invalidate(); @@ -213,7 +213,7 @@ void Scheduler::dispatch(const InstRef &IR) { // If necessary, reserve queue entries in the load-store unit (LSU). bool IsMemOp = Desc.MayLoad || Desc.MayStore; if (IsMemOp) - LSU->dispatch(IR); + LSU.dispatch(IR); if (!isReady(IR)) { LLVM_DEBUG(dbgs() << "[SCHEDULER] Adding #" << IR << " to the WaitSet\n"); @@ -238,7 +238,7 @@ void Scheduler::dispatch(const InstRef &IR) { bool Scheduler::isReady(const InstRef &IR) const { const InstrDesc &Desc = IR.getInstruction()->getDesc(); bool IsMemOp = Desc.MayLoad || Desc.MayStore; - return IR.getInstruction()->isReady() && (!IsMemOp || LSU->isReady(IR)); + return IR.getInstruction()->isReady() && (!IsMemOp || LSU.isReady(IR)); } } // namespace mca diff --git a/tools/llvm-mca/lib/InstrBuilder.cpp b/lib/MCA/InstrBuilder.cpp similarity index 65% rename from tools/llvm-mca/lib/InstrBuilder.cpp rename to lib/MCA/InstrBuilder.cpp index 535ad4d57feea8d15f91533fb9e0128c6c78e6fe..8d501dc6b15abb422234e5d4e4f7c718d0c14e7f 100644 --- a/tools/llvm-mca/lib/InstrBuilder.cpp +++ b/lib/MCA/InstrBuilder.cpp @@ -12,7 +12,7 @@ /// //===----------------------------------------------------------------------===// -#include "InstrBuilder.h" +#include "llvm/MCA/InstrBuilder.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/MC/MCInst.h" @@ -28,8 +28,9 @@ namespace mca { InstrBuilder::InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii, const llvm::MCRegisterInfo &mri, - const llvm::MCInstrAnalysis &mcia) - : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia) { + const llvm::MCInstrAnalysis *mcia) + : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia), FirstCallInst(true), + FirstReturnInst(true) { computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks); } @@ -55,12 +56,15 @@ static void initializeUsedResources(InstrDesc &ID, // part of a "Super" resource. The key value is the "Super" resource mask ID. DenseMap SuperResources; + unsigned NumProcResources = SM.getNumProcResourceKinds(); + APInt Buffers(NumProcResources, 0); + for (unsigned I = 0, E = SCDesc.NumWriteProcResEntries; I < E; ++I) { const MCWriteProcResEntry *PRE = STI.getWriteProcResBegin(&SCDesc) + I; const MCProcResourceDesc &PR = *SM.getProcResource(PRE->ProcResourceIdx); uint64_t Mask = ProcResourceMasks[PRE->ProcResourceIdx]; if (PR.BufferSize != -1) - ID.Buffers.push_back(Mask); + Buffers.setBit(PRE->ProcResourceIdx); CycleSegment RCy(0, PRE->Cycles, false); Worklist.emplace_back(ResourcePlusCycles(Mask, ResourceUsage(RCy))); if (PR.SuperIdx) { @@ -138,6 +142,30 @@ static void initializeUsedResources(InstrDesc &ID, } } + // Identify extra buffers that are consumed through super resources. + for (const std::pair &SR : SuperResources) { + for (unsigned I = 1, E = NumProcResources; I < E; ++I) { + const MCProcResourceDesc &PR = *SM.getProcResource(I); + if (PR.BufferSize == -1) + continue; + + uint64_t Mask = ProcResourceMasks[I]; + if (Mask != SR.first && ((Mask & SR.first) == SR.first)) + Buffers.setBit(I); + } + } + + // Now set the buffers. + if (unsigned NumBuffers = Buffers.countPopulation()) { + ID.Buffers.resize(NumBuffers); + for (unsigned I = 0, E = NumProcResources; I < E && NumBuffers; ++I) { + if (Buffers[I]) { + --NumBuffers; + ID.Buffers[NumBuffers] = ProcResourceMasks[I]; + } + } + } + LLVM_DEBUG({ for (const std::pair &R : ID.Resources) dbgs() << "\t\tMask=" << R.first << ", cy=" << R.second.size() << '\n'; @@ -161,33 +189,92 @@ static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc, ID.MaxLatency = Latency < 0 ? 100U : static_cast(Latency); } -Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI, - unsigned SchedClassID) { +static Error verifyOperands(const MCInstrDesc &MCDesc, const MCInst &MCI) { + // Count register definitions, and skip non register operands in the process. + unsigned I, E; + unsigned NumExplicitDefs = MCDesc.getNumDefs(); + for (I = 0, E = MCI.getNumOperands(); NumExplicitDefs && I < E; ++I) { + const MCOperand &Op = MCI.getOperand(I); + if (Op.isReg()) + --NumExplicitDefs; + } + + if (NumExplicitDefs) { + return make_error>( + "Expected more register operand definitions.", MCI); + } + + if (MCDesc.hasOptionalDef()) { + // Always assume that the optional definition is the last operand. + const MCOperand &Op = MCI.getOperand(MCDesc.getNumOperands() - 1); + if (I == MCI.getNumOperands() || !Op.isReg()) { + std::string Message = + "expected a register operand for an optional definition. Instruction " + "has not been correctly analyzed."; + return make_error>(Message, MCI); + } + } + + return ErrorSuccess(); +} + +void InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI, + unsigned SchedClassID) { const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode()); const MCSchedModel &SM = STI.getSchedModel(); const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID); - // These are for now the (strong) assumptions made by this algorithm: - // * The number of explicit and implicit register definitions in a MCInst - // matches the number of explicit and implicit definitions according to - // the opcode descriptor (MCInstrDesc). - // * Register definitions take precedence over register uses in the operands - // list. - // * If an opcode specifies an optional definition, then the optional - // definition is always the last operand in the sequence, and it can be - // set to zero (i.e. "no register"). + // Assumptions made by this algorithm: + // 1. The number of explicit and implicit register definitions in a MCInst + // matches the number of explicit and implicit definitions according to + // the opcode descriptor (MCInstrDesc). + // 2. Uses start at index #(MCDesc.getNumDefs()). + // 3. There can only be a single optional register definition, an it is + // always the last operand of the sequence (excluding extra operands + // contributed by variadic opcodes). // // These assumptions work quite well for most out-of-order in-tree targets // like x86. This is mainly because the vast majority of instructions is // expanded to MCInst using a straightforward lowering logic that preserves // the ordering of the operands. + // + // About assumption 1. + // The algorithm allows non-register operands between register operand + // definitions. This helps to handle some special ARM instructions with + // implicit operand increment (-mtriple=armv7): + // + // vld1.32 {d18, d19}, [r1]! @ + // @ (!!) + // @ + // @ + // @ + // @ > + // + // MCDesc reports: + // 6 explicit operands. + // 1 optional definition + // 2 explicit definitions (!!) + // + // The presence of an 'Imm' operand between the two register definitions + // breaks the assumption that "register definitions are always at the + // beginning of the operand sequence". + // + // To workaround this issue, this algorithm ignores (i.e. skips) any + // non-register operands between register definitions. The optional + // definition is still at index #(NumOperands-1). + // + // According to assumption 2. register reads start at #(NumExplicitDefs-1). + // That means, register R1 from the example is both read and written. unsigned NumExplicitDefs = MCDesc.getNumDefs(); unsigned NumImplicitDefs = MCDesc.getNumImplicitDefs(); unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries; unsigned TotalDefs = NumExplicitDefs + NumImplicitDefs; if (MCDesc.hasOptionalDef()) TotalDefs++; - ID.Writes.resize(TotalDefs); + + unsigned NumVariadicOps = MCI.getNumOperands() - MCDesc.getNumOperands(); + ID.Writes.resize(TotalDefs + NumVariadicOps); // Iterate over the operands list, and skip non-register operands. // The first NumExplictDefs register operands are expected to be register // definitions. @@ -214,19 +301,15 @@ Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI, } Write.IsOptionalDef = false; LLVM_DEBUG({ - dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex + dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex << ", Latency=" << Write.Latency << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n'; }); CurrentDef++; } - if (CurrentDef != NumExplicitDefs) { - return make_error>( - "Expected more register operand definitions.", MCI); - } - - CurrentDef = 0; + assert(CurrentDef == NumExplicitDefs && + "Expected more register operand definitions."); for (CurrentDef = 0; CurrentDef < NumImplicitDefs; ++CurrentDef) { unsigned Index = NumExplicitDefs + CurrentDef; WriteDescriptor &Write = ID.Writes[Index]; @@ -248,7 +331,7 @@ Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI, Write.IsOptionalDef = false; assert(Write.RegisterID != 0 && "Expected a valid phys register!"); LLVM_DEBUG({ - dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex + dbgs() << "\t\t[Def][I] OpIdx=" << ~Write.OpIndex << ", PhysReg=" << MRI.getName(Write.RegisterID) << ", Latency=" << Write.Latency << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n'; @@ -256,75 +339,120 @@ Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI, } if (MCDesc.hasOptionalDef()) { - // Always assume that the optional definition is the last operand of the - // MCInst sequence. - const MCOperand &Op = MCI.getOperand(MCI.getNumOperands() - 1); - if (i == MCI.getNumOperands() || !Op.isReg()) { - std::string Message = - "expected a register operand for an optional definition. Instruction " - "has not been correctly analyzed."; - return make_error>(Message, MCI); - } - - WriteDescriptor &Write = ID.Writes[TotalDefs - 1]; - Write.OpIndex = MCI.getNumOperands() - 1; + WriteDescriptor &Write = ID.Writes[NumExplicitDefs + NumImplicitDefs]; + Write.OpIndex = MCDesc.getNumOperands() - 1; // Assign a default latency for this write. Write.Latency = ID.MaxLatency; Write.SClassOrWriteResourceID = 0; Write.IsOptionalDef = true; + LLVM_DEBUG({ + dbgs() << "\t\t[Def][O] OpIdx=" << Write.OpIndex + << ", Latency=" << Write.Latency + << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n'; + }); } - return ErrorSuccess(); -} + if (!NumVariadicOps) + return; -Error InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI, - unsigned SchedClassID) { - const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode()); - unsigned NumExplicitDefs = MCDesc.getNumDefs(); + // FIXME: if an instruction opcode is flagged 'mayStore', and it has no + // "unmodeledSideEffects', then this logic optimistically assumes that any + // extra register operands in the variadic sequence is not a register + // definition. + // + // Otherwise, we conservatively assume that any register operand from the + // variadic sequence is both a register read and a register write. + bool AssumeUsesOnly = MCDesc.mayStore() && !MCDesc.mayLoad() && + !MCDesc.hasUnmodeledSideEffects(); + CurrentDef = NumExplicitDefs + NumImplicitDefs + MCDesc.hasOptionalDef(); + for (unsigned I = 0, OpIndex = MCDesc.getNumOperands(); + I < NumVariadicOps && !AssumeUsesOnly; ++I, ++OpIndex) { + const MCOperand &Op = MCI.getOperand(OpIndex); + if (!Op.isReg()) + continue; - // Skip explicit definitions. - unsigned i = 0; - for (; i < MCI.getNumOperands() && NumExplicitDefs; ++i) { - const MCOperand &Op = MCI.getOperand(i); - if (Op.isReg()) - NumExplicitDefs--; + WriteDescriptor &Write = ID.Writes[CurrentDef]; + Write.OpIndex = OpIndex; + // Assign a default latency for this write. + Write.Latency = ID.MaxLatency; + Write.SClassOrWriteResourceID = 0; + Write.IsOptionalDef = false; + ++CurrentDef; + LLVM_DEBUG({ + dbgs() << "\t\t[Def][V] OpIdx=" << Write.OpIndex + << ", Latency=" << Write.Latency + << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n'; + }); } - if (NumExplicitDefs) { - return make_error>( - "Expected more register operand definitions.", MCI); - } + ID.Writes.resize(CurrentDef); +} - unsigned NumExplicitUses = MCI.getNumOperands() - i; +void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI, + unsigned SchedClassID) { + const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode()); + unsigned NumExplicitUses = MCDesc.getNumOperands() - MCDesc.getNumDefs(); unsigned NumImplicitUses = MCDesc.getNumImplicitUses(); - if (MCDesc.hasOptionalDef()) { - assert(NumExplicitUses); - NumExplicitUses--; - } - unsigned TotalUses = NumExplicitUses + NumImplicitUses; - if (!TotalUses) - return ErrorSuccess(); - + // Remove the optional definition. + if (MCDesc.hasOptionalDef()) + --NumExplicitUses; + unsigned NumVariadicOps = MCI.getNumOperands() - MCDesc.getNumOperands(); + unsigned TotalUses = NumExplicitUses + NumImplicitUses + NumVariadicOps; ID.Reads.resize(TotalUses); - for (unsigned CurrentUse = 0; CurrentUse < NumExplicitUses; ++CurrentUse) { + unsigned CurrentUse = 0; + for (unsigned I = 0, OpIndex = MCDesc.getNumDefs(); I < NumExplicitUses; + ++I, ++OpIndex) { + const MCOperand &Op = MCI.getOperand(OpIndex); + if (!Op.isReg()) + continue; + ReadDescriptor &Read = ID.Reads[CurrentUse]; - Read.OpIndex = i + CurrentUse; - Read.UseIndex = CurrentUse; + Read.OpIndex = OpIndex; + Read.UseIndex = I; Read.SchedClassID = SchedClassID; - LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex + ++CurrentUse; + LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex << ", UseIndex=" << Read.UseIndex << '\n'); } - for (unsigned CurrentUse = 0; CurrentUse < NumImplicitUses; ++CurrentUse) { - ReadDescriptor &Read = ID.Reads[NumExplicitUses + CurrentUse]; - Read.OpIndex = ~CurrentUse; - Read.UseIndex = NumExplicitUses + CurrentUse; - Read.RegisterID = MCDesc.getImplicitUses()[CurrentUse]; + // For the purpose of ReadAdvance, implicit uses come directly after explicit + // uses. The "UseIndex" must be updated according to that implicit layout. + for (unsigned I = 0; I < NumImplicitUses; ++I) { + ReadDescriptor &Read = ID.Reads[CurrentUse + I]; + Read.OpIndex = ~I; + Read.UseIndex = NumExplicitUses + I; + Read.RegisterID = MCDesc.getImplicitUses()[I]; Read.SchedClassID = SchedClassID; - LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex << ", RegisterID=" + LLVM_DEBUG(dbgs() << "\t\t[Use][I] OpIdx=" << ~Read.OpIndex + << ", UseIndex=" << Read.UseIndex << ", RegisterID=" << MRI.getName(Read.RegisterID) << '\n'); } - return ErrorSuccess(); + + CurrentUse += NumImplicitUses; + + // FIXME: If an instruction opcode is marked as 'mayLoad', and it has no + // "unmodeledSideEffects", then this logic optimistically assumes that any + // extra register operands in the variadic sequence are not register + // definition. + + bool AssumeDefsOnly = !MCDesc.mayStore() && MCDesc.mayLoad() && + !MCDesc.hasUnmodeledSideEffects(); + for (unsigned I = 0, OpIndex = MCDesc.getNumOperands(); + I < NumVariadicOps && !AssumeDefsOnly; ++I, ++OpIndex) { + const MCOperand &Op = MCI.getOperand(OpIndex); + if (!Op.isReg()) + continue; + + ReadDescriptor &Read = ID.Reads[CurrentUse]; + Read.OpIndex = OpIndex; + Read.UseIndex = NumExplicitUses + NumImplicitUses + I; + Read.SchedClassID = SchedClassID; + ++CurrentUse; + LLVM_DEBUG(dbgs() << "\t\t[Use][V] OpIdx=" << Read.OpIndex + << ", UseIndex=" << Read.UseIndex << '\n'); + } + + ID.Reads.resize(CurrentUse); } Error InstrBuilder::verifyInstrDesc(const InstrDesc &ID, @@ -364,10 +492,11 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) { // Then obtain the scheduling class information from the instruction. unsigned SchedClassID = MCDesc.getSchedClass(); - unsigned CPUID = SM.getProcessorID(); + bool IsVariant = SM.getSchedClassDesc(SchedClassID)->isVariant(); // Try to solve variant scheduling classes. - if (SchedClassID) { + if (IsVariant) { + unsigned CPUID = SM.getProcessorID(); while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant()) SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &MCI, CPUID); @@ -389,30 +518,36 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) { std::unique_ptr ID = llvm::make_unique(); ID->NumMicroOps = SCDesc.NumMicroOps; - if (MCDesc.isCall()) { + if (MCDesc.isCall() && FirstCallInst) { // We don't correctly model calls. WithColor::warning() << "found a call in the input assembly sequence.\n"; WithColor::note() << "call instructions are not correctly modeled. " << "Assume a latency of 100cy.\n"; + FirstCallInst = false; } - if (MCDesc.isReturn()) { + if (MCDesc.isReturn() && FirstReturnInst) { WithColor::warning() << "found a return instruction in the input" << " assembly sequence.\n"; WithColor::note() << "program counter updates are ignored.\n"; + FirstReturnInst = false; } ID->MayLoad = MCDesc.mayLoad(); ID->MayStore = MCDesc.mayStore(); ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects(); + ID->BeginGroup = SCDesc.BeginGroup; + ID->EndGroup = SCDesc.EndGroup; initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks); computeMaxLatency(*ID, MCDesc, SCDesc, STI); - if (auto Err = populateWrites(*ID, MCI, SchedClassID)) - return std::move(Err); - if (auto Err = populateReads(*ID, MCI, SchedClassID)) + + if (Error Err = verifyOperands(MCDesc, MCI)) return std::move(Err); + populateWrites(*ID, MCI, SchedClassID); + populateReads(*ID, MCI, SchedClassID); + LLVM_DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n'); LLVM_DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n'); @@ -422,7 +557,8 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) { // Now add the new descriptor. SchedClassID = MCDesc.getSchedClass(); - if (!SM.getSchedClassDesc(SchedClassID)->isVariant()) { + bool IsVariadic = MCDesc.isVariadic(); + if (!IsVariadic && !IsVariant) { Descriptors[MCI.getOpcode()] = std::move(ID); return *Descriptors[MCI.getOpcode()]; } @@ -453,12 +589,16 @@ InstrBuilder::createInstruction(const MCInst &MCI) { // Check if this is a dependency breaking instruction. APInt Mask; - unsigned ProcID = STI.getSchedModel().getProcessorID(); - bool IsZeroIdiom = MCIA.isZeroIdiom(MCI, Mask, ProcID); - bool IsDepBreaking = - IsZeroIdiom || MCIA.isDependencyBreaking(MCI, Mask, ProcID); - if (MCIA.isOptimizableRegisterMove(MCI, ProcID)) - NewIS->setOptimizableMove(); + bool IsZeroIdiom = false; + bool IsDepBreaking = false; + if (MCIA) { + unsigned ProcID = STI.getSchedModel().getProcessorID(); + IsZeroIdiom = MCIA->isZeroIdiom(MCI, Mask, ProcID); + IsDepBreaking = + IsZeroIdiom || MCIA->isDependencyBreaking(MCI, Mask, ProcID); + if (MCIA->isOptimizableRegisterMove(MCI, ProcID)) + NewIS->setOptimizableMove(); + } // Initialize Reads first. for (const ReadDescriptor &RD : D.Reads) { @@ -515,7 +655,8 @@ InstrBuilder::createInstruction(const MCInst &MCI) { // Now query the MCInstrAnalysis object to obtain information about which // register writes implicitly clear the upper portion of a super-register. - MCIA.clearsSuperRegisters(MRI, MCI, WriteMask); + if (MCIA) + MCIA->clearsSuperRegisters(MRI, MCI, WriteMask); // Initialize writes. unsigned WriteIndex = 0; @@ -529,9 +670,9 @@ InstrBuilder::createInstruction(const MCInst &MCI) { } assert(RegID && "Expected a valid register ID!"); - NewIS->getDefs().emplace_back( - WD, RegID, /* ClearsSuperRegs */ WriteMask[WriteIndex], - /* WritesZero */ IsZeroIdiom); + NewIS->getDefs().emplace_back(WD, RegID, + /* ClearsSuperRegs */ WriteMask[WriteIndex], + /* WritesZero */ IsZeroIdiom); ++WriteIndex; } diff --git a/tools/llvm-mca/lib/Instruction.cpp b/lib/MCA/Instruction.cpp similarity index 83% rename from tools/llvm-mca/lib/Instruction.cpp rename to lib/MCA/Instruction.cpp index 832a6199f00ca363891cd3e5c780c2bb921f9b96..057e95ca9990fdf967cda1c741c5e8e2ce453db7 100644 --- a/tools/llvm-mca/lib/Instruction.cpp +++ b/lib/MCA/Instruction.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "Instruction.h" +#include "llvm/MCA/Instruction.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -49,6 +49,10 @@ void WriteState::onInstructionIssued() { unsigned ReadCycles = std::max(0, CyclesLeft - User.second); RS->writeStartEvent(ReadCycles); } + + // Notify any writes that are in a false dependency with this write. + if (PartialWrite) + PartialWrite->writeStartEvent(CyclesLeft); } void WriteState::addUser(ReadState *User, int ReadAdvance) { @@ -61,8 +65,22 @@ void WriteState::addUser(ReadState *User, int ReadAdvance) { return; } - std::pair NewPair(User, ReadAdvance); - Users.insert(NewPair); + if (llvm::find_if(Users, [&User](const std::pair &Use) { + return Use.first == User; + }) == Users.end()) { + Users.emplace_back(User, ReadAdvance); + } +} + +void WriteState::addUser(WriteState *User) { + if (CyclesLeft != UNKNOWN_CYCLES) { + User->writeStartEvent(std::max(0, CyclesLeft)); + return; + } + + assert(!PartialWrite && "PartialWrite already set!"); + PartialWrite = User; + User->setDependentWrite(this); } void WriteState::cycleEvent() { @@ -71,6 +89,9 @@ void WriteState::cycleEvent() { // specify a negative ReadAdvance. if (CyclesLeft != UNKNOWN_CYCLES) CyclesLeft--; + + if (DependentWriteCyclesLeft) + DependentWriteCyclesLeft--; } void ReadState::cycleEvent() { @@ -143,13 +164,11 @@ void Instruction::update() { // A partial register write cannot complete before a dependent write. auto IsDefReady = [&](const WriteState &Def) { - if (const WriteState *Write = Def.getDependentWrite()) { - int WriteLatency = Write->getCyclesLeft(); - if (WriteLatency == UNKNOWN_CYCLES) - return false; - return static_cast(WriteLatency) < getLatency(); + if (!Def.getDependentWrite()) { + unsigned CyclesLeft = Def.getDependentWriteCyclesLeft(); + return !CyclesLeft || CyclesLeft < getLatency(); } - return true; + return false; }; if (all_of(getDefs(), IsDefReady)) @@ -164,6 +183,9 @@ void Instruction::cycleEvent() { for (ReadState &Use : getUses()) Use.cycleEvent(); + for (WriteState &Def : getDefs()) + Def.cycleEvent(); + update(); return; } diff --git a/tools/llvm-mca/lib/LLVMBuild.txt b/lib/MCA/LLVMBuild.txt similarity index 100% rename from tools/llvm-mca/lib/LLVMBuild.txt rename to lib/MCA/LLVMBuild.txt diff --git a/tools/llvm-mca/lib/Pipeline.cpp b/lib/MCA/Pipeline.cpp similarity index 94% rename from tools/llvm-mca/lib/Pipeline.cpp rename to lib/MCA/Pipeline.cpp index 309f415913d746b53639293e354266cc503bca9d..fd97ea624b839c4be09a0357710017973b987e84 100644 --- a/tools/llvm-mca/lib/Pipeline.cpp +++ b/lib/MCA/Pipeline.cpp @@ -13,8 +13,8 @@ /// //===----------------------------------------------------------------------===// -#include "Pipeline.h" -#include "HWEventListener.h" +#include "llvm/MCA/Pipeline.h" +#include "llvm/MCA/HWEventListener.h" #include "llvm/Support/Debug.h" namespace llvm { @@ -35,18 +35,18 @@ bool Pipeline::hasWorkToProcess() { }); } -Error Pipeline::run() { +Expected Pipeline::run() { assert(!Stages.empty() && "Unexpected empty pipeline found!"); do { notifyCycleBegin(); if (Error Err = runCycle()) - return Err; + return std::move(Err); notifyCycleEnd(); ++Cycles; } while (hasWorkToProcess()); - return ErrorSuccess(); + return Cycles; } Error Pipeline::runCycle() { diff --git a/tools/llvm-mca/lib/Stages/DispatchStage.cpp b/lib/MCA/Stages/DispatchStage.cpp similarity index 95% rename from tools/llvm-mca/lib/Stages/DispatchStage.cpp rename to lib/MCA/Stages/DispatchStage.cpp index 838dbad22e3ad5c03ffb5d1c8fda39374bb914d0..7fb4eb6a1c0ef85b3b1a6a03157a2df1197f3cfe 100644 --- a/tools/llvm-mca/lib/Stages/DispatchStage.cpp +++ b/lib/MCA/Stages/DispatchStage.cpp @@ -16,9 +16,9 @@ /// //===----------------------------------------------------------------------===// -#include "Stages/DispatchStage.h" -#include "HWEventListener.h" -#include "HardwareUnits/Scheduler.h" +#include "llvm/MCA/Stages/DispatchStage.h" +#include "llvm/MCA/HWEventListener.h" +#include "llvm/MCA/HardwareUnits/Scheduler.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "llvm-mca" @@ -99,6 +99,10 @@ Error DispatchStage::dispatch(InstRef IR) { AvailableEntries -= NumMicroOps; } + // Check if this instructions ends the dispatch group. + if (Desc.EndGroup) + AvailableEntries = 0; + // Check if this is an optimizable reg-reg move. bool IsEliminated = false; if (IS.isOptimizableMove()) { @@ -164,6 +168,10 @@ bool DispatchStage::isAvailable(const InstRef &IR) const { unsigned Required = std::min(Desc.NumMicroOps, DispatchWidth); if (Required > AvailableEntries) return false; + + if (Desc.BeginGroup && AvailableEntries != DispatchWidth) + return false; + // The dispatch logic doesn't internally buffer instructions. It only accepts // instructions that can be successfully moved to the next stage during this // same cycle. diff --git a/tools/llvm-mca/lib/Stages/FetchStage.cpp b/lib/MCA/Stages/EntryStage.cpp similarity index 66% rename from tools/llvm-mca/lib/Stages/FetchStage.cpp rename to lib/MCA/Stages/EntryStage.cpp index 6e91dd6121d35fb18f00fdd6583cdeb22029fef9..3325bb36f5afe91e9bcf9f04064f2029fe8975c0 100644 --- a/tools/llvm-mca/lib/Stages/FetchStage.cpp +++ b/lib/MCA/Stages/EntryStage.cpp @@ -1,4 +1,4 @@ -//===---------------------- FetchStage.cpp ----------------------*- C++ -*-===// +//===---------------------- EntryStage.cpp ----------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -13,32 +13,32 @@ /// //===----------------------------------------------------------------------===// -#include "Stages/FetchStage.h" -#include "Instruction.h" +#include "llvm/MCA/Stages/EntryStage.h" +#include "llvm/MCA/Instruction.h" namespace llvm { namespace mca { -bool FetchStage::hasWorkToComplete() const { return CurrentInstruction; } +bool EntryStage::hasWorkToComplete() const { return CurrentInstruction; } -bool FetchStage::isAvailable(const InstRef & /* unused */) const { +bool EntryStage::isAvailable(const InstRef & /* unused */) const { if (CurrentInstruction) return checkNextStage(CurrentInstruction); return false; } -void FetchStage::getNextInstruction() { +void EntryStage::getNextInstruction() { assert(!CurrentInstruction && "There is already an instruction to process!"); if (!SM.hasNext()) return; SourceRef SR = SM.peekNext(); std::unique_ptr Inst = llvm::make_unique(SR.second); CurrentInstruction = InstRef(SR.first, Inst.get()); - Instructions[SR.first] = std::move(Inst); + Instructions.emplace_back(std::move(Inst)); SM.updateNext(); } -llvm::Error FetchStage::execute(InstRef & /*unused */) { +llvm::Error EntryStage::execute(InstRef & /*unused */) { assert(CurrentInstruction && "There is no instruction to process!"); if (llvm::Error Val = moveToTheNextStage(CurrentInstruction)) return Val; @@ -49,22 +49,25 @@ llvm::Error FetchStage::execute(InstRef & /*unused */) { return llvm::ErrorSuccess(); } -llvm::Error FetchStage::cycleStart() { +llvm::Error EntryStage::cycleStart() { if (!CurrentInstruction) getNextInstruction(); return llvm::ErrorSuccess(); } -llvm::Error FetchStage::cycleEnd() { +llvm::Error EntryStage::cycleEnd() { // Find the first instruction which hasn't been retired. - const InstMap::iterator It = - llvm::find_if(Instructions, [](const InstMap::value_type &KeyValuePair) { - return !KeyValuePair.second->isRetired(); - }); + auto Range = make_range(&Instructions[NumRetired], Instructions.end()); + auto It = find_if(Range, [](const std::unique_ptr &I) { + return !I->isRetired(); + }); + NumRetired = std::distance(Instructions.begin(), It); // Erase instructions up to the first that hasn't been retired. - if (It != Instructions.begin()) + if ((NumRetired * 2) >= Instructions.size()) { Instructions.erase(Instructions.begin(), It); + NumRetired = 0; + } return llvm::ErrorSuccess(); } diff --git a/tools/llvm-mca/lib/Stages/ExecuteStage.cpp b/lib/MCA/Stages/ExecuteStage.cpp similarity index 99% rename from tools/llvm-mca/lib/Stages/ExecuteStage.cpp rename to lib/MCA/Stages/ExecuteStage.cpp index 298f08a2887f951117d3a79d629ee705a8e1d66e..17f7ff7259a3e963bc5e003c011f417ab130c4e6 100644 --- a/tools/llvm-mca/lib/Stages/ExecuteStage.cpp +++ b/lib/MCA/Stages/ExecuteStage.cpp @@ -15,7 +15,7 @@ /// //===----------------------------------------------------------------------===// -#include "Stages/ExecuteStage.h" +#include "llvm/MCA/Stages/ExecuteStage.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" diff --git a/tools/llvm-mca/lib/Stages/InstructionTables.cpp b/lib/MCA/Stages/InstructionTables.cpp similarity index 98% rename from tools/llvm-mca/lib/Stages/InstructionTables.cpp rename to lib/MCA/Stages/InstructionTables.cpp index 33c30e7f95c0f7142b0732df0a20d858f0550cbf..f918c183aa5a0486452a4cd5dac3f80deb99f1f3 100644 --- a/tools/llvm-mca/lib/Stages/InstructionTables.cpp +++ b/lib/MCA/Stages/InstructionTables.cpp @@ -15,7 +15,7 @@ /// //===----------------------------------------------------------------------===// -#include "Stages/InstructionTables.h" +#include "llvm/MCA/Stages/InstructionTables.h" namespace llvm { namespace mca { diff --git a/tools/llvm-mca/lib/Stages/RetireStage.cpp b/lib/MCA/Stages/RetireStage.cpp similarity index 95% rename from tools/llvm-mca/lib/Stages/RetireStage.cpp rename to lib/MCA/Stages/RetireStage.cpp index 47eed5f2c9c651beefae47f364ee0f9d764dda85..d6bcc518662f5323e87c2a21c93a237e328b667f 100644 --- a/tools/llvm-mca/lib/Stages/RetireStage.cpp +++ b/lib/MCA/Stages/RetireStage.cpp @@ -14,8 +14,8 @@ /// //===----------------------------------------------------------------------===// -#include "Stages/RetireStage.h" -#include "HWEventListener.h" +#include "llvm/MCA/Stages/RetireStage.h" +#include "llvm/MCA/HWEventListener.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "llvm-mca" diff --git a/tools/llvm-mca/lib/Stages/Stage.cpp b/lib/MCA/Stages/Stage.cpp similarity index 95% rename from tools/llvm-mca/lib/Stages/Stage.cpp rename to lib/MCA/Stages/Stage.cpp index c3cfe47d24e19feba002cec357a148355685da91..38191645e736dab45a0543e5cf7800c74b9aaaed 100644 --- a/tools/llvm-mca/lib/Stages/Stage.cpp +++ b/lib/MCA/Stages/Stage.cpp @@ -13,7 +13,7 @@ /// //===----------------------------------------------------------------------===// -#include "Stages/Stage.h" +#include "llvm/MCA/Stages/Stage.h" namespace llvm { namespace mca { diff --git a/tools/llvm-mca/lib/Support.cpp b/lib/MCA/Support.cpp similarity index 98% rename from tools/llvm-mca/lib/Support.cpp rename to lib/MCA/Support.cpp index a6ff26dafb5baa49e52d1fd3291c3bd22596c712..3271bc6bf5b4f103fcd046cd5c48547dccc96f5e 100644 --- a/tools/llvm-mca/lib/Support.cpp +++ b/lib/MCA/Support.cpp @@ -13,7 +13,7 @@ /// //===----------------------------------------------------------------------===// -#include "Support.h" +#include "llvm/MCA/Support.h" #include "llvm/MC/MCSchedule.h" namespace llvm { diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index f09401307b40f6b2f6e1d0981d933c1c0eb27666..e17f0e2783e1e838c728cbb2d33a9283ca7e9c78 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -938,6 +938,18 @@ iterator_range COFFObjectFile::base_relocs() const { return make_range(base_reloc_begin(), base_reloc_end()); } +std::error_code +COFFObjectFile::getCOFFHeader(const coff_file_header *&Res) const { + Res = COFFHeader; + return std::error_code(); +} + +std::error_code +COFFObjectFile::getCOFFBigObjHeader(const coff_bigobj_file_header *&Res) const { + Res = COFFBigObjHeader; + return std::error_code(); +} + std::error_code COFFObjectFile::getPE32Header(const pe32_header *&Res) const { Res = PE32Header; return std::error_code(); @@ -1284,6 +1296,12 @@ bool COFFObjectFile::isRelocatableObject() const { return !DataDirectory; } +StringRef COFFObjectFile::mapDebugSectionName(StringRef Name) const { + return StringSwitch(Name) + .Case("eh_fram", "eh_frame") + .Default(Name); +} + bool ImportDirectoryEntryRef:: operator==(const ImportDirectoryEntryRef &Other) const { return ImportTable == Other.ImportTable && Index == Other.Index; diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp index 2edab0b13735eca0432f1d6a6080212286008ff0..cf8313f88f9394ed4a68b74295f9662714ed9da2 100644 --- a/lib/Object/ELF.cpp +++ b/lib/Object/ELF.cpp @@ -154,7 +154,7 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine, #undef ELF_RELOC -uint32_t llvm::object::getELFRelrRelocationType(uint32_t Machine) { +uint32_t llvm::object::getELFRelativeRelocationType(uint32_t Machine) { switch (Machine) { case ELF::EM_X86_64: return ELF::R_X86_64_RELATIVE; @@ -300,7 +300,7 @@ ELFFile::decode_relrs(Elf_Relr_Range relrs) const { Elf_Rela Rela; Rela.r_info = 0; Rela.r_addend = 0; - Rela.setType(getRelrRelocationType(), false); + Rela.setType(getRelativeRelocationType(), false); std::vector Relocs; // Word type: uint32_t for Elf32, and uint64_t for Elf64. diff --git a/lib/Object/ModuleSymbolTable.cpp b/lib/Object/ModuleSymbolTable.cpp index b353ef3c835bca6396f7b7a165f6f5ed561f421b..33ce7d8109fb8a74083dd6a85205df2603d45346 100644 --- a/lib/Object/ModuleSymbolTable.cpp +++ b/lib/Object/ModuleSymbolTable.cpp @@ -100,6 +100,7 @@ initializeRecordStreamer(const Module &M, MCObjectFileInfo MOFI; MCContext MCCtx(MAI.get(), MRI.get(), &MOFI); MOFI.InitMCObjectFileInfo(TT, /*PIC*/ false, MCCtx); + MOFI.setSDKVersion(M.getSDKVersion()); RecordStreamer Streamer(MCCtx, M); T->createNullTargetStreamer(Streamer); diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp index 5fd823e0117ed6b942625dcb3b9eb0c074691a34..f5de2e1d5ce23b8cae7d7ff4d956ad589d61b22f 100644 --- a/lib/Object/Object.cpp +++ b/lib/Object/Object.cpp @@ -105,7 +105,7 @@ void LLVMMoveToContainingSection(LLVMSectionIteratorRef Sect, if (!SecOrErr) { std::string Buf; raw_string_ostream OS(Buf); - logAllUnhandledErrors(SecOrErr.takeError(), OS, ""); + logAllUnhandledErrors(SecOrErr.takeError(), OS); OS.flush(); report_fatal_error(Buf); } @@ -187,7 +187,7 @@ const char *LLVMGetSymbolName(LLVMSymbolIteratorRef SI) { if (!Ret) { std::string Buf; raw_string_ostream OS(Buf); - logAllUnhandledErrors(Ret.takeError(), OS, ""); + logAllUnhandledErrors(Ret.takeError(), OS); OS.flush(); report_fatal_error(Buf); } @@ -199,7 +199,7 @@ uint64_t LLVMGetSymbolAddress(LLVMSymbolIteratorRef SI) { if (!Ret) { std::string Buf; raw_string_ostream OS(Buf); - logAllUnhandledErrors(Ret.takeError(), OS, ""); + logAllUnhandledErrors(Ret.takeError(), OS); OS.flush(); report_fatal_error(Buf); } @@ -229,7 +229,7 @@ const char *LLVMGetRelocationTypeName(LLVMRelocationIteratorRef RI) { SmallVector ret; (*unwrap(RI))->getTypeName(ret); char *str = static_cast(safe_malloc(ret.size())); - std::copy(ret.begin(), ret.end(), str); + llvm::copy(ret, str); return str; } diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp index db0ff220c4d89947658b8931c7135632ba582393..cf63b89adc122a516d69a8adc37f368b8b027404 100644 --- a/lib/Object/ObjectFile.cpp +++ b/lib/Object/ObjectFile.cpp @@ -77,6 +77,14 @@ bool ObjectFile::isSectionBitcode(DataRefImpl Sec) const { bool ObjectFile::isSectionStripped(DataRefImpl Sec) const { return false; } +bool ObjectFile::isBerkeleyText(DataRefImpl Sec) const { + return isSectionText(Sec); +} + +bool ObjectFile::isBerkeleyData(DataRefImpl Sec) const { + return isSectionData(Sec); +} + section_iterator ObjectFile::getRelocatedSection(DataRefImpl Sec) const { return section_iterator(SectionRef(Sec, this)); } diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index 3bd66f9375fe670eab34e7b8f41839c454a8505a..34e4d192e590a5f4b42287ac7b1ee16b3c6b89f7 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -24,6 +24,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LEB128.h" +#include "llvm/Support/ScopedPrinter.h" #include #include #include @@ -207,8 +208,8 @@ static wasm::WasmTable readTable(WasmObjectFile::ReadContext &Ctx) { return Table; } -static Error readSection(WasmSection &Section, - WasmObjectFile::ReadContext &Ctx) { +static Error readSection(WasmSection &Section, WasmObjectFile::ReadContext &Ctx, + WasmSectionOrderChecker &Checker) { Section.Offset = Ctx.Ptr - Ctx.Start; Section.Type = readUint8(Ctx); LLVM_DEBUG(dbgs() << "readSection type=" << Section.Type << "\n"); @@ -231,6 +232,13 @@ static Error readSection(WasmSection &Section, Ctx.Ptr += SectionNameSize; Size -= SectionNameSize; } + + if (!Checker.isValidSectionOrder(Section.Type, Section.Name)) { + return make_error("Out of order section type: " + + llvm::to_string(Section.Type), + object_error::parse_failed); + } + Section.Content = ArrayRef(Ctx.Ptr, Size); Ctx.Ptr += Size; return Error::success(); @@ -265,8 +273,9 @@ WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err) } WasmSection Sec; + WasmSectionOrderChecker Checker; while (Ctx.Ptr < Ctx.End) { - if ((Err = readSection(Sec, Ctx))) + if ((Err = readSection(Sec, Ctx, Checker))) return; if ((Err = parseSection(Sec))) return; @@ -295,6 +304,8 @@ Error WasmObjectFile::parseSection(WasmSection &Sec) { return parseMemorySection(Ctx); case wasm::WASM_SEC_GLOBAL: return parseGlobalSection(Ctx); + case wasm::WASM_SEC_EVENT: + return parseEventSection(Ctx); case wasm::WASM_SEC_EXPORT: return parseExportSection(Ctx); case wasm::WASM_SEC_START: @@ -311,6 +322,22 @@ Error WasmObjectFile::parseSection(WasmSection &Sec) { } } +Error WasmObjectFile::parseDylinkSection(ReadContext &Ctx) { + // See https://github.com/WebAssembly/tool-conventions/blob/master/DynamicLinking.md + DylinkInfo.MemorySize = readVaruint32(Ctx); + DylinkInfo.MemoryAlignment = readVaruint32(Ctx); + DylinkInfo.TableSize = readVaruint32(Ctx); + DylinkInfo.TableAlignment = readVaruint32(Ctx); + uint32_t Count = readVaruint32(Ctx); + while (Count--) { + DylinkInfo.Needed.push_back(readString(Ctx)); + } + if (Ctx.Ptr != Ctx.End) + return make_error("dylink section ended prematurely", + object_error::parse_failed); + return Error::success(); +} + Error WasmObjectFile::parseNameSection(ReadContext &Ctx) { llvm::DenseSet Seen; if (Functions.size() != FunctionTypes.size()) { @@ -439,19 +466,24 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) { std::vector ImportedGlobals; std::vector ImportedFunctions; + std::vector ImportedEvents; ImportedGlobals.reserve(Imports.size()); ImportedFunctions.reserve(Imports.size()); + ImportedEvents.reserve(Imports.size()); for (auto &I : Imports) { if (I.Kind == wasm::WASM_EXTERNAL_FUNCTION) ImportedFunctions.emplace_back(&I); else if (I.Kind == wasm::WASM_EXTERNAL_GLOBAL) ImportedGlobals.emplace_back(&I); + else if (I.Kind == wasm::WASM_EXTERNAL_EVENT) + ImportedEvents.emplace_back(&I); } while (Count--) { wasm::WasmSymbolInfo Info; - const wasm::WasmSignature *FunctionType = nullptr; + const wasm::WasmSignature *Signature = nullptr; const wasm::WasmGlobalType *GlobalType = nullptr; + const wasm::WasmEventType *EventType = nullptr; Info.Kind = readUint8(Ctx); Info.Flags = readVaruint32(Ctx); @@ -467,13 +499,13 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) { if (IsDefined) { Info.Name = readString(Ctx); unsigned FuncIndex = Info.ElementIndex - NumImportedFunctions; - FunctionType = &Signatures[FunctionTypes[FuncIndex]]; + Signature = &Signatures[FunctionTypes[FuncIndex]]; wasm::WasmFunction &Function = Functions[FuncIndex]; if (Function.SymbolName.empty()) Function.SymbolName = Info.Name; } else { wasm::WasmImport &Import = *ImportedFunctions[Info.ElementIndex]; - FunctionType = &Signatures[Import.SigIndex]; + Signature = &Signatures[Import.SigIndex]; Info.Name = Import.Field; Info.Module = Import.Module; } @@ -532,6 +564,34 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) { break; } + case wasm::WASM_SYMBOL_TYPE_EVENT: { + Info.ElementIndex = readVaruint32(Ctx); + if (!isValidEventIndex(Info.ElementIndex) || + IsDefined != isDefinedEventIndex(Info.ElementIndex)) + return make_error("invalid event symbol index", + object_error::parse_failed); + if (!IsDefined && (Info.Flags & wasm::WASM_SYMBOL_BINDING_MASK) == + wasm::WASM_SYMBOL_BINDING_WEAK) + return make_error("undefined weak global symbol", + object_error::parse_failed); + if (IsDefined) { + Info.Name = readString(Ctx); + unsigned EventIndex = Info.ElementIndex - NumImportedEvents; + wasm::WasmEvent &Event = Events[EventIndex]; + Signature = &Signatures[Event.Type.SigIndex]; + EventType = &Event.Type; + if (Event.SymbolName.empty()) + Event.SymbolName = Info.Name; + + } else { + wasm::WasmImport &Import = *ImportedEvents[Info.ElementIndex]; + EventType = &Import.Event; + Signature = &Signatures[EventType->SigIndex]; + Info.Name = Import.Field; + } + break; + } + default: return make_error("Invalid symbol type", object_error::parse_failed); @@ -544,8 +604,8 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) { Twine(Info.Name), object_error::parse_failed); LinkingData.SymbolTable.emplace_back(Info); - Symbols.emplace_back(LinkingData.SymbolTable.back(), FunctionType, - GlobalType); + Symbols.emplace_back(LinkingData.SymbolTable.back(), GlobalType, EventType, + Signature); LLVM_DEBUG(dbgs() << "Adding symbol: " << Symbols.back() << "\n"); } @@ -635,6 +695,11 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { return make_error("Bad relocation global index", object_error::parse_failed); break; + case wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB: + if (!isValidEventSymbol(Reloc.Index)) + return make_error("Bad relocation event index", + object_error::parse_failed); + break; case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB: case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB: case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32: @@ -683,7 +748,10 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { } Error WasmObjectFile::parseCustomSection(WasmSection &Sec, ReadContext &Ctx) { - if (Sec.Name == "name") { + if (Sec.Name == "dylink") { + if (Error Err = parseDylinkSection(Ctx)) + return Err; + } else if (Sec.Name == "name") { if (Error Err = parseNameSection(Ctx)) return Err; } else if (Sec.Name == "linking") { @@ -755,6 +823,11 @@ Error WasmObjectFile::parseImportSection(ReadContext &Ctx) { return make_error("Invalid table element type", object_error::parse_failed); break; + case wasm::WASM_EXTERNAL_EVENT: + NumImportedEvents++; + Im.Event.Attribute = readVarint32(Ctx); + Im.Event.SigIndex = readVarint32(Ctx); + break; default: return make_error("Unexpected import kind", object_error::parse_failed); @@ -831,6 +904,24 @@ Error WasmObjectFile::parseGlobalSection(ReadContext &Ctx) { return Error::success(); } +Error WasmObjectFile::parseEventSection(ReadContext &Ctx) { + EventSection = Sections.size(); + uint32_t Count = readVarint32(Ctx); + Events.reserve(Count); + while (Count--) { + wasm::WasmEvent Event; + Event.Index = NumImportedEvents + Events.size(); + Event.Type.Attribute = readVaruint32(Ctx); + Event.Type.SigIndex = readVarint32(Ctx); + Events.push_back(Event); + } + + if (Ctx.Ptr != Ctx.End) + return make_error("Event section ended prematurely", + object_error::parse_failed); + return Error::success(); +} + Error WasmObjectFile::parseExportSection(ReadContext &Ctx) { uint32_t Count = readVaruint32(Ctx); Exports.reserve(Count); @@ -850,6 +941,11 @@ Error WasmObjectFile::parseExportSection(ReadContext &Ctx) { return make_error("Invalid global export", object_error::parse_failed); break; + case wasm::WASM_EXTERNAL_EVENT: + if (!isValidEventIndex(Ex.Index)) + return make_error("Invalid event export", + object_error::parse_failed); + break; case wasm::WASM_EXTERNAL_MEMORY: case wasm::WASM_EXTERNAL_TABLE: break; @@ -881,6 +977,14 @@ bool WasmObjectFile::isDefinedGlobalIndex(uint32_t Index) const { return Index >= NumImportedGlobals && isValidGlobalIndex(Index); } +bool WasmObjectFile::isValidEventIndex(uint32_t Index) const { + return Index < NumImportedEvents + Events.size(); +} + +bool WasmObjectFile::isDefinedEventIndex(uint32_t Index) const { + return Index >= NumImportedEvents && isValidEventIndex(Index); +} + bool WasmObjectFile::isValidFunctionSymbol(uint32_t Index) const { return Index < Symbols.size() && Symbols[Index].isTypeFunction(); } @@ -889,6 +993,10 @@ bool WasmObjectFile::isValidGlobalSymbol(uint32_t Index) const { return Index < Symbols.size() && Symbols[Index].isTypeGlobal(); } +bool WasmObjectFile::isValidEventSymbol(uint32_t Index) const { + return Index < Symbols.size() && Symbols[Index].isTypeEvent(); +} + bool WasmObjectFile::isValidDataSymbol(uint32_t Index) const { return Index < Symbols.size() && Symbols[Index].isTypeData(); } @@ -907,6 +1015,11 @@ wasm::WasmGlobal &WasmObjectFile::getDefinedGlobal(uint32_t Index) { return Globals[Index - NumImportedGlobals]; } +wasm::WasmEvent &WasmObjectFile::getDefinedEvent(uint32_t Index) { + assert(isDefinedEventIndex(Index)); + return Events[Index - NumImportedEvents]; +} + Error WasmObjectFile::parseStartSection(ReadContext &Ctx) { StartFunction = readVaruint32(Ctx); if (!isValidFunctionIndex(StartFunction)) @@ -1070,6 +1183,7 @@ uint64_t WasmObjectFile::getWasmSymbolValue(const WasmSymbol &Sym) const { switch (Sym.Info.Kind) { case wasm::WASM_SYMBOL_TYPE_FUNCTION: case wasm::WASM_SYMBOL_TYPE_GLOBAL: + case wasm::WASM_SYMBOL_TYPE_EVENT: return Sym.Info.ElementIndex; case wasm::WASM_SYMBOL_TYPE_DATA: { // The value of a data symbol is the segment offset, plus the symbol @@ -1112,6 +1226,8 @@ WasmObjectFile::getSymbolType(DataRefImpl Symb) const { return SymbolRef::ST_Data; case wasm::WASM_SYMBOL_TYPE_SECTION: return SymbolRef::ST_Debug; + case wasm::WASM_SYMBOL_TYPE_EVENT: + return SymbolRef::ST_Other; } llvm_unreachable("Unknown WasmSymbol::SymbolType"); @@ -1135,10 +1251,12 @@ WasmObjectFile::getSymbolSection(DataRefImpl Symb) const { case wasm::WASM_SYMBOL_TYPE_DATA: Ref.d.a = DataSection; break; - case wasm::WASM_SYMBOL_TYPE_SECTION: { + case wasm::WASM_SYMBOL_TYPE_SECTION: Ref.d.a = Sym.Info.ElementIndex; break; - } + case wasm::WASM_SYMBOL_TYPE_EVENT: + Ref.d.a = EventSection; + break; default: llvm_unreachable("Unknown WasmSymbol::SymbolType"); } @@ -1161,6 +1279,7 @@ std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec, ECase(TABLE); ECase(MEMORY); ECase(GLOBAL); + ECase(EVENT); ECase(EXPORT); ECase(START); ECase(ELEM); @@ -1299,6 +1418,8 @@ SubtargetFeatures WasmObjectFile::getFeatures() const { bool WasmObjectFile::isRelocatableObject() const { return HasLinkingSection; } +bool WasmObjectFile::isSharedObject() const { return HasDylinkSection; } + const WasmSection &WasmObjectFile::getWasmSection(DataRefImpl Ref) const { assert(Ref.d.a < Sections.size()); return Sections[Ref.d.a]; @@ -1321,3 +1442,58 @@ WasmObjectFile::getWasmRelocation(DataRefImpl Ref) const { assert(Ref.d.b < Sec.Relocations.size()); return Sec.Relocations[Ref.d.b]; } + +int WasmSectionOrderChecker::getSectionOrder(unsigned ID, + StringRef CustomSectionName) { + switch (ID) { + case wasm::WASM_SEC_CUSTOM: + return StringSwitch(CustomSectionName) + .Case("dylink", WASM_SEC_ORDER_DYLINK) + .Case("linking", WASM_SEC_ORDER_LINKING) + .StartsWith("reloc.", WASM_SEC_ORDER_RELOC) + .Case("name", WASM_SEC_ORDER_NAME) + .Case("producers", WASM_SEC_ORDER_PRODUCERS) + .Default(-1); + case wasm::WASM_SEC_TYPE: + return WASM_SEC_ORDER_TYPE; + case wasm::WASM_SEC_IMPORT: + return WASM_SEC_ORDER_IMPORT; + case wasm::WASM_SEC_FUNCTION: + return WASM_SEC_ORDER_FUNCTION; + case wasm::WASM_SEC_TABLE: + return WASM_SEC_ORDER_TABLE; + case wasm::WASM_SEC_MEMORY: + return WASM_SEC_ORDER_MEMORY; + case wasm::WASM_SEC_GLOBAL: + return WASM_SEC_ORDER_GLOBAL; + case wasm::WASM_SEC_EXPORT: + return WASM_SEC_ORDER_EXPORT; + case wasm::WASM_SEC_START: + return WASM_SEC_ORDER_START; + case wasm::WASM_SEC_ELEM: + return WASM_SEC_ORDER_ELEM; + case wasm::WASM_SEC_CODE: + return WASM_SEC_ORDER_CODE; + case wasm::WASM_SEC_DATA: + return WASM_SEC_ORDER_DATA; + case wasm::WASM_SEC_DATACOUNT: + return WASM_SEC_ORDER_DATACOUNT; + case wasm::WASM_SEC_EVENT: + return WASM_SEC_ORDER_EVENT; + default: + llvm_unreachable("invalid section"); + } +} + +bool WasmSectionOrderChecker::isValidSectionOrder(unsigned ID, + StringRef CustomSectionName) { + int Order = getSectionOrder(ID, CustomSectionName); + if (Order == -1) // Skip unknown sections + return true; + // There can be multiple "reloc." sections. Otherwise there shouldn't be any + // duplicate section orders. + bool IsValid = (LastOrder == Order && Order == WASM_SEC_ORDER_RELOC) || + LastOrder < Order; + LastOrder = Order; + return IsValid; +} diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp index 1b7282f13db0a7cdc66310b5510ea5ff0a19b3be..65413dd8bea178170721258532099723448cb4ab 100644 --- a/lib/Object/WindowsResource.cpp +++ b/lib/Object/WindowsResource.cpp @@ -259,7 +259,7 @@ WindowsResourceParser::TreeNode::addChild(ArrayRef NameRef, std::vector EndianCorrectedName; if (sys::IsBigEndianHost) { EndianCorrectedName.resize(NameRef.size() + 1); - std::copy(NameRef.begin(), NameRef.end(), EndianCorrectedName.begin() + 1); + llvm::copy(NameRef, EndianCorrectedName.begin() + 1); EndianCorrectedName[0] = UNI_UTF16_BYTE_ORDER_MARK_SWAPPED; CorrectedName = makeArrayRef(EndianCorrectedName); } else @@ -501,8 +501,7 @@ void WindowsResourceCOFFWriter::writeFirstSection() { void WindowsResourceCOFFWriter::writeSecondSection() { // Now write the .rsrc$02 section. for (auto const &RawDataEntry : Data) { - std::copy(RawDataEntry.begin(), RawDataEntry.end(), - BufferStart + CurrentOffset); + llvm::copy(RawDataEntry, BufferStart + CurrentOffset); CurrentOffset += alignTo(RawDataEntry.size(), sizeof(uint64_t)); } @@ -672,7 +671,7 @@ void WindowsResourceCOFFWriter::writeDirectoryStringTable() { support::endian::write16le(BufferStart + CurrentOffset, Length); CurrentOffset += sizeof(uint16_t); auto *Start = reinterpret_cast(BufferStart + CurrentOffset); - std::copy(String.begin(), String.end(), Start); + llvm::copy(String, Start); CurrentOffset += Length * sizeof(UTF16); TotalStringTableSize += Length * sizeof(UTF16) + sizeof(uint16_t); } diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp index 189d71782bd85d7f840a760f6c689d288b5f9975..b9f52432fc802b3accf8ed864c6dd17908e9e31b 100644 --- a/lib/ObjectYAML/ELFYAML.cpp +++ b/lib/ObjectYAML/ELFYAML.cpp @@ -337,10 +337,18 @@ void ScalarBitSetTraits::bitset(IO &IO, BCase(EF_HEXAGON_MACH_V3); BCase(EF_HEXAGON_MACH_V4); BCase(EF_HEXAGON_MACH_V5); + BCase(EF_HEXAGON_MACH_V55); + BCase(EF_HEXAGON_MACH_V60); + BCase(EF_HEXAGON_MACH_V62); + BCase(EF_HEXAGON_MACH_V65); BCase(EF_HEXAGON_ISA_V2); BCase(EF_HEXAGON_ISA_V3); BCase(EF_HEXAGON_ISA_V4); BCase(EF_HEXAGON_ISA_V5); + BCase(EF_HEXAGON_ISA_V55); + BCase(EF_HEXAGON_ISA_V60); + BCase(EF_HEXAGON_ISA_V62); + BCase(EF_HEXAGON_ISA_V65); break; case ELF::EM_AVR: BCase(EF_AVR_ARCH_AVR1); diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp index dba950af5892941ef6e79cd3bbb7fa542f040ebd..b97803340c0d9e4cf07fbb353f7629304db1ad13 100644 --- a/lib/ObjectYAML/WasmYAML.cpp +++ b/lib/ObjectYAML/WasmYAML.cpp @@ -48,6 +48,16 @@ static void commonSectionMapping(IO &IO, WasmYAML::Section &Section) { IO.mapOptional("Relocations", Section.Relocations); } +static void sectionMapping(IO &IO, WasmYAML::DylinkSection &Section) { + commonSectionMapping(IO, Section); + IO.mapRequired("Name", Section.Name); + IO.mapRequired("MemorySize", Section.MemorySize); + IO.mapRequired("MemoryAlignment", Section.MemoryAlignment); + IO.mapRequired("TableSize", Section.TableSize); + IO.mapRequired("TableAlignment", Section.TableAlignment); + IO.mapRequired("Needed", Section.Needed); +} + static void sectionMapping(IO &IO, WasmYAML::NameSection &Section) { commonSectionMapping(IO, Section); IO.mapRequired("Name", Section.Name); @@ -100,6 +110,11 @@ static void sectionMapping(IO &IO, WasmYAML::GlobalSection &Section) { IO.mapOptional("Globals", Section.Globals); } +static void sectionMapping(IO &IO, WasmYAML::EventSection &Section) { + commonSectionMapping(IO, Section); + IO.mapOptional("Events", Section.Events); +} + static void sectionMapping(IO &IO, WasmYAML::ExportSection &Section) { commonSectionMapping(IO, Section); IO.mapOptional("Exports", Section.Exports); @@ -142,7 +157,11 @@ void MappingTraits>::mapping( } else { IO.mapRequired("Name", SectionName); } - if (SectionName == "linking") { + if (SectionName == "dylink") { + if (!IO.outputting()) + Section.reset(new WasmYAML::DylinkSection()); + sectionMapping(IO, *cast(Section.get())); + } else if (SectionName == "linking") { if (!IO.outputting()) Section.reset(new WasmYAML::LinkingSection()); sectionMapping(IO, *cast(Section.get())); @@ -187,6 +206,11 @@ void MappingTraits>::mapping( Section.reset(new WasmYAML::GlobalSection()); sectionMapping(IO, *cast(Section.get())); break; + case wasm::WASM_SEC_EVENT: + if (!IO.outputting()) + Section.reset(new WasmYAML::EventSection()); + sectionMapping(IO, *cast(Section.get())); + break; case wasm::WASM_SEC_EXPORT: if (!IO.outputting()) Section.reset(new WasmYAML::ExportSection()); @@ -227,6 +251,7 @@ void ScalarEnumerationTraits::enumeration( ECase(TABLE); ECase(MEMORY); ECase(GLOBAL); + ECase(EVENT); ECase(EXPORT); ECase(START); ECase(ELEM); @@ -307,6 +332,9 @@ void MappingTraits::mapping(IO &IO, } else if (Import.Kind == wasm::WASM_EXTERNAL_GLOBAL) { IO.mapRequired("GlobalType", Import.GlobalImport.Type); IO.mapRequired("GlobalMutable", Import.GlobalImport.Mutable); + } else if (Import.Kind == wasm::WASM_EXTERNAL_EVENT) { + IO.mapRequired("EventAttribute", Import.EventImport.Attribute); + IO.mapRequired("EventSigIndex", Import.EventImport.SigIndex); } else if (Import.Kind == wasm::WASM_EXTERNAL_TABLE) { IO.mapRequired("Table", Import.TableImport); } else if (Import.Kind == wasm::WASM_EXTERNAL_MEMORY) { @@ -399,6 +427,8 @@ void MappingTraits::mapping(IO &IO, IO.mapRequired("Function", Info.ElementIndex); } else if (Info.Kind == wasm::WASM_SYMBOL_TYPE_GLOBAL) { IO.mapRequired("Global", Info.ElementIndex); + } else if (Info.Kind == wasm::WASM_SYMBOL_TYPE_EVENT) { + IO.mapRequired("Event", Info.ElementIndex); } else if (Info.Kind == wasm::WASM_SYMBOL_TYPE_DATA) { if ((Info.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0) { IO.mapRequired("Segment", Info.DataRef.Segment); @@ -412,6 +442,12 @@ void MappingTraits::mapping(IO &IO, } } +void MappingTraits::mapping(IO &IO, WasmYAML::Event &Event) { + IO.mapRequired("Index", Event.Index); + IO.mapRequired("Attribute", Event.Attribute); + IO.mapRequired("SigIndex", Event.SigIndex); +} + void ScalarBitSetTraits::bitset( IO &IO, WasmYAML::LimitFlags &Value) { #define BCase(X) IO.bitSetCase(Value, #X, wasm::WASM_LIMITS_FLAG_##X) @@ -443,6 +479,7 @@ void ScalarEnumerationTraits::enumeration( ECase(DATA); ECase(GLOBAL); ECase(SECTION); + ECase(EVENT); #undef ECase } @@ -467,6 +504,7 @@ void ScalarEnumerationTraits::enumeration( ECase(TABLE); ECase(MEMORY); ECase(GLOBAL); + ECase(EVENT); #undef ECase } diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 0c6dfff06f1887b1306ffaab762fd2a2fff1ed63..4da8f549d9bf9c5d6c7fa8fbaee4606255e6bcd2 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -48,6 +48,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/StackSafetyAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" @@ -62,7 +63,6 @@ #include "llvm/Support/Regex.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" -#include "llvm/Transforms/Instrumentation/CGProfile.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/ArgumentPromotion.h" #include "llvm/Transforms/IPO/CalledValuePropagation.h" @@ -89,6 +89,7 @@ #include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Instrumentation/BoundsChecking.h" +#include "llvm/Transforms/Instrumentation/CGProfile.h" #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" #include "llvm/Transforms/Instrumentation/InstrProfiling.h" @@ -130,6 +131,7 @@ #include "llvm/Transforms/Scalar/LowerAtomic.h" #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h" +#include "llvm/Transforms/Scalar/MakeGuardsExplicit.h" #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" @@ -139,12 +141,14 @@ #include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h" #include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Scalar/SROA.h" +#include "llvm/Transforms/Scalar/Scalarizer.h" #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" #include "llvm/Transforms/Scalar/Sink.h" #include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h" #include "llvm/Transforms/Scalar/SpeculativeExecution.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" +#include "llvm/Transforms/Scalar/WarnMissedTransforms.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" #include "llvm/Transforms/Utils/BreakCriticalEdges.h" #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" @@ -155,6 +159,7 @@ #include "llvm/Transforms/Utils/Mem2Reg.h" #include "llvm/Transforms/Utils/NameAnonGlobals.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" +#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" @@ -831,6 +836,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level))); } OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions(Level))); + OptimizePM.addPass(WarnMissedTransformationsPass()); OptimizePM.addPass(InstCombinePass()); OptimizePM.addPass(RequireAnalysisPass()); OptimizePM.addPass(createFunctionToLoopPassAdaptor(LICMPass(), DebugLogging)); @@ -862,6 +868,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // inserting redudnancies into the progrem. This even includes SimplifyCFG. OptimizePM.addPass(SpeculateAroundPHIsPass()); + for (auto &C : OptimizerLastEPCallbacks) + C(OptimizePM, Level); + // Add the core optimizing pipeline. MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM))); @@ -1001,6 +1010,13 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging, assert(Level != O0 && "Must request optimizations for the default pipeline!"); ModulePassManager MPM(DebugLogging); + if (PGOOpt && !PGOOpt->SampleProfileFile.empty()) { + // Load sample profile before running the LTO optimization pipeline. + MPM.addPass(SampleProfileLoaderPass(PGOOpt->SampleProfileFile, + PGOOpt->ProfileRemappingFile, + false /* ThinLTOPhase::PreLink */)); + } + // Remove unused virtual tables to improve the quality of code generated by // whole-program devirtualization and bitset lowering. MPM.addPass(GlobalDCEPass()); diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def index 99df2ad2719cf78505d0a0ec76ed6ceea7e6b944..97f0d577b30d2ba45e4733ed94cb6e2ac889cbec 100644 --- a/lib/Passes/PassRegistry.def +++ b/lib/Passes/PassRegistry.def @@ -24,6 +24,7 @@ MODULE_ANALYSIS("lcg", LazyCallGraphAnalysis()) MODULE_ANALYSIS("module-summary", ModuleSummaryIndexAnalysis()) MODULE_ANALYSIS("no-op-module", NoOpModuleAnalysis()) MODULE_ANALYSIS("profile-summary", ProfileSummaryAnalysis()) +MODULE_ANALYSIS("stack-safety", StackSafetyGlobalAnalysis()) MODULE_ANALYSIS("targetlibinfo", TargetLibraryAnalysis()) MODULE_ANALYSIS("verify", VerifierAnalysis()) MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) @@ -71,6 +72,7 @@ MODULE_PASS("print-callgraph", CallGraphPrinterPass(dbgs())) MODULE_PASS("print", PrintModulePass(dbgs())) MODULE_PASS("print-lcg", LazyCallGraphPrinterPass(dbgs())) MODULE_PASS("print-lcg-dot", LazyCallGraphDOTPrinterPass(dbgs())) +MODULE_PASS("print-stack-safety", StackSafetyGlobalPrinterPass(dbgs())) MODULE_PASS("rewrite-statepoints-for-gc", RewriteStatepointsForGC()) MODULE_PASS("rewrite-symbols", RewriteSymbolPass()) MODULE_PASS("rpo-functionattrs", ReversePostOrderFunctionAttrsPass()) @@ -120,6 +122,7 @@ FUNCTION_ANALYSIS("regions", RegionInfoAnalysis()) FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis()) FUNCTION_ANALYSIS("opt-remark-emit", OptimizationRemarkEmitterAnalysis()) FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis()) +FUNCTION_ANALYSIS("stack-safety-local", StackSafetyAnalysis()) FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis()) FUNCTION_ANALYSIS("targetir", TM ? TM->getTargetIRAnalysis() : TargetIRAnalysis()) @@ -162,6 +165,7 @@ FUNCTION_PASS("dot-cfg-only", CFGOnlyPrinterPass()) FUNCTION_PASS("early-cse", EarlyCSEPass(/*UseMemorySSA=*/false)) FUNCTION_PASS("early-cse-memssa", EarlyCSEPass(/*UseMemorySSA=*/true)) FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass(/*PostInlining=*/false)) +FUNCTION_PASS("make-guards-explicit", MakeGuardsExplicitPass()) FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass(/*PostInlining=*/true)) FUNCTION_PASS("gvn-hoist", GVNHoistPass()) FUNCTION_PASS("instcombine", InstCombinePass()) @@ -175,6 +179,7 @@ FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass()) FUNCTION_PASS("lower-guard-intrinsic", LowerGuardIntrinsicPass()) FUNCTION_PASS("guard-widening", GuardWideningPass()) FUNCTION_PASS("gvn", GVN()) +FUNCTION_PASS("load-store-vectorizer", LoadStoreVectorizerPass()) FUNCTION_PASS("loop-simplify", LoopSimplifyPass()) FUNCTION_PASS("loop-sink", LoopSinkPass()) FUNCTION_PASS("lowerinvoke", LowerInvokePass()) @@ -204,7 +209,9 @@ FUNCTION_PASS("print", MemorySSAPrinterPass(dbgs())) FUNCTION_PASS("print", PhiValuesPrinterPass(dbgs())) FUNCTION_PASS("print", RegionInfoPrinterPass(dbgs())) FUNCTION_PASS("print", ScalarEvolutionPrinterPass(dbgs())) +FUNCTION_PASS("print", StackSafetyPrinterPass(dbgs())) FUNCTION_PASS("reassociate", ReassociatePass()) +FUNCTION_PASS("scalarizer", ScalarizerPass()) FUNCTION_PASS("sccp", SCCPPass()) FUNCTION_PASS("simplify-cfg", SimplifyCFGPass()) FUNCTION_PASS("sink", SinkingPass()) @@ -223,6 +230,7 @@ FUNCTION_PASS("verify", MemorySSAVerifierPass()) FUNCTION_PASS("verify", RegionInfoVerifierPass()) FUNCTION_PASS("view-cfg", CFGViewerPass()) FUNCTION_PASS("view-cfg-only", CFGOnlyViewerPass()) +FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) #undef FUNCTION_PASS #ifndef LOOP_ANALYSIS diff --git a/lib/Passes/StandardInstrumentations.cpp b/lib/Passes/StandardInstrumentations.cpp index 48d36e5a01e58ad9f9335dca6be6fb3bbe5875c8..765ffe626620108a47adcc1ab77788d66b6876a1 100644 --- a/lib/Passes/StandardInstrumentations.cpp +++ b/lib/Passes/StandardInstrumentations.cpp @@ -53,7 +53,6 @@ void unwrapAndPrint(StringRef Banner, Any IR) { Extra = formatv(" (function: {0})\n", F->getName()); } else if (any_isa(IR)) { const LazyCallGraph::SCC *C = any_cast(IR); - assert(C); if (!llvm::forcePrintModuleIR()) { Extra = formatv(" (scc: {0})\n", C->getName()); bool BannerPrinted = false; diff --git a/lib/Support/AArch64TargetParser.cpp b/lib/Support/AArch64TargetParser.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e897137df680fcfc99f344223b3ecf31c152790f --- /dev/null +++ b/lib/Support/AArch64TargetParser.cpp @@ -0,0 +1,206 @@ +//===-- AArch64TargetParser - Parser for AArch64 features -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a target parser to recognise AArch64 hardware features +// such as FPU/CPU/ARCH and extension names. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/AArch64TargetParser.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include + +using namespace llvm; + +static unsigned checkArchVersion(llvm::StringRef Arch) { + if (Arch.size() >= 2 && Arch[0] == 'v' && std::isdigit(Arch[1])) + return (Arch[1] - 48); + return 0; +} + +unsigned AArch64::getDefaultFPU(StringRef CPU, AArch64::ArchKind AK) { + if (CPU == "generic") + return AArch64ARCHNames[static_cast(AK)].DefaultFPU; + + return StringSwitch(CPU) +#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + .Case(NAME, ARM::DEFAULT_FPU) +#include "../../include/llvm/Support/AArch64TargetParser.def" + .Default(ARM::FK_INVALID); +} + +unsigned AArch64::getDefaultExtensions(StringRef CPU, AArch64::ArchKind AK) { + if (CPU == "generic") + return AArch64ARCHNames[static_cast(AK)].ArchBaseExtensions; + + return StringSwitch(CPU) +#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + .Case(NAME, AArch64ARCHNames[static_cast(ArchKind::ID)] \ + .ArchBaseExtensions | \ + DEFAULT_EXT) +#include "../../include/llvm/Support/AArch64TargetParser.def" + .Default(AArch64::AEK_INVALID); +} + +AArch64::ArchKind AArch64::getCPUArchKind(StringRef CPU) { + if (CPU == "generic") + return ArchKind::ARMV8A; + + return StringSwitch(CPU) +#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + .Case(NAME, ArchKind::ID) +#include "../../include/llvm/Support/AArch64TargetParser.def" + .Default(ArchKind::INVALID); +} + +bool AArch64::getExtensionFeatures(unsigned Extensions, + std::vector &Features) { + if (Extensions == AArch64::AEK_INVALID) + return false; + + if (Extensions & AEK_FP) + Features.push_back("+fp-armv8"); + if (Extensions & AEK_SIMD) + Features.push_back("+neon"); + if (Extensions & AEK_CRC) + Features.push_back("+crc"); + if (Extensions & AEK_CRYPTO) + Features.push_back("+crypto"); + if (Extensions & AEK_DOTPROD) + Features.push_back("+dotprod"); + if (Extensions & AEK_FP16FML) + Features.push_back("+fp16fml"); + if (Extensions & AEK_FP16) + Features.push_back("+fullfp16"); + if (Extensions & AEK_PROFILE) + Features.push_back("+spe"); + if (Extensions & AEK_RAS) + Features.push_back("+ras"); + if (Extensions & AEK_LSE) + Features.push_back("+lse"); + if (Extensions & AEK_RDM) + Features.push_back("+rdm"); + if (Extensions & AEK_SVE) + Features.push_back("+sve"); + if (Extensions & AEK_RCPC) + Features.push_back("+rcpc"); + + return true; +} + +bool AArch64::getArchFeatures(AArch64::ArchKind AK, + std::vector &Features) { + if (AK == ArchKind::ARMV8_1A) + Features.push_back("+v8.1a"); + if (AK == ArchKind::ARMV8_2A) + Features.push_back("+v8.2a"); + if (AK == ArchKind::ARMV8_3A) + Features.push_back("+v8.3a"); + if (AK == ArchKind::ARMV8_4A) + Features.push_back("+v8.4a"); + if (AK == ArchKind::ARMV8_5A) + Features.push_back("+v8.5a"); + + return AK != ArchKind::INVALID; +} + +StringRef AArch64::getArchName(AArch64::ArchKind AK) { + return AArch64ARCHNames[static_cast(AK)].getName(); +} + +StringRef AArch64::getCPUAttr(AArch64::ArchKind AK) { + return AArch64ARCHNames[static_cast(AK)].getCPUAttr(); +} + +StringRef AArch64::getSubArch(AArch64::ArchKind AK) { + return AArch64ARCHNames[static_cast(AK)].getSubArch(); +} + +unsigned AArch64::getArchAttr(AArch64::ArchKind AK) { + return AArch64ARCHNames[static_cast(AK)].ArchAttr; +} + +StringRef AArch64::getArchExtName(unsigned ArchExtKind) { + for (const auto &AE : AArch64ARCHExtNames) + if (ArchExtKind == AE.ID) + return AE.getName(); + return StringRef(); +} + +StringRef AArch64::getArchExtFeature(StringRef ArchExt) { + if (ArchExt.startswith("no")) { + StringRef ArchExtBase(ArchExt.substr(2)); + for (const auto &AE : AArch64ARCHExtNames) { + if (AE.NegFeature && ArchExtBase == AE.getName()) + return StringRef(AE.NegFeature); + } + } + + for (const auto &AE : AArch64ARCHExtNames) + if (AE.Feature && ArchExt == AE.getName()) + return StringRef(AE.Feature); + return StringRef(); +} + +StringRef AArch64::getDefaultCPU(StringRef Arch) { + ArchKind AK = parseArch(Arch); + if (AK == ArchKind::INVALID) + return StringRef(); + + // Look for multiple AKs to find the default for pair AK+Name. + for (const auto &CPU : AArch64CPUNames) + if (CPU.ArchID == AK && CPU.Default) + return CPU.getName(); + + // If we can't find a default then target the architecture instead + return "generic"; +} + +void AArch64::fillValidCPUArchList(SmallVectorImpl &Values) { + for (const auto &Arch : AArch64CPUNames) { + if (Arch.ArchID != ArchKind::INVALID) + Values.push_back(Arch.getName()); + } +} + +bool AArch64::isX18ReservedByDefault(const Triple &TT) { + return TT.isAndroid() || TT.isOSDarwin() || TT.isOSFuchsia() || + TT.isOSWindows(); +} + +// Allows partial match, ex. "v8a" matches "armv8a". +AArch64::ArchKind AArch64::parseArch(StringRef Arch) { + Arch = ARM::getCanonicalArchName(Arch); + if (checkArchVersion(Arch) < 8) + return ArchKind::INVALID; + + StringRef Syn = ARM::getArchSynonym(Arch); + for (const auto A : AArch64ARCHNames) { + if (A.getName().endswith(Syn)) + return A.ID; + } + return ArchKind::INVALID; +} + +AArch64::ArchExtKind AArch64::parseArchExt(StringRef ArchExt) { + for (const auto A : AArch64ARCHExtNames) { + if (ArchExt == A.getName()) + return static_cast(A.ID); + } + return AArch64::AEK_INVALID; +} + +AArch64::ArchKind AArch64::parseCPUArch(StringRef CPU) { + for (const auto C : AArch64CPUNames) { + if (CPU == C.getName()) + return C.ArchID; + } + return ArchKind::INVALID; +} diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index f2f5cca946f3706314a97f8c005aa26c90ce5b44..a5f4f98c489ae4f44822a99d17ea1a682c18f351 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -1947,7 +1947,43 @@ APInt APInt::ushl_ov(const APInt &ShAmt, bool &Overflow) const { return *this << ShAmt; } +APInt APInt::sadd_sat(const APInt &RHS) const { + bool Overflow; + APInt Res = sadd_ov(RHS, Overflow); + if (!Overflow) + return Res; + return isNegative() ? APInt::getSignedMinValue(BitWidth) + : APInt::getSignedMaxValue(BitWidth); +} + +APInt APInt::uadd_sat(const APInt &RHS) const { + bool Overflow; + APInt Res = uadd_ov(RHS, Overflow); + if (!Overflow) + return Res; + + return APInt::getMaxValue(BitWidth); +} + +APInt APInt::ssub_sat(const APInt &RHS) const { + bool Overflow; + APInt Res = ssub_ov(RHS, Overflow); + if (!Overflow) + return Res; + + return isNegative() ? APInt::getSignedMinValue(BitWidth) + : APInt::getSignedMaxValue(BitWidth); +} + +APInt APInt::usub_sat(const APInt &RHS) const { + bool Overflow; + APInt Res = usub_ov(RHS, Overflow); + if (!Overflow) + return Res; + + return APInt(BitWidth, 0); +} void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) { diff --git a/lib/Support/ARMTargetParser.cpp b/lib/Support/ARMTargetParser.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07294b0c09a3031bc138072eb56d46c122994c66 --- /dev/null +++ b/lib/Support/ARMTargetParser.cpp @@ -0,0 +1,577 @@ +//===-- ARMTargetParser - Parser for ARM target features --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a target parser to recognise ARM hardware features +// such as FPU/CPU/ARCH/extensions and specific support such as HWDIV. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ARMTargetParser.h" +#include "llvm/ADT/StringSwitch.h" +#include + +using namespace llvm; + +static StringRef getHWDivSynonym(StringRef HWDiv) { + return StringSwitch(HWDiv) + .Case("thumb,arm", "arm,thumb") + .Default(HWDiv); +} + +// Allows partial match, ex. "v7a" matches "armv7a". +ARM::ArchKind ARM::parseArch(StringRef Arch) { + Arch = getCanonicalArchName(Arch); + StringRef Syn = getArchSynonym(Arch); + for (const auto A : ARCHNames) { + if (A.getName().endswith(Syn)) + return A.ID; + } + return ArchKind::INVALID; +} + +// Version number (ex. v7 = 7). +unsigned ARM::parseArchVersion(StringRef Arch) { + Arch = getCanonicalArchName(Arch); + switch (parseArch(Arch)) { + case ArchKind::ARMV2: + case ArchKind::ARMV2A: + return 2; + case ArchKind::ARMV3: + case ArchKind::ARMV3M: + return 3; + case ArchKind::ARMV4: + case ArchKind::ARMV4T: + return 4; + case ArchKind::ARMV5T: + case ArchKind::ARMV5TE: + case ArchKind::IWMMXT: + case ArchKind::IWMMXT2: + case ArchKind::XSCALE: + case ArchKind::ARMV5TEJ: + return 5; + case ArchKind::ARMV6: + case ArchKind::ARMV6K: + case ArchKind::ARMV6T2: + case ArchKind::ARMV6KZ: + case ArchKind::ARMV6M: + return 6; + case ArchKind::ARMV7A: + case ArchKind::ARMV7VE: + case ArchKind::ARMV7R: + case ArchKind::ARMV7M: + case ArchKind::ARMV7S: + case ArchKind::ARMV7EM: + case ArchKind::ARMV7K: + return 7; + case ArchKind::ARMV8A: + case ArchKind::ARMV8_1A: + case ArchKind::ARMV8_2A: + case ArchKind::ARMV8_3A: + case ArchKind::ARMV8_4A: + case ArchKind::ARMV8_5A: + case ArchKind::ARMV8R: + case ArchKind::ARMV8MBaseline: + case ArchKind::ARMV8MMainline: + return 8; + case ArchKind::INVALID: + return 0; + } + llvm_unreachable("Unhandled architecture"); +} + +// Profile A/R/M +ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) { + Arch = getCanonicalArchName(Arch); + switch (parseArch(Arch)) { + case ArchKind::ARMV6M: + case ArchKind::ARMV7M: + case ArchKind::ARMV7EM: + case ArchKind::ARMV8MMainline: + case ArchKind::ARMV8MBaseline: + return ProfileKind::M; + case ArchKind::ARMV7R: + case ArchKind::ARMV8R: + return ProfileKind::R; + case ArchKind::ARMV7A: + case ArchKind::ARMV7VE: + case ArchKind::ARMV7K: + case ArchKind::ARMV8A: + case ArchKind::ARMV8_1A: + case ArchKind::ARMV8_2A: + case ArchKind::ARMV8_3A: + case ArchKind::ARMV8_4A: + case ArchKind::ARMV8_5A: + return ProfileKind::A; + case ArchKind::ARMV2: + case ArchKind::ARMV2A: + case ArchKind::ARMV3: + case ArchKind::ARMV3M: + case ArchKind::ARMV4: + case ArchKind::ARMV4T: + case ArchKind::ARMV5T: + case ArchKind::ARMV5TE: + case ArchKind::ARMV5TEJ: + case ArchKind::ARMV6: + case ArchKind::ARMV6K: + case ArchKind::ARMV6T2: + case ArchKind::ARMV6KZ: + case ArchKind::ARMV7S: + case ArchKind::IWMMXT: + case ArchKind::IWMMXT2: + case ArchKind::XSCALE: + case ArchKind::INVALID: + return ProfileKind::INVALID; + } + llvm_unreachable("Unhandled architecture"); +} + +StringRef ARM::getArchSynonym(StringRef Arch) { + return StringSwitch(Arch) + .Case("v5", "v5t") + .Case("v5e", "v5te") + .Case("v6j", "v6") + .Case("v6hl", "v6k") + .Cases("v6m", "v6sm", "v6s-m", "v6-m") + .Cases("v6z", "v6zk", "v6kz") + .Cases("v7", "v7a", "v7hl", "v7l", "v7-a") + .Case("v7r", "v7-r") + .Case("v7m", "v7-m") + .Case("v7em", "v7e-m") + .Cases("v8", "v8a", "v8l", "aarch64", "arm64", "v8-a") + .Case("v8.1a", "v8.1-a") + .Case("v8.2a", "v8.2-a") + .Case("v8.3a", "v8.3-a") + .Case("v8.4a", "v8.4-a") + .Case("v8.5a", "v8.5-a") + .Case("v8r", "v8-r") + .Case("v8m.base", "v8-m.base") + .Case("v8m.main", "v8-m.main") + .Default(Arch); +} + +bool ARM::getFPUFeatures(unsigned FPUKind, std::vector &Features) { + + if (FPUKind >= FK_LAST || FPUKind == FK_INVALID) + return false; + + // fp-only-sp and d16 subtarget features are independent of each other, so we + // must enable/disable both. + switch (FPUNames[FPUKind].Restriction) { + case FPURestriction::SP_D16: + Features.push_back("+fp-only-sp"); + Features.push_back("+d16"); + break; + case FPURestriction::D16: + Features.push_back("-fp-only-sp"); + Features.push_back("+d16"); + break; + case FPURestriction::None: + Features.push_back("-fp-only-sp"); + Features.push_back("-d16"); + break; + } + + // FPU version subtarget features are inclusive of lower-numbered ones, so + // enable the one corresponding to this version and disable all that are + // higher. We also have to make sure to disable fp16 when vfp4 is disabled, + // as +vfp4 implies +fp16 but -vfp4 does not imply -fp16. + switch (FPUNames[FPUKind].FPUVer) { + case FPUVersion::VFPV5: + Features.push_back("+fp-armv8"); + break; + case FPUVersion::VFPV4: + Features.push_back("+vfp4"); + Features.push_back("-fp-armv8"); + break; + case FPUVersion::VFPV3_FP16: + Features.push_back("+vfp3"); + Features.push_back("+fp16"); + Features.push_back("-vfp4"); + Features.push_back("-fp-armv8"); + break; + case FPUVersion::VFPV3: + Features.push_back("+vfp3"); + Features.push_back("-fp16"); + Features.push_back("-vfp4"); + Features.push_back("-fp-armv8"); + break; + case FPUVersion::VFPV2: + Features.push_back("+vfp2"); + Features.push_back("-vfp3"); + Features.push_back("-fp16"); + Features.push_back("-vfp4"); + Features.push_back("-fp-armv8"); + break; + case FPUVersion::NONE: + Features.push_back("-vfp2"); + Features.push_back("-vfp3"); + Features.push_back("-fp16"); + Features.push_back("-vfp4"); + Features.push_back("-fp-armv8"); + break; + } + + // crypto includes neon, so we handle this similarly to FPU version. + switch (FPUNames[FPUKind].NeonSupport) { + case NeonSupportLevel::Crypto: + Features.push_back("+neon"); + Features.push_back("+crypto"); + break; + case NeonSupportLevel::Neon: + Features.push_back("+neon"); + Features.push_back("-crypto"); + break; + case NeonSupportLevel::None: + Features.push_back("-neon"); + Features.push_back("-crypto"); + break; + } + + return true; +} + +// Little/Big endian +ARM::EndianKind ARM::parseArchEndian(StringRef Arch) { + if (Arch.startswith("armeb") || Arch.startswith("thumbeb") || + Arch.startswith("aarch64_be")) + return EndianKind::BIG; + + if (Arch.startswith("arm") || Arch.startswith("thumb")) { + if (Arch.endswith("eb")) + return EndianKind::BIG; + else + return EndianKind::LITTLE; + } + + if (Arch.startswith("aarch64")) + return EndianKind::LITTLE; + + return EndianKind::INVALID; +} + +// ARM, Thumb, AArch64 +ARM::ISAKind ARM::parseArchISA(StringRef Arch) { + return StringSwitch(Arch) + .StartsWith("aarch64", ISAKind::AARCH64) + .StartsWith("arm64", ISAKind::AARCH64) + .StartsWith("thumb", ISAKind::THUMB) + .StartsWith("arm", ISAKind::ARM) + .Default(ISAKind::INVALID); +} + +unsigned ARM::parseFPU(StringRef FPU) { + StringRef Syn = getFPUSynonym(FPU); + for (const auto F : FPUNames) { + if (Syn == F.getName()) + return F.ID; + } + return FK_INVALID; +} + +ARM::NeonSupportLevel ARM::getFPUNeonSupportLevel(unsigned FPUKind) { + if (FPUKind >= FK_LAST) + return NeonSupportLevel::None; + return FPUNames[FPUKind].NeonSupport; +} + +// MArch is expected to be of the form (arm|thumb)?(eb)?(v.+)?(eb)?, but +// (iwmmxt|xscale)(eb)? is also permitted. If the former, return +// "v.+", if the latter, return unmodified string, minus 'eb'. +// If invalid, return empty string. +StringRef ARM::getCanonicalArchName(StringRef Arch) { + size_t offset = StringRef::npos; + StringRef A = Arch; + StringRef Error = ""; + + // Begins with "arm" / "thumb", move past it. + if (A.startswith("arm64")) + offset = 5; + else if (A.startswith("arm")) + offset = 3; + else if (A.startswith("thumb")) + offset = 5; + else if (A.startswith("aarch64")) { + offset = 7; + // AArch64 uses "_be", not "eb" suffix. + if (A.find("eb") != StringRef::npos) + return Error; + if (A.substr(offset, 3) == "_be") + offset += 3; + } + + // Ex. "armebv7", move past the "eb". + if (offset != StringRef::npos && A.substr(offset, 2) == "eb") + offset += 2; + // Or, if it ends with eb ("armv7eb"), chop it off. + else if (A.endswith("eb")) + A = A.substr(0, A.size() - 2); + // Trim the head + if (offset != StringRef::npos) + A = A.substr(offset); + + // Empty string means offset reached the end, which means it's valid. + if (A.empty()) + return Arch; + + // Only match non-marketing names + if (offset != StringRef::npos) { + // Must start with 'vN'. + if (A.size() >= 2 && (A[0] != 'v' || !std::isdigit(A[1]))) + return Error; + // Can't have an extra 'eb'. + if (A.find("eb") != StringRef::npos) + return Error; + } + + // Arch will either be a 'v' name (v7a) or a marketing name (xscale). + return A; +} + +StringRef ARM::getFPUSynonym(StringRef FPU) { + return StringSwitch(FPU) + .Cases("fpa", "fpe2", "fpe3", "maverick", "invalid") // Unsupported + .Case("vfp2", "vfpv2") + .Case("vfp3", "vfpv3") + .Case("vfp4", "vfpv4") + .Case("vfp3-d16", "vfpv3-d16") + .Case("vfp4-d16", "vfpv4-d16") + .Cases("fp4-sp-d16", "vfpv4-sp-d16", "fpv4-sp-d16") + .Cases("fp4-dp-d16", "fpv4-dp-d16", "vfpv4-d16") + .Case("fp5-sp-d16", "fpv5-sp-d16") + .Cases("fp5-dp-d16", "fpv5-dp-d16", "fpv5-d16") + // FIXME: Clang uses it, but it's bogus, since neon defaults to vfpv3. + .Case("neon-vfpv3", "neon") + .Default(FPU); +} + +StringRef ARM::getFPUName(unsigned FPUKind) { + if (FPUKind >= FK_LAST) + return StringRef(); + return FPUNames[FPUKind].getName(); +} + +ARM::FPUVersion ARM::getFPUVersion(unsigned FPUKind) { + if (FPUKind >= FK_LAST) + return FPUVersion::NONE; + return FPUNames[FPUKind].FPUVer; +} + +ARM::FPURestriction ARM::getFPURestriction(unsigned FPUKind) { + if (FPUKind >= FK_LAST) + return FPURestriction::None; + return FPUNames[FPUKind].Restriction; +} + +unsigned ARM::getDefaultFPU(StringRef CPU, ARM::ArchKind AK) { + if (CPU == "generic") + return ARM::ARCHNames[static_cast(AK)].DefaultFPU; + + return StringSwitch(CPU) +#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + .Case(NAME, DEFAULT_FPU) +#include "llvm/Support/ARMTargetParser.def" + .Default(ARM::FK_INVALID); +} + +unsigned ARM::getDefaultExtensions(StringRef CPU, ARM::ArchKind AK) { + if (CPU == "generic") + return ARM::ARCHNames[static_cast(AK)].ArchBaseExtensions; + + return StringSwitch(CPU) +#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + .Case(NAME, \ + ARCHNames[static_cast(ArchKind::ID)].ArchBaseExtensions | \ + DEFAULT_EXT) +#include "llvm/Support/ARMTargetParser.def" + .Default(ARM::AEK_INVALID); +} + +bool ARM::getHWDivFeatures(unsigned HWDivKind, + std::vector &Features) { + + if (HWDivKind == AEK_INVALID) + return false; + + if (HWDivKind & AEK_HWDIVARM) + Features.push_back("+hwdiv-arm"); + else + Features.push_back("-hwdiv-arm"); + + if (HWDivKind & AEK_HWDIVTHUMB) + Features.push_back("+hwdiv"); + else + Features.push_back("-hwdiv"); + + return true; +} + +bool ARM::getExtensionFeatures(unsigned Extensions, + std::vector &Features) { + + if (Extensions == AEK_INVALID) + return false; + + if (Extensions & AEK_CRC) + Features.push_back("+crc"); + else + Features.push_back("-crc"); + + if (Extensions & AEK_DSP) + Features.push_back("+dsp"); + else + Features.push_back("-dsp"); + + if (Extensions & AEK_FP16FML) + Features.push_back("+fp16fml"); + else + Features.push_back("-fp16fml"); + + if (Extensions & AEK_RAS) + Features.push_back("+ras"); + else + Features.push_back("-ras"); + + if (Extensions & AEK_DOTPROD) + Features.push_back("+dotprod"); + else + Features.push_back("-dotprod"); + + return getHWDivFeatures(Extensions, Features); +} + +StringRef ARM::getArchName(ARM::ArchKind AK) { + return ARCHNames[static_cast(AK)].getName(); +} + +StringRef ARM::getCPUAttr(ARM::ArchKind AK) { + return ARCHNames[static_cast(AK)].getCPUAttr(); +} + +StringRef ARM::getSubArch(ARM::ArchKind AK) { + return ARCHNames[static_cast(AK)].getSubArch(); +} + +unsigned ARM::getArchAttr(ARM::ArchKind AK) { + return ARCHNames[static_cast(AK)].ArchAttr; +} + +StringRef ARM::getArchExtName(unsigned ArchExtKind) { + for (const auto AE : ARCHExtNames) { + if (ArchExtKind == AE.ID) + return AE.getName(); + } + return StringRef(); +} + +StringRef ARM::getArchExtFeature(StringRef ArchExt) { + if (ArchExt.startswith("no")) { + StringRef ArchExtBase(ArchExt.substr(2)); + for (const auto AE : ARCHExtNames) { + if (AE.NegFeature && ArchExtBase == AE.getName()) + return StringRef(AE.NegFeature); + } + } + for (const auto AE : ARCHExtNames) { + if (AE.Feature && ArchExt == AE.getName()) + return StringRef(AE.Feature); + } + + return StringRef(); +} + +StringRef ARM::getHWDivName(unsigned HWDivKind) { + for (const auto D : HWDivNames) { + if (HWDivKind == D.ID) + return D.getName(); + } + return StringRef(); +} + +StringRef ARM::getDefaultCPU(StringRef Arch) { + ArchKind AK = parseArch(Arch); + if (AK == ArchKind::INVALID) + return StringRef(); + + // Look for multiple AKs to find the default for pair AK+Name. + for (const auto CPU : CPUNames) { + if (CPU.ArchID == AK && CPU.Default) + return CPU.getName(); + } + + // If we can't find a default then target the architecture instead + return "generic"; +} + +unsigned ARM::parseHWDiv(StringRef HWDiv) { + StringRef Syn = getHWDivSynonym(HWDiv); + for (const auto D : HWDivNames) { + if (Syn == D.getName()) + return D.ID; + } + return AEK_INVALID; +} + +unsigned ARM::parseArchExt(StringRef ArchExt) { + for (const auto A : ARCHExtNames) { + if (ArchExt == A.getName()) + return A.ID; + } + return AEK_INVALID; +} + +ARM::ArchKind ARM::parseCPUArch(StringRef CPU) { + for (const auto C : CPUNames) { + if (CPU == C.getName()) + return C.ArchID; + } + return ArchKind::INVALID; +} + +void ARM::fillValidCPUArchList(SmallVectorImpl &Values) { + for (const CpuNames &Arch : CPUNames) { + if (Arch.ArchID != ArchKind::INVALID) + Values.push_back(Arch.getName()); + } +} + +StringRef ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) { + StringRef ArchName = + CPU.empty() ? TT.getArchName() : getArchName(parseCPUArch(CPU)); + + if (TT.isOSBinFormatMachO()) { + if (TT.getEnvironment() == Triple::EABI || + TT.getOS() == Triple::UnknownOS || + parseArchProfile(ArchName) == ProfileKind::M) + return "aapcs"; + if (TT.isWatchABI()) + return "aapcs16"; + return "apcs-gnu"; + } else if (TT.isOSWindows()) + // FIXME: this is invalid for WindowsCE. + return "aapcs"; + + // Select the default based on the platform. + switch (TT.getEnvironment()) { + case Triple::Android: + case Triple::GNUEABI: + case Triple::GNUEABIHF: + case Triple::MuslEABI: + case Triple::MuslEABIHF: + return "aapcs-linux"; + case Triple::EABIHF: + case Triple::EABI: + return "aapcs"; + default: + if (TT.isOSNetBSD()) + return "apcs-gnu"; + if (TT.isOSOpenBSD()) + return "aapcs-linux"; + return "aapcs"; + } +} diff --git a/lib/Support/BuryPointer.cpp b/lib/Support/BuryPointer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6c988b4a0ab263d79e76e5ba397647859cfe7e1c --- /dev/null +++ b/lib/Support/BuryPointer.cpp @@ -0,0 +1,31 @@ +//===- BuryPointer.cpp - Memory Manipulation/Leak ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/BuryPointer.h" +#include "llvm/Support/Compiler.h" +#include + +namespace llvm { + +void BuryPointer(const void *Ptr) { + // This function may be called only a small fixed amount of times per each + // invocation, otherwise we do actually have a leak which we want to report. + // If this function is called more than kGraveYardMaxSize times, the pointers + // will not be properly buried and a leak detector will report a leak, which + // is what we want in such case. + static const size_t kGraveYardMaxSize = 16; + LLVM_ATTRIBUTE_UNUSED static const void *GraveYard[kGraveYardMaxSize]; + static std::atomic GraveYardSize; + unsigned Idx = GraveYardSize++; + if (Idx >= kGraveYardMaxSize) + return; + GraveYard[Idx] = Ptr; +} + +} diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index 1446359f0cfef57e235fd765d57845f128d09939..2a6810672b1da92a5146df21f5b761197ba1f29f 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -45,6 +45,8 @@ if (MSVC) endif() add_llvm_library(LLVMSupport + AArch64TargetParser.cpp + ARMTargetParser.cpp AMDGPUMetadata.cpp APFloat.cpp APInt.cpp @@ -59,6 +61,7 @@ add_llvm_library(LLVMSupport BinaryStreamWriter.cpp BlockFrequency.cpp BranchProbability.cpp + BuryPointer.cpp CachePruning.cpp circular_raw_ostream.cpp Chrono.cpp diff --git a/lib/Support/Error.cpp b/lib/Support/Error.cpp index ad2443db9af596d304976f38f39356745388e781..0b0be8d48fa7a72b9088293b08b598ef6de4c929 100644 --- a/lib/Support/Error.cpp +++ b/lib/Support/Error.cpp @@ -141,7 +141,7 @@ void report_fatal_error(Error Err, bool GenCrashDiag) { std::string ErrMsg; { raw_string_ostream ErrStream(ErrMsg); - logAllUnhandledErrors(std::move(Err), ErrStream, ""); + logAllUnhandledErrors(std::move(Err), ErrStream); } report_fatal_error(ErrMsg); } diff --git a/lib/Support/FileCheck.cpp b/lib/Support/FileCheck.cpp index 386fa80b8b9d3adc5aaf6707d584e48916f3fa04..37986c96c0817f600ef34b16aecd5b93643f28ea 100644 --- a/lib/Support/FileCheck.cpp +++ b/lib/Support/FileCheck.cpp @@ -16,8 +16,12 @@ #include "llvm/Support/FileCheck.h" #include "llvm/ADT/StringSet.h" +#include "llvm/Support/FormatVariadic.h" +#include #include #include +#include +#include using namespace llvm; @@ -408,9 +412,28 @@ void FileCheckPattern::PrintVariableUses(const SourceMgr &SM, StringRef Buffer, } } +static SMRange ProcessMatchResult(FileCheckDiag::MatchType MatchTy, + const SourceMgr &SM, SMLoc Loc, + Check::FileCheckType CheckTy, + StringRef Buffer, size_t Pos, size_t Len, + std::vector *Diags, + bool AdjustPrevDiag = false) { + SMLoc Start = SMLoc::getFromPointer(Buffer.data() + Pos); + SMLoc End = SMLoc::getFromPointer(Buffer.data() + Pos + Len); + SMRange Range(Start, End); + if (Diags) { + if (AdjustPrevDiag) + Diags->rbegin()->MatchTy = MatchTy; + else + Diags->emplace_back(SM, CheckTy, Loc, MatchTy, Range); + } + return Range; +} + void FileCheckPattern::PrintFuzzyMatch( const SourceMgr &SM, StringRef Buffer, - const StringMap &VariableTable) const { + const StringMap &VariableTable, + std::vector *Diags) const { // Attempt to find the closest/best fuzzy match. Usually an error happens // because some string in the output didn't exactly match. In these cases, we // would like to show the user a best guess at what "should have" matched, to @@ -444,8 +467,11 @@ void FileCheckPattern::PrintFuzzyMatch( // reasonable and not equal to what we showed in the "scanning from here" // line. if (Best && Best != StringRef::npos && BestQuality < 50) { - SM.PrintMessage(SMLoc::getFromPointer(Buffer.data() + Best), - SourceMgr::DK_Note, "possible intended match here"); + SMRange MatchRange = + ProcessMatchResult(FileCheckDiag::MatchFuzzy, SM, getLoc(), + getCheckTy(), Buffer, Best, 0, Diags); + SM.PrintMessage(MatchRange.Start, SourceMgr::DK_Note, + "possible intended match here"); // FIXME: If we wanted to be really friendly we would show why the match // failed, as it can be hard to spot simple one character differences. @@ -527,51 +553,42 @@ llvm::FileCheck::CanonicalizeFile(MemoryBuffer &MB, return StringRef(OutputBuffer.data(), OutputBuffer.size() - 1); } +FileCheckDiag::FileCheckDiag(const SourceMgr &SM, + const Check::FileCheckType &CheckTy, + SMLoc CheckLoc, MatchType MatchTy, + SMRange InputRange) + : CheckTy(CheckTy), MatchTy(MatchTy) { + auto Start = SM.getLineAndColumn(InputRange.Start); + auto End = SM.getLineAndColumn(InputRange.End); + InputStartLine = Start.first; + InputStartCol = Start.second; + InputEndLine = End.first; + InputEndCol = End.second; + Start = SM.getLineAndColumn(CheckLoc); + CheckLine = Start.first; + CheckCol = Start.second; +} + static bool IsPartOfWord(char c) { return (isalnum(c) || c == '-' || c == '_'); } -// Get the size of the prefix extension. -static size_t CheckTypeSize(Check::FileCheckType Ty) { - switch (Ty) { - case Check::CheckNone: - case Check::CheckBadNot: - return 0; - - case Check::CheckPlain: - return sizeof(":") - 1; - - case Check::CheckNext: - return sizeof("-NEXT:") - 1; - - case Check::CheckSame: - return sizeof("-SAME:") - 1; - - case Check::CheckNot: - return sizeof("-NOT:") - 1; - - case Check::CheckDAG: - return sizeof("-DAG:") - 1; - - case Check::CheckLabel: - return sizeof("-LABEL:") - 1; - - case Check::CheckEmpty: - return sizeof("-EMPTY:") - 1; - - case Check::CheckEOF: - llvm_unreachable("Should not be using EOF size"); - } - - llvm_unreachable("Bad check type"); +Check::FileCheckType &Check::FileCheckType::setCount(int C) { + assert(Count > 0 && "zero and negative counts are not supported"); + assert((C == 1 || Kind == CheckPlain) && + "count supported only for plain CHECK directives"); + Count = C; + return *this; } // Get a description of the type. -static std::string CheckTypeName(StringRef Prefix, Check::FileCheckType Ty) { - switch (Ty) { +std::string Check::FileCheckType::getDescription(StringRef Prefix) const { + switch (Kind) { case Check::CheckNone: return "invalid"; case Check::CheckPlain: + if (Count > 1) + return Prefix.str() + "-COUNT"; return Prefix; case Check::CheckNext: return Prefix.str() + "-NEXT"; @@ -589,50 +606,65 @@ static std::string CheckTypeName(StringRef Prefix, Check::FileCheckType Ty) { return "implicit EOF"; case Check::CheckBadNot: return "bad NOT"; + case Check::CheckBadCount: + return "bad COUNT"; } llvm_unreachable("unknown FileCheckType"); } -static Check::FileCheckType FindCheckType(StringRef Buffer, StringRef Prefix) { +static std::pair +FindCheckType(StringRef Buffer, StringRef Prefix) { if (Buffer.size() <= Prefix.size()) - return Check::CheckNone; + return {Check::CheckNone, StringRef()}; char NextChar = Buffer[Prefix.size()]; + StringRef Rest = Buffer.drop_front(Prefix.size() + 1); // Verify that the : is present after the prefix. if (NextChar == ':') - return Check::CheckPlain; + return {Check::CheckPlain, Rest}; if (NextChar != '-') - return Check::CheckNone; + return {Check::CheckNone, StringRef()}; + + if (Rest.consume_front("COUNT-")) { + int64_t Count; + if (Rest.consumeInteger(10, Count)) + // Error happened in parsing integer. + return {Check::CheckBadCount, Rest}; + if (Count <= 0 || Count > INT32_MAX) + return {Check::CheckBadCount, Rest}; + if (!Rest.consume_front(":")) + return {Check::CheckBadCount, Rest}; + return {Check::FileCheckType(Check::CheckPlain).setCount(Count), Rest}; + } - StringRef Rest = Buffer.drop_front(Prefix.size() + 1); - if (Rest.startswith("NEXT:")) - return Check::CheckNext; + if (Rest.consume_front("NEXT:")) + return {Check::CheckNext, Rest}; - if (Rest.startswith("SAME:")) - return Check::CheckSame; + if (Rest.consume_front("SAME:")) + return {Check::CheckSame, Rest}; - if (Rest.startswith("NOT:")) - return Check::CheckNot; + if (Rest.consume_front("NOT:")) + return {Check::CheckNot, Rest}; - if (Rest.startswith("DAG:")) - return Check::CheckDAG; + if (Rest.consume_front("DAG:")) + return {Check::CheckDAG, Rest}; - if (Rest.startswith("LABEL:")) - return Check::CheckLabel; + if (Rest.consume_front("LABEL:")) + return {Check::CheckLabel, Rest}; - if (Rest.startswith("EMPTY:")) - return Check::CheckEmpty; + if (Rest.consume_front("EMPTY:")) + return {Check::CheckEmpty, Rest}; // You can't combine -NOT with another suffix. if (Rest.startswith("DAG-NOT:") || Rest.startswith("NOT-DAG:") || Rest.startswith("NEXT-NOT:") || Rest.startswith("NOT-NEXT:") || Rest.startswith("SAME-NOT:") || Rest.startswith("NOT-SAME:") || Rest.startswith("EMPTY-NOT:") || Rest.startswith("NOT-EMPTY:")) - return Check::CheckBadNot; + return {Check::CheckBadNot, Rest}; - return Check::CheckNone; + return {Check::CheckNone, Rest}; } // From the given position, find the next character after the word. @@ -651,8 +683,12 @@ static size_t SkipWord(StringRef Str, size_t Loc) { /// 2) The found prefix must be followed by a valid check type suffix using \c /// FindCheckType above. /// -/// The first match of the regular expression to satisfy these two is returned, -/// otherwise an empty StringRef is returned to indicate failure. +/// Returns a pair of StringRefs into the Buffer, which combines: +/// - the first match of the regular expression to satisfy these two is +/// returned, +/// otherwise an empty StringRef is returned to indicate failure. +/// - buffer rewound to the location right after parsed suffix, for parsing +/// to continue from /// /// If this routine returns a valid prefix, it will also shrink \p Buffer to /// start at the beginning of the returned prefix, increment \p LineNumber for @@ -661,16 +697,16 @@ static size_t SkipWord(StringRef Str, size_t Loc) { /// /// If no valid prefix is found, the state of Buffer, LineNumber, and CheckTy /// is unspecified. -static StringRef FindFirstMatchingPrefix(Regex &PrefixRE, StringRef &Buffer, - unsigned &LineNumber, - Check::FileCheckType &CheckTy) { +static std::pair +FindFirstMatchingPrefix(Regex &PrefixRE, StringRef &Buffer, + unsigned &LineNumber, Check::FileCheckType &CheckTy) { SmallVector Matches; while (!Buffer.empty()) { // Find the first (longest) match using the RE. if (!PrefixRE.match(Buffer, &Matches)) // No match at all, bail. - return StringRef(); + return {StringRef(), StringRef()}; StringRef Prefix = Matches[0]; Matches.clear(); @@ -690,11 +726,12 @@ static StringRef FindFirstMatchingPrefix(Regex &PrefixRE, StringRef &Buffer, // intentional and unintentional uses of this feature. if (Skipped.empty() || !IsPartOfWord(Skipped.back())) { // Now extract the type. - CheckTy = FindCheckType(Buffer, Prefix); + StringRef AfterSuffix; + std::tie(CheckTy, AfterSuffix) = FindCheckType(Buffer, Prefix); // If we've found a valid check type for this prefix, we're done. if (CheckTy != Check::CheckNone) - return Prefix; + return {Prefix, AfterSuffix}; } // If we didn't successfully find a prefix, we need to skip this invalid @@ -704,7 +741,7 @@ static StringRef FindFirstMatchingPrefix(Regex &PrefixRE, StringRef &Buffer, } // We ran out of buffer while skipping partial matches so give up. - return StringRef(); + return {StringRef(), StringRef()}; } /// Read the check file, which specifies the sequence of expected strings. @@ -742,19 +779,26 @@ bool llvm::FileCheck::ReadCheckFile(SourceMgr &SM, StringRef Buffer, Check::FileCheckType CheckTy; // See if a prefix occurs in the memory buffer. - StringRef UsedPrefix = FindFirstMatchingPrefix(PrefixRE, Buffer, LineNumber, - CheckTy); + StringRef UsedPrefix; + StringRef AfterSuffix; + std::tie(UsedPrefix, AfterSuffix) = + FindFirstMatchingPrefix(PrefixRE, Buffer, LineNumber, CheckTy); if (UsedPrefix.empty()) break; assert(UsedPrefix.data() == Buffer.data() && "Failed to move Buffer's start forward, or pointed prefix outside " "of the buffer!"); + assert(AfterSuffix.data() >= Buffer.data() && + AfterSuffix.data() < Buffer.data() + Buffer.size() && + "Parsing after suffix doesn't start inside of buffer!"); // Location to use for error messages. const char *UsedPrefixStart = UsedPrefix.data(); - // Skip the buffer to the end. - Buffer = Buffer.drop_front(UsedPrefix.size() + CheckTypeSize(CheckTy)); + // Skip the buffer to the end of parsed suffix (or just prefix, if no good + // suffix was processed). + Buffer = AfterSuffix.empty() ? Buffer.drop_front(UsedPrefix.size()) + : AfterSuffix; // Complain about useful-looking but unsupported suffixes. if (CheckTy == Check::CheckBadNot) { @@ -763,6 +807,14 @@ bool llvm::FileCheck::ReadCheckFile(SourceMgr &SM, StringRef Buffer, return true; } + // Complain about invalid count specification. + if (CheckTy == Check::CheckBadCount) { + SM.PrintMessage(SMLoc::getFromPointer(Buffer.data()), SourceMgr::DK_Error, + "invalid count in -COUNT specification on prefix '" + + UsedPrefix + "'"); + return true; + } + // Okay, we found the prefix, yay. Remember the rest of the line, but ignore // leading whitespace. if (!(Req.NoCanonicalizeWhiteSpace && Req.MatchFullLines)) @@ -845,69 +897,87 @@ bool llvm::FileCheck::ReadCheckFile(SourceMgr &SM, StringRef Buffer, static void PrintMatch(bool ExpectedMatch, const SourceMgr &SM, StringRef Prefix, SMLoc Loc, const FileCheckPattern &Pat, - StringRef Buffer, StringMap &VariableTable, - size_t MatchPos, size_t MatchLen, - const FileCheckRequest &Req) { + int MatchedCount, StringRef Buffer, + StringMap &VariableTable, size_t MatchPos, + size_t MatchLen, const FileCheckRequest &Req, + std::vector *Diags) { if (ExpectedMatch) { if (!Req.Verbose) return; if (!Req.VerboseVerbose && Pat.getCheckTy() == Check::CheckEOF) return; } - SMLoc MatchStart = SMLoc::getFromPointer(Buffer.data() + MatchPos); - SMLoc MatchEnd = SMLoc::getFromPointer(Buffer.data() + MatchPos + MatchLen); - SMRange MatchRange(MatchStart, MatchEnd); + SMRange MatchRange = ProcessMatchResult( + ExpectedMatch ? FileCheckDiag::MatchFoundAndExpected + : FileCheckDiag::MatchFoundButExcluded, + SM, Loc, Pat.getCheckTy(), Buffer, MatchPos, MatchLen, Diags); + std::string Message = formatv("{0}: {1} string found in input", + Pat.getCheckTy().getDescription(Prefix), + (ExpectedMatch ? "expected" : "excluded")) + .str(); + if (Pat.getCount() > 1) + Message += formatv(" ({0} out of {1})", MatchedCount, Pat.getCount()).str(); + SM.PrintMessage( - Loc, ExpectedMatch ? SourceMgr::DK_Remark : SourceMgr::DK_Error, - CheckTypeName(Prefix, Pat.getCheckTy()) + ": " + - (ExpectedMatch ? "expected" : "excluded") + - " string found in input"); - SM.PrintMessage(MatchStart, SourceMgr::DK_Note, "found here", {MatchRange}); + Loc, ExpectedMatch ? SourceMgr::DK_Remark : SourceMgr::DK_Error, Message); + SM.PrintMessage(MatchRange.Start, SourceMgr::DK_Note, "found here", + {MatchRange}); Pat.PrintVariableUses(SM, Buffer, VariableTable, MatchRange); } static void PrintMatch(bool ExpectedMatch, const SourceMgr &SM, - const FileCheckString &CheckStr, StringRef Buffer, - StringMap &VariableTable, size_t MatchPos, - size_t MatchLen, FileCheckRequest &Req) { + const FileCheckString &CheckStr, int MatchedCount, + StringRef Buffer, StringMap &VariableTable, + size_t MatchPos, size_t MatchLen, FileCheckRequest &Req, + std::vector *Diags) { PrintMatch(ExpectedMatch, SM, CheckStr.Prefix, CheckStr.Loc, CheckStr.Pat, - Buffer, VariableTable, MatchPos, MatchLen, Req); + MatchedCount, Buffer, VariableTable, MatchPos, MatchLen, Req, + Diags); } static void PrintNoMatch(bool ExpectedMatch, const SourceMgr &SM, - StringRef Prefix, SMLoc Loc, const FileCheckPattern &Pat, - StringRef Buffer, - StringMap &VariableTable, - bool VerboseVerbose) { + StringRef Prefix, SMLoc Loc, + const FileCheckPattern &Pat, int MatchedCount, + StringRef Buffer, StringMap &VariableTable, + bool VerboseVerbose, + std::vector *Diags) { if (!ExpectedMatch && !VerboseVerbose) return; // Otherwise, we have an error, emit an error message. - SM.PrintMessage(Loc, - ExpectedMatch ? SourceMgr::DK_Error : SourceMgr::DK_Remark, - CheckTypeName(Prefix, Pat.getCheckTy()) + ": " + - (ExpectedMatch ? "expected" : "excluded") + - " string not found in input"); + std::string Message = formatv("{0}: {1} string not found in input", + Pat.getCheckTy().getDescription(Prefix), + (ExpectedMatch ? "expected" : "excluded")) + .str(); + if (Pat.getCount() > 1) + Message += formatv(" ({0} out of {1})", MatchedCount, Pat.getCount()).str(); + + SM.PrintMessage( + Loc, ExpectedMatch ? SourceMgr::DK_Error : SourceMgr::DK_Remark, Message); // Print the "scanning from here" line. If the current position is at the // end of a line, advance to the start of the next line. Buffer = Buffer.substr(Buffer.find_first_not_of(" \t\n\r")); - - SM.PrintMessage(SMLoc::getFromPointer(Buffer.data()), SourceMgr::DK_Note, - "scanning from here"); + SMRange SearchRange = ProcessMatchResult( + ExpectedMatch ? FileCheckDiag::MatchNoneButExpected + : FileCheckDiag::MatchNoneAndExcluded, + SM, Loc, Pat.getCheckTy(), Buffer, 0, Buffer.size(), Diags); + SM.PrintMessage(SearchRange.Start, SourceMgr::DK_Note, "scanning from here"); // Allow the pattern to print additional information if desired. Pat.PrintVariableUses(SM, Buffer, VariableTable); + if (ExpectedMatch) - Pat.PrintFuzzyMatch(SM, Buffer, VariableTable); + Pat.PrintFuzzyMatch(SM, Buffer, VariableTable, Diags); } static void PrintNoMatch(bool ExpectedMatch, const SourceMgr &SM, - const FileCheckString &CheckStr, StringRef Buffer, - StringMap &VariableTable, - bool VerboseVerbose) { + const FileCheckString &CheckStr, int MatchedCount, + StringRef Buffer, StringMap &VariableTable, + bool VerboseVerbose, + std::vector *Diags) { PrintNoMatch(ExpectedMatch, SM, CheckStr.Prefix, CheckStr.Loc, CheckStr.Pat, - Buffer, VariableTable, VerboseVerbose); + MatchedCount, Buffer, VariableTable, VerboseVerbose, Diags); } /// Count the number of newlines in the specified range. @@ -935,9 +1005,10 @@ static unsigned CountNumNewlinesBetween(StringRef Range, /// Match check string and its "not strings" and/or "dag strings". size_t FileCheckString::Check(const SourceMgr &SM, StringRef Buffer, - bool IsLabelScanMode, size_t &MatchLen, - StringMap &VariableTable, - FileCheckRequest &Req) const { + bool IsLabelScanMode, size_t &MatchLen, + StringMap &VariableTable, + FileCheckRequest &Req, + std::vector *Diags) const { size_t LastPos = 0; std::vector NotStrings; @@ -947,42 +1018,72 @@ size_t FileCheckString::Check(const SourceMgr &SM, StringRef Buffer, // over the block again (including the last CHECK-LABEL) in normal mode. if (!IsLabelScanMode) { // Match "dag strings" (with mixed "not strings" if any). - LastPos = CheckDag(SM, Buffer, NotStrings, VariableTable, Req); + LastPos = CheckDag(SM, Buffer, NotStrings, VariableTable, Req, Diags); if (LastPos == StringRef::npos) return StringRef::npos; } // Match itself from the last position after matching CHECK-DAG. - StringRef MatchBuffer = Buffer.substr(LastPos); - size_t MatchPos = Pat.Match(MatchBuffer, MatchLen, VariableTable); - if (MatchPos == StringRef::npos) { - PrintNoMatch(true, SM, *this, MatchBuffer, VariableTable, Req.VerboseVerbose); - return StringRef::npos; + size_t LastMatchEnd = LastPos; + size_t FirstMatchPos = 0; + // Go match the pattern Count times. Majority of patterns only match with + // count 1 though. + assert(Pat.getCount() != 0 && "pattern count can not be zero"); + for (int i = 1; i <= Pat.getCount(); i++) { + StringRef MatchBuffer = Buffer.substr(LastMatchEnd); + size_t CurrentMatchLen; + // get a match at current start point + size_t MatchPos = Pat.Match(MatchBuffer, CurrentMatchLen, VariableTable); + if (i == 1) + FirstMatchPos = LastPos + MatchPos; + + // report + if (MatchPos == StringRef::npos) { + PrintNoMatch(true, SM, *this, i, MatchBuffer, VariableTable, + Req.VerboseVerbose, Diags); + return StringRef::npos; + } + PrintMatch(true, SM, *this, i, MatchBuffer, VariableTable, MatchPos, + CurrentMatchLen, Req, Diags); + + // move start point after the match + LastMatchEnd += MatchPos + CurrentMatchLen; } - PrintMatch(true, SM, *this, MatchBuffer, VariableTable, MatchPos, MatchLen, Req); + // Full match len counts from first match pos. + MatchLen = LastMatchEnd - FirstMatchPos; // Similar to the above, in "label-scan mode" we can't yet handle CHECK-NEXT // or CHECK-NOT if (!IsLabelScanMode) { + size_t MatchPos = FirstMatchPos - LastPos; + StringRef MatchBuffer = Buffer.substr(LastPos); StringRef SkippedRegion = Buffer.substr(LastPos, MatchPos); // If this check is a "CHECK-NEXT", verify that the previous match was on // the previous line (i.e. that there is one newline between them). - if (CheckNext(SM, SkippedRegion)) + if (CheckNext(SM, SkippedRegion)) { + ProcessMatchResult(FileCheckDiag::MatchFoundButWrongLine, SM, Loc, + Pat.getCheckTy(), MatchBuffer, MatchPos, MatchLen, + Diags, Req.Verbose); return StringRef::npos; + } // If this check is a "CHECK-SAME", verify that the previous match was on // the same line (i.e. that there is no newline between them). - if (CheckSame(SM, SkippedRegion)) + if (CheckSame(SM, SkippedRegion)) { + ProcessMatchResult(FileCheckDiag::MatchFoundButWrongLine, SM, Loc, + Pat.getCheckTy(), MatchBuffer, MatchPos, MatchLen, + Diags, Req.Verbose); return StringRef::npos; + } // If this match had "not strings", verify that they don't exist in the // skipped region. - if (CheckNot(SM, SkippedRegion, NotStrings, VariableTable, Req)) + if (CheckNot(SM, SkippedRegion, NotStrings, VariableTable, Req, Diags)) return StringRef::npos; } - return LastPos + MatchPos; + return FirstMatchPos; } /// Verify there is a single line in the given buffer. @@ -1061,10 +1162,11 @@ bool FileCheckString::CheckSame(const SourceMgr &SM, StringRef Buffer) const { } /// Verify there's no "not strings" in the given buffer. -bool FileCheckString::CheckNot(const SourceMgr &SM, StringRef Buffer, - const std::vector &NotStrings, - StringMap &VariableTable, - const FileCheckRequest &Req) const { +bool FileCheckString::CheckNot( + const SourceMgr &SM, StringRef Buffer, + const std::vector &NotStrings, + StringMap &VariableTable, const FileCheckRequest &Req, + std::vector *Diags) const { for (const FileCheckPattern *Pat : NotStrings) { assert((Pat->getCheckTy() == Check::CheckNot) && "Expect CHECK-NOT!"); @@ -1072,13 +1174,13 @@ bool FileCheckString::CheckNot(const SourceMgr &SM, StringRef Buffer, size_t Pos = Pat->Match(Buffer, MatchLen, VariableTable); if (Pos == StringRef::npos) { - PrintNoMatch(false, SM, Prefix, Pat->getLoc(), *Pat, Buffer, - VariableTable, Req.VerboseVerbose); + PrintNoMatch(false, SM, Prefix, Pat->getLoc(), *Pat, 1, Buffer, + VariableTable, Req.VerboseVerbose, Diags); continue; } - PrintMatch(false, SM, Prefix, Pat->getLoc(), *Pat, Buffer, VariableTable, - Pos, MatchLen, Req); + PrintMatch(false, SM, Prefix, Pat->getLoc(), *Pat, 1, Buffer, VariableTable, + Pos, MatchLen, Req, Diags); return true; } @@ -1087,10 +1189,12 @@ bool FileCheckString::CheckNot(const SourceMgr &SM, StringRef Buffer, } /// Match "dag strings" and their mixed "not strings". -size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, - std::vector &NotStrings, - StringMap &VariableTable, - const FileCheckRequest &Req) const { +size_t +FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, + std::vector &NotStrings, + StringMap &VariableTable, + const FileCheckRequest &Req, + std::vector *Diags) const { if (DagNotStrings.empty()) return 0; @@ -1133,15 +1237,15 @@ size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, // With a group of CHECK-DAGs, a single mismatching means the match on // that group of CHECK-DAGs fails immediately. if (MatchPosBuf == StringRef::npos) { - PrintNoMatch(true, SM, Prefix, Pat.getLoc(), Pat, MatchBuffer, - VariableTable, Req.VerboseVerbose); + PrintNoMatch(true, SM, Prefix, Pat.getLoc(), Pat, 1, MatchBuffer, + VariableTable, Req.VerboseVerbose, Diags); return StringRef::npos; } // Re-calc it as the offset relative to the start of the original string. MatchPos += MatchPosBuf; if (Req.VerboseVerbose) - PrintMatch(true, SM, Prefix, Pat.getLoc(), Pat, Buffer, VariableTable, - MatchPos, MatchLen, Req); + PrintMatch(true, SM, Prefix, Pat.getLoc(), Pat, 1, Buffer, + VariableTable, MatchPos, MatchLen, Req, Diags); MatchRange M{MatchPos, MatchPos + MatchLen}; if (Req.AllowDeprecatedDagOverlap) { // We don't need to track all matches in this mode, so we just maintain @@ -1178,12 +1282,14 @@ size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, SM.PrintMessage(OldStart, SourceMgr::DK_Note, "match discarded, overlaps earlier DAG match here", {OldRange}); + if (Diags) + Diags->rbegin()->MatchTy = FileCheckDiag::MatchFoundButDiscarded; } MatchPos = MI->End; } if (!Req.VerboseVerbose) - PrintMatch(true, SM, Prefix, Pat.getLoc(), Pat, Buffer, VariableTable, - MatchPos, MatchLen, Req); + PrintMatch(true, SM, Prefix, Pat.getLoc(), Pat, 1, Buffer, VariableTable, + MatchPos, MatchLen, Req, Diags); // Handle the end of a CHECK-DAG group. if (std::next(PatItr) == PatEnd || @@ -1194,7 +1300,7 @@ size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, // region. StringRef SkippedRegion = Buffer.slice(StartPos, MatchRanges.begin()->Pos); - if (CheckNot(SM, SkippedRegion, NotStrings, VariableTable, Req)) + if (CheckNot(SM, SkippedRegion, NotStrings, VariableTable, Req, Diags)) return StringRef::npos; // Clear "not strings". NotStrings.clear(); @@ -1275,7 +1381,8 @@ static void ClearLocalVars(StringMap &VariableTable) { /// /// Returns false if the input fails to satisfy the checks. bool llvm::FileCheck::CheckInput(SourceMgr &SM, StringRef Buffer, - ArrayRef CheckStrings) { + ArrayRef CheckStrings, + std::vector *Diags) { bool ChecksFailed = false; /// VariableTable - This holds all the current filecheck variables. @@ -1298,9 +1405,8 @@ bool llvm::FileCheck::CheckInput(SourceMgr &SM, StringRef Buffer, // Scan to next CHECK-LABEL match, ignoring CHECK-NOT and CHECK-DAG size_t MatchLabelLen = 0; - size_t MatchLabelPos = - CheckLabelStr.Check(SM, Buffer, true, MatchLabelLen, VariableTable, - Req); + size_t MatchLabelPos = CheckLabelStr.Check( + SM, Buffer, true, MatchLabelLen, VariableTable, Req, Diags); if (MatchLabelPos == StringRef::npos) // Immediately bail of CHECK-LABEL fails, nothing else we can do. return false; @@ -1319,8 +1425,8 @@ bool llvm::FileCheck::CheckInput(SourceMgr &SM, StringRef Buffer, // Check each string within the scanned region, including a second check // of any final CHECK-LABEL (to verify CHECK-NOT and CHECK-DAG) size_t MatchLen = 0; - size_t MatchPos = - CheckStr.Check(SM, CheckRegion, false, MatchLen, VariableTable, Req); + size_t MatchPos = CheckStr.Check(SM, CheckRegion, false, MatchLen, + VariableTable, Req, Diags); if (MatchPos == StringRef::npos) { ChecksFailed = true; diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 91e98a33b371e4db035ba21bf7e5ebe7be83e9aa..d5a688c7fb9b0b1ca5aed00d54122117c564c1af 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -211,6 +211,17 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { } } + if (Implementer == "0x48") // HiSilicon Technologies, Inc. + // Look for the CPU part line. + for (unsigned I = 0, E = Lines.size(); I != E; ++I) + if (Lines[I].startswith("CPU part")) + // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The + // values correspond to the "Part number" in the CP15/c0 register. The + // contents are specified in the various processor manuals. + return StringSwitch(Lines[I].substr(8).ltrim("\t :")) + .Case("0xd01", "tsv110") + .Default("generic"); + if (Implementer == "0x51") // Qualcomm Technologies, Inc. // Look for the CPU part line. for (unsigned I = 0, E = Lines.size(); I != E; ++I) @@ -679,12 +690,24 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, break; default: // Unknown family 6 CPU, try to guess. + if (Features & (1 << X86::FEATURE_AVX512VBMI2)) { + *Type = X86::INTEL_COREI7; + *Subtype = X86::INTEL_COREI7_ICELAKE_CLIENT; + break; + } + if (Features & (1 << X86::FEATURE_AVX512VBMI)) { *Type = X86::INTEL_COREI7; *Subtype = X86::INTEL_COREI7_CANNONLAKE; break; } + if (Features2 & (1 << (X86::FEATURE_AVX512VNNI - 32))) { + *Type = X86::INTEL_COREI7; + *Subtype = X86::INTEL_COREI7_CASCADELAKE; + break; + } + if (Features & (1 << X86::FEATURE_AVX512VL)) { *Type = X86::INTEL_COREI7; *Subtype = X86::INTEL_COREI7_SKYLAKE_AVX512; @@ -886,11 +909,11 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, auto setFeature = [&](unsigned F) { if (F < 32) - Features |= 1 << F; + Features |= 1U << (F & 0x1f); else if (F < 64) - Features2 |= 1 << (F - 32); + Features2 |= 1U << ((F - 32) & 0x1f); else if (F < 96) - Features3 |= 1 << (F - 64); + Features3 |= 1U << ((F - 64) & 0x1f); else llvm_unreachable("Unexpected FeatureBit"); }; diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index 7eba5962774d0e3e01f0f8153c5c9e16b0a90ec4..a3e6941bd62277d69c41ae9807d8ffc2fea3739d 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -533,7 +533,7 @@ void replace_path_prefix(SmallVectorImpl &Path, // If prefixes have the same size we can simply copy the new one over. if (OldPrefix.size() == NewPrefix.size()) { - std::copy(NewPrefix.begin(), NewPrefix.end(), Path.begin()); + llvm::copy(NewPrefix, Path.begin()); return; } diff --git a/lib/Support/RandomNumberGenerator.cpp b/lib/Support/RandomNumberGenerator.cpp index f1f22af82a81579e991a81e8f8c09a91c5305695..df0d87fab021b558cf58ac095a0e985bcf096b69 100644 --- a/lib/Support/RandomNumberGenerator.cpp +++ b/lib/Support/RandomNumberGenerator.cpp @@ -49,7 +49,7 @@ RandomNumberGenerator::RandomNumberGenerator(StringRef Salt) { Data[0] = Seed; Data[1] = Seed >> 32; - std::copy(Salt.begin(), Salt.end(), Data.begin() + 2); + llvm::copy(Salt, Data.begin() + 2); std::seed_seq SeedSeq(Data.begin(), Data.end()); Generator.seed(SeedSeq); diff --git a/lib/Support/Signals.cpp b/lib/Support/Signals.cpp index 6534ff69b84cb3b2352091b837d14b7f7ac8fcaf..333f492d4589489840567abc58db7699c18b6994 100644 --- a/lib/Support/Signals.cpp +++ b/lib/Support/Signals.cpp @@ -20,6 +20,8 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/Format.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/FormatAdapters.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Mutex.h" @@ -155,7 +157,7 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace, } Optional Redirects[] = {StringRef(InputFile), - StringRef(OutputFile), llvm::None}; + StringRef(OutputFile), StringRef("")}; StringRef Args[] = {"llvm-symbolizer", "--functions=linkage", "--inlining", #ifdef _WIN32 // Pass --relative-address on Windows so that we don't @@ -180,8 +182,14 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace, auto CurLine = Lines.begin(); int frame_no = 0; for (int i = 0; i < Depth; i++) { + auto PrintLineHeader = [&]() { + OS << right_justify(formatv("#{0}", frame_no++).str(), + std::log10(Depth) + 2) + << ' ' << format_ptr(StackTrace[i]) << ' '; + }; if (!Modules[i]) { - OS << '#' << frame_no++ << ' ' << format_ptr(StackTrace[i]) << '\n'; + PrintLineHeader(); + OS << '\n'; continue; } // Read pairs of lines (function name and file/line info) until we @@ -192,7 +200,7 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace, StringRef FunctionName = *CurLine++; if (FunctionName.empty()) break; - OS << '#' << frame_no++ << ' ' << format_ptr(StackTrace[i]) << ' '; + PrintLineHeader(); if (!FunctionName.startswith("??")) OS << FunctionName << ' '; if (CurLine == Lines.end()) diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp index 968b559c08dfa5a268347faa4d6e6f2c1c97987e..bdc0dc52c5e2334b4d4fb6d8e68618eb38457013 100644 --- a/lib/Support/TargetParser.cpp +++ b/lib/Support/TargetParser.cpp @@ -17,944 +17,12 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" -#include using namespace llvm; -using namespace ARM; -using namespace AArch64; using namespace AMDGPU; namespace { -// List of canonical FPU names (use getFPUSynonym) and which architectural -// features they correspond to (use getFPUFeatures). -// FIXME: TableGen this. -// The entries must appear in the order listed in ARM::FPUKind for correct indexing -static const struct { - const char *NameCStr; - size_t NameLength; - ARM::FPUKind ID; - ARM::FPUVersion FPUVersion; - ARM::NeonSupportLevel NeonSupport; - ARM::FPURestriction Restriction; - - StringRef getName() const { return StringRef(NameCStr, NameLength); } -} FPUNames[] = { -#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) \ - { NAME, sizeof(NAME) - 1, KIND, VERSION, NEON_SUPPORT, RESTRICTION }, -#include "llvm/Support/ARMTargetParser.def" -}; - -// List of canonical arch names (use getArchSynonym). -// This table also provides the build attribute fields for CPU arch -// and Arch ID, according to the Addenda to the ARM ABI, chapters -// 2.4 and 2.3.5.2 respectively. -// FIXME: SubArch values were simplified to fit into the expectations -// of the triples and are not conforming with their official names. -// Check to see if the expectation should be changed. -// FIXME: TableGen this. -template struct ArchNames { - const char *NameCStr; - size_t NameLength; - const char *CPUAttrCStr; - size_t CPUAttrLength; - const char *SubArchCStr; - size_t SubArchLength; - unsigned DefaultFPU; - unsigned ArchBaseExtensions; - T ID; - ARMBuildAttrs::CPUArch ArchAttr; // Arch ID in build attributes. - - StringRef getName() const { return StringRef(NameCStr, NameLength); } - - // CPU class in build attributes. - StringRef getCPUAttr() const { return StringRef(CPUAttrCStr, CPUAttrLength); } - - // Sub-Arch name. - StringRef getSubArch() const { return StringRef(SubArchCStr, SubArchLength); } -}; -ArchNames ARCHNames[] = { -#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) \ - {NAME, sizeof(NAME) - 1, CPU_ATTR, sizeof(CPU_ATTR) - 1, SUB_ARCH, \ - sizeof(SUB_ARCH) - 1, ARCH_FPU, ARCH_BASE_EXT, ARM::ArchKind::ID, ARCH_ATTR}, -#include "llvm/Support/ARMTargetParser.def" -}; - -ArchNames AArch64ARCHNames[] = { - #define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) \ - {NAME, sizeof(NAME) - 1, CPU_ATTR, sizeof(CPU_ATTR) - 1, SUB_ARCH, \ - sizeof(SUB_ARCH) - 1, ARCH_FPU, ARCH_BASE_EXT, AArch64::ArchKind::ID, ARCH_ATTR}, - #include "llvm/Support/AArch64TargetParser.def" - }; - - -// List of Arch Extension names. -// FIXME: TableGen this. -static const struct { - const char *NameCStr; - size_t NameLength; - unsigned ID; - const char *Feature; - const char *NegFeature; - - StringRef getName() const { return StringRef(NameCStr, NameLength); } -} ARCHExtNames[] = { -#define ARM_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \ - { NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE }, -#include "llvm/Support/ARMTargetParser.def" -},AArch64ARCHExtNames[] = { -#define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \ - { NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE }, -#include "llvm/Support/AArch64TargetParser.def" -}; - -// List of HWDiv names (use getHWDivSynonym) and which architectural -// features they correspond to (use getHWDivFeatures). -// FIXME: TableGen this. -static const struct { - const char *NameCStr; - size_t NameLength; - unsigned ID; - - StringRef getName() const { return StringRef(NameCStr, NameLength); } -} HWDivNames[] = { -#define ARM_HW_DIV_NAME(NAME, ID) { NAME, sizeof(NAME) - 1, ID }, -#include "llvm/Support/ARMTargetParser.def" -}; - -// List of CPU names and their arches. -// The same CPU can have multiple arches and can be default on multiple arches. -// When finding the Arch for a CPU, first-found prevails. Sort them accordingly. -// When this becomes table-generated, we'd probably need two tables. -// FIXME: TableGen this. -template struct CpuNames { - const char *NameCStr; - size_t NameLength; - T ArchID; - bool Default; // is $Name the default CPU for $ArchID ? - unsigned DefaultExtensions; - - StringRef getName() const { return StringRef(NameCStr, NameLength); } -}; -CpuNames CPUNames[] = { -#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ - { NAME, sizeof(NAME) - 1, ARM::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT }, -#include "llvm/Support/ARMTargetParser.def" -}; - -CpuNames AArch64CPUNames[] = { - #define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ - { NAME, sizeof(NAME) - 1, AArch64::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT }, - #include "llvm/Support/AArch64TargetParser.def" - }; - -} // namespace - -// ======================================================= // -// Information by ID -// ======================================================= // - -StringRef ARM::getFPUName(unsigned FPUKind) { - if (FPUKind >= ARM::FK_LAST) - return StringRef(); - return FPUNames[FPUKind].getName(); -} - -FPUVersion ARM::getFPUVersion(unsigned FPUKind) { - if (FPUKind >= ARM::FK_LAST) - return FPUVersion::NONE; - return FPUNames[FPUKind].FPUVersion; -} - -ARM::NeonSupportLevel ARM::getFPUNeonSupportLevel(unsigned FPUKind) { - if (FPUKind >= ARM::FK_LAST) - return ARM::NeonSupportLevel::None; - return FPUNames[FPUKind].NeonSupport; -} - -ARM::FPURestriction ARM::getFPURestriction(unsigned FPUKind) { - if (FPUKind >= ARM::FK_LAST) - return ARM::FPURestriction::None; - return FPUNames[FPUKind].Restriction; -} - -unsigned llvm::ARM::getDefaultFPU(StringRef CPU, ArchKind AK) { - if (CPU == "generic") - return ARCHNames[static_cast(AK)].DefaultFPU; - - return StringSwitch(CPU) -#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ - .Case(NAME, DEFAULT_FPU) -#include "llvm/Support/ARMTargetParser.def" - .Default(ARM::FK_INVALID); -} - -unsigned llvm::ARM::getDefaultExtensions(StringRef CPU, ArchKind AK) { - if (CPU == "generic") - return ARCHNames[static_cast(AK)].ArchBaseExtensions; - - return StringSwitch(CPU) -#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ - .Case(NAME, ARCHNames[static_cast(ARM::ArchKind::ID)]\ - .ArchBaseExtensions | DEFAULT_EXT) -#include "llvm/Support/ARMTargetParser.def" - .Default(ARM::AEK_INVALID); -} - -bool llvm::ARM::getHWDivFeatures(unsigned HWDivKind, - std::vector &Features) { - - if (HWDivKind == ARM::AEK_INVALID) - return false; - - if (HWDivKind & ARM::AEK_HWDIVARM) - Features.push_back("+hwdiv-arm"); - else - Features.push_back("-hwdiv-arm"); - - if (HWDivKind & ARM::AEK_HWDIVTHUMB) - Features.push_back("+hwdiv"); - else - Features.push_back("-hwdiv"); - - return true; -} - -bool llvm::ARM::getExtensionFeatures(unsigned Extensions, - std::vector &Features) { - - if (Extensions == ARM::AEK_INVALID) - return false; - - if (Extensions & ARM::AEK_CRC) - Features.push_back("+crc"); - else - Features.push_back("-crc"); - - if (Extensions & ARM::AEK_DSP) - Features.push_back("+dsp"); - else - Features.push_back("-dsp"); - - if (Extensions & ARM::AEK_FP16FML) - Features.push_back("+fp16fml"); - else - Features.push_back("-fp16fml"); - - if (Extensions & ARM::AEK_RAS) - Features.push_back("+ras"); - else - Features.push_back("-ras"); - - if (Extensions & ARM::AEK_DOTPROD) - Features.push_back("+dotprod"); - else - Features.push_back("-dotprod"); - - return getHWDivFeatures(Extensions, Features); -} - -bool llvm::ARM::getFPUFeatures(unsigned FPUKind, - std::vector &Features) { - - if (FPUKind >= ARM::FK_LAST || FPUKind == ARM::FK_INVALID) - return false; - - // fp-only-sp and d16 subtarget features are independent of each other, so we - // must enable/disable both. - switch (FPUNames[FPUKind].Restriction) { - case ARM::FPURestriction::SP_D16: - Features.push_back("+fp-only-sp"); - Features.push_back("+d16"); - break; - case ARM::FPURestriction::D16: - Features.push_back("-fp-only-sp"); - Features.push_back("+d16"); - break; - case ARM::FPURestriction::None: - Features.push_back("-fp-only-sp"); - Features.push_back("-d16"); - break; - } - - // FPU version subtarget features are inclusive of lower-numbered ones, so - // enable the one corresponding to this version and disable all that are - // higher. We also have to make sure to disable fp16 when vfp4 is disabled, - // as +vfp4 implies +fp16 but -vfp4 does not imply -fp16. - switch (FPUNames[FPUKind].FPUVersion) { - case ARM::FPUVersion::VFPV5: - Features.push_back("+fp-armv8"); - break; - case ARM::FPUVersion::VFPV4: - Features.push_back("+vfp4"); - Features.push_back("-fp-armv8"); - break; - case ARM::FPUVersion::VFPV3_FP16: - Features.push_back("+vfp3"); - Features.push_back("+fp16"); - Features.push_back("-vfp4"); - Features.push_back("-fp-armv8"); - break; - case ARM::FPUVersion::VFPV3: - Features.push_back("+vfp3"); - Features.push_back("-fp16"); - Features.push_back("-vfp4"); - Features.push_back("-fp-armv8"); - break; - case ARM::FPUVersion::VFPV2: - Features.push_back("+vfp2"); - Features.push_back("-vfp3"); - Features.push_back("-fp16"); - Features.push_back("-vfp4"); - Features.push_back("-fp-armv8"); - break; - case ARM::FPUVersion::NONE: - Features.push_back("-vfp2"); - Features.push_back("-vfp3"); - Features.push_back("-fp16"); - Features.push_back("-vfp4"); - Features.push_back("-fp-armv8"); - break; - } - - // crypto includes neon, so we handle this similarly to FPU version. - switch (FPUNames[FPUKind].NeonSupport) { - case ARM::NeonSupportLevel::Crypto: - Features.push_back("+neon"); - Features.push_back("+crypto"); - break; - case ARM::NeonSupportLevel::Neon: - Features.push_back("+neon"); - Features.push_back("-crypto"); - break; - case ARM::NeonSupportLevel::None: - Features.push_back("-neon"); - Features.push_back("-crypto"); - break; - } - - return true; -} - -StringRef llvm::ARM::getArchName(ArchKind AK) { - return ARCHNames[static_cast(AK)].getName(); -} - -StringRef llvm::ARM::getCPUAttr(ArchKind AK) { - return ARCHNames[static_cast(AK)].getCPUAttr(); -} - -StringRef llvm::ARM::getSubArch(ArchKind AK) { - return ARCHNames[static_cast(AK)].getSubArch(); -} - -unsigned llvm::ARM::getArchAttr(ArchKind AK) { - return ARCHNames[static_cast(AK)].ArchAttr; -} - -StringRef llvm::ARM::getArchExtName(unsigned ArchExtKind) { - for (const auto AE : ARCHExtNames) { - if (ArchExtKind == AE.ID) - return AE.getName(); - } - return StringRef(); -} - -StringRef llvm::ARM::getArchExtFeature(StringRef ArchExt) { - if (ArchExt.startswith("no")) { - StringRef ArchExtBase(ArchExt.substr(2)); - for (const auto AE : ARCHExtNames) { - if (AE.NegFeature && ArchExtBase == AE.getName()) - return StringRef(AE.NegFeature); - } - } - for (const auto AE : ARCHExtNames) { - if (AE.Feature && ArchExt == AE.getName()) - return StringRef(AE.Feature); - } - - return StringRef(); -} - -StringRef llvm::ARM::getHWDivName(unsigned HWDivKind) { - for (const auto D : HWDivNames) { - if (HWDivKind == D.ID) - return D.getName(); - } - return StringRef(); -} - -StringRef llvm::ARM::getDefaultCPU(StringRef Arch) { - ArchKind AK = parseArch(Arch); - if (AK == ARM::ArchKind::INVALID) - return StringRef(); - - // Look for multiple AKs to find the default for pair AK+Name. - for (const auto CPU : CPUNames) { - if (CPU.ArchID == AK && CPU.Default) - return CPU.getName(); - } - - // If we can't find a default then target the architecture instead - return "generic"; -} - -StringRef llvm::AArch64::getFPUName(unsigned FPUKind) { - return ARM::getFPUName(FPUKind); -} - -ARM::FPUVersion AArch64::getFPUVersion(unsigned FPUKind) { - return ARM::getFPUVersion(FPUKind); -} - -ARM::NeonSupportLevel AArch64::getFPUNeonSupportLevel(unsigned FPUKind) { - return ARM::getFPUNeonSupportLevel( FPUKind); -} - -ARM::FPURestriction AArch64::getFPURestriction(unsigned FPUKind) { - return ARM::getFPURestriction(FPUKind); -} - -unsigned llvm::AArch64::getDefaultFPU(StringRef CPU, ArchKind AK) { - if (CPU == "generic") - return AArch64ARCHNames[static_cast(AK)].DefaultFPU; - - return StringSwitch(CPU) -#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ - .Case(NAME, DEFAULT_FPU) -#include "llvm/Support/AArch64TargetParser.def" - .Default(ARM::FK_INVALID); -} - -unsigned llvm::AArch64::getDefaultExtensions(StringRef CPU, ArchKind AK) { - if (CPU == "generic") - return AArch64ARCHNames[static_cast(AK)].ArchBaseExtensions; - - return StringSwitch(CPU) -#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ - .Case(NAME, \ - AArch64ARCHNames[static_cast(AArch64::ArchKind::ID)] \ - .ArchBaseExtensions | \ - DEFAULT_EXT) -#include "llvm/Support/AArch64TargetParser.def" - .Default(AArch64::AEK_INVALID); -} - -AArch64::ArchKind llvm::AArch64::getCPUArchKind(StringRef CPU) { - if (CPU == "generic") - return AArch64::ArchKind::ARMV8A; - - return StringSwitch(CPU) -#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ - .Case(NAME, AArch64::ArchKind:: ID) -#include "llvm/Support/AArch64TargetParser.def" - .Default(AArch64::ArchKind::INVALID); -} - -bool llvm::AArch64::getExtensionFeatures(unsigned Extensions, - std::vector &Features) { - - if (Extensions == AArch64::AEK_INVALID) - return false; - - if (Extensions & AArch64::AEK_FP) - Features.push_back("+fp-armv8"); - if (Extensions & AArch64::AEK_SIMD) - Features.push_back("+neon"); - if (Extensions & AArch64::AEK_CRC) - Features.push_back("+crc"); - if (Extensions & AArch64::AEK_CRYPTO) - Features.push_back("+crypto"); - if (Extensions & AArch64::AEK_DOTPROD) - Features.push_back("+dotprod"); - if (Extensions & AArch64::AEK_FP16FML) - Features.push_back("+fp16fml"); - if (Extensions & AArch64::AEK_FP16) - Features.push_back("+fullfp16"); - if (Extensions & AArch64::AEK_PROFILE) - Features.push_back("+spe"); - if (Extensions & AArch64::AEK_RAS) - Features.push_back("+ras"); - if (Extensions & AArch64::AEK_LSE) - Features.push_back("+lse"); - if (Extensions & AArch64::AEK_RDM) - Features.push_back("+rdm"); - if (Extensions & AArch64::AEK_SVE) - Features.push_back("+sve"); - if (Extensions & AArch64::AEK_RCPC) - Features.push_back("+rcpc"); - - return true; -} - -bool llvm::AArch64::getFPUFeatures(unsigned FPUKind, - std::vector &Features) { - return ARM::getFPUFeatures(FPUKind, Features); -} - -bool llvm::AArch64::getArchFeatures(AArch64::ArchKind AK, - std::vector &Features) { - if (AK == AArch64::ArchKind::ARMV8_1A) - Features.push_back("+v8.1a"); - if (AK == AArch64::ArchKind::ARMV8_2A) - Features.push_back("+v8.2a"); - if (AK == AArch64::ArchKind::ARMV8_3A) - Features.push_back("+v8.3a"); - if (AK == AArch64::ArchKind::ARMV8_4A) - Features.push_back("+v8.4a"); - if (AK == AArch64::ArchKind::ARMV8_5A) - Features.push_back("+v8.5a"); - - return AK != AArch64::ArchKind::INVALID; -} - -StringRef llvm::AArch64::getArchName(ArchKind AK) { - return AArch64ARCHNames[static_cast(AK)].getName(); -} - -StringRef llvm::AArch64::getCPUAttr(ArchKind AK) { - return AArch64ARCHNames[static_cast(AK)].getCPUAttr(); -} - -StringRef llvm::AArch64::getSubArch(ArchKind AK) { - return AArch64ARCHNames[static_cast(AK)].getSubArch(); -} - -unsigned llvm::AArch64::getArchAttr(ArchKind AK) { - return AArch64ARCHNames[static_cast(AK)].ArchAttr; -} - -StringRef llvm::AArch64::getArchExtName(unsigned ArchExtKind) { - for (const auto &AE : AArch64ARCHExtNames) - if (ArchExtKind == AE.ID) - return AE.getName(); - return StringRef(); -} - -StringRef llvm::AArch64::getArchExtFeature(StringRef ArchExt) { - if (ArchExt.startswith("no")) { - StringRef ArchExtBase(ArchExt.substr(2)); - for (const auto &AE : AArch64ARCHExtNames) { - if (AE.NegFeature && ArchExtBase == AE.getName()) - return StringRef(AE.NegFeature); - } - } - - for (const auto &AE : AArch64ARCHExtNames) - if (AE.Feature && ArchExt == AE.getName()) - return StringRef(AE.Feature); - return StringRef(); -} - -StringRef llvm::AArch64::getDefaultCPU(StringRef Arch) { - AArch64::ArchKind AK = parseArch(Arch); - if (AK == ArchKind::INVALID) - return StringRef(); - - // Look for multiple AKs to find the default for pair AK+Name. - for (const auto &CPU : AArch64CPUNames) - if (CPU.ArchID == AK && CPU.Default) - return CPU.getName(); - - // If we can't find a default then target the architecture instead - return "generic"; -} - -unsigned llvm::AArch64::checkArchVersion(StringRef Arch) { - if (Arch.size() >= 2 && Arch[0] == 'v' && std::isdigit(Arch[1])) - return (Arch[1] - 48); - return 0; -} - -// ======================================================= // -// Parsers -// ======================================================= // - -static StringRef getHWDivSynonym(StringRef HWDiv) { - return StringSwitch(HWDiv) - .Case("thumb,arm", "arm,thumb") - .Default(HWDiv); -} - -static StringRef getFPUSynonym(StringRef FPU) { - return StringSwitch(FPU) - .Cases("fpa", "fpe2", "fpe3", "maverick", "invalid") // Unsupported - .Case("vfp2", "vfpv2") - .Case("vfp3", "vfpv3") - .Case("vfp4", "vfpv4") - .Case("vfp3-d16", "vfpv3-d16") - .Case("vfp4-d16", "vfpv4-d16") - .Cases("fp4-sp-d16", "vfpv4-sp-d16", "fpv4-sp-d16") - .Cases("fp4-dp-d16", "fpv4-dp-d16", "vfpv4-d16") - .Case("fp5-sp-d16", "fpv5-sp-d16") - .Cases("fp5-dp-d16", "fpv5-dp-d16", "fpv5-d16") - // FIXME: Clang uses it, but it's bogus, since neon defaults to vfpv3. - .Case("neon-vfpv3", "neon") - .Default(FPU); -} - -static StringRef getArchSynonym(StringRef Arch) { - return StringSwitch(Arch) - .Case("v5", "v5t") - .Case("v5e", "v5te") - .Case("v6j", "v6") - .Case("v6hl", "v6k") - .Cases("v6m", "v6sm", "v6s-m", "v6-m") - .Cases("v6z", "v6zk", "v6kz") - .Cases("v7", "v7a", "v7hl", "v7l", "v7-a") - .Case("v7r", "v7-r") - .Case("v7m", "v7-m") - .Case("v7em", "v7e-m") - .Cases("v8", "v8a", "v8l", "aarch64", "arm64", "v8-a") - .Case("v8.1a", "v8.1-a") - .Case("v8.2a", "v8.2-a") - .Case("v8.3a", "v8.3-a") - .Case("v8.4a", "v8.4-a") - .Case("v8.5a", "v8.5-a") - .Case("v8r", "v8-r") - .Case("v8m.base", "v8-m.base") - .Case("v8m.main", "v8-m.main") - .Default(Arch); -} - -// MArch is expected to be of the form (arm|thumb)?(eb)?(v.+)?(eb)?, but -// (iwmmxt|xscale)(eb)? is also permitted. If the former, return -// "v.+", if the latter, return unmodified string, minus 'eb'. -// If invalid, return empty string. -StringRef llvm::ARM::getCanonicalArchName(StringRef Arch) { - size_t offset = StringRef::npos; - StringRef A = Arch; - StringRef Error = ""; - - // Begins with "arm" / "thumb", move past it. - if (A.startswith("arm64")) - offset = 5; - else if (A.startswith("arm")) - offset = 3; - else if (A.startswith("thumb")) - offset = 5; - else if (A.startswith("aarch64")) { - offset = 7; - // AArch64 uses "_be", not "eb" suffix. - if (A.find("eb") != StringRef::npos) - return Error; - if (A.substr(offset, 3) == "_be") - offset += 3; - } - - // Ex. "armebv7", move past the "eb". - if (offset != StringRef::npos && A.substr(offset, 2) == "eb") - offset += 2; - // Or, if it ends with eb ("armv7eb"), chop it off. - else if (A.endswith("eb")) - A = A.substr(0, A.size() - 2); - // Trim the head - if (offset != StringRef::npos) - A = A.substr(offset); - - // Empty string means offset reached the end, which means it's valid. - if (A.empty()) - return Arch; - - // Only match non-marketing names - if (offset != StringRef::npos) { - // Must start with 'vN'. - if (A.size() >= 2 && (A[0] != 'v' || !std::isdigit(A[1]))) - return Error; - // Can't have an extra 'eb'. - if (A.find("eb") != StringRef::npos) - return Error; - } - - // Arch will either be a 'v' name (v7a) or a marketing name (xscale). - return A; -} - -unsigned llvm::ARM::parseHWDiv(StringRef HWDiv) { - StringRef Syn = getHWDivSynonym(HWDiv); - for (const auto D : HWDivNames) { - if (Syn == D.getName()) - return D.ID; - } - return ARM::AEK_INVALID; -} - -unsigned llvm::ARM::parseFPU(StringRef FPU) { - StringRef Syn = getFPUSynonym(FPU); - for (const auto F : FPUNames) { - if (Syn == F.getName()) - return F.ID; - } - return ARM::FK_INVALID; -} - -// Allows partial match, ex. "v7a" matches "armv7a". -ARM::ArchKind ARM::parseArch(StringRef Arch) { - Arch = getCanonicalArchName(Arch); - StringRef Syn = getArchSynonym(Arch); - for (const auto A : ARCHNames) { - if (A.getName().endswith(Syn)) - return A.ID; - } - return ARM::ArchKind::INVALID; -} - -unsigned llvm::ARM::parseArchExt(StringRef ArchExt) { - for (const auto A : ARCHExtNames) { - if (ArchExt == A.getName()) - return A.ID; - } - return ARM::AEK_INVALID; -} - -ARM::ArchKind llvm::ARM::parseCPUArch(StringRef CPU) { - for (const auto C : CPUNames) { - if (CPU == C.getName()) - return C.ArchID; - } - return ARM::ArchKind::INVALID; -} - -void llvm::ARM::fillValidCPUArchList(SmallVectorImpl &Values) { - for (const CpuNames &Arch : CPUNames) { - if (Arch.ArchID != ARM::ArchKind::INVALID) - Values.push_back(Arch.getName()); - } -} - -void llvm::AArch64::fillValidCPUArchList(SmallVectorImpl &Values) { - for (const CpuNames &Arch : AArch64CPUNames) { - if (Arch.ArchID != AArch64::ArchKind::INVALID) - Values.push_back(Arch.getName()); - } -} - -// ARM, Thumb, AArch64 -ARM::ISAKind ARM::parseArchISA(StringRef Arch) { - return StringSwitch(Arch) - .StartsWith("aarch64", ARM::ISAKind::AARCH64) - .StartsWith("arm64", ARM::ISAKind::AARCH64) - .StartsWith("thumb", ARM::ISAKind::THUMB) - .StartsWith("arm", ARM::ISAKind::ARM) - .Default(ARM::ISAKind::INVALID); -} - -// Little/Big endian -ARM::EndianKind ARM::parseArchEndian(StringRef Arch) { - if (Arch.startswith("armeb") || Arch.startswith("thumbeb") || - Arch.startswith("aarch64_be")) - return ARM::EndianKind::BIG; - - if (Arch.startswith("arm") || Arch.startswith("thumb")) { - if (Arch.endswith("eb")) - return ARM::EndianKind::BIG; - else - return ARM::EndianKind::LITTLE; - } - - if (Arch.startswith("aarch64")) - return ARM::EndianKind::LITTLE; - - return ARM::EndianKind::INVALID; -} - -// Profile A/R/M -ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) { - Arch = getCanonicalArchName(Arch); - switch (parseArch(Arch)) { - case ARM::ArchKind::ARMV6M: - case ARM::ArchKind::ARMV7M: - case ARM::ArchKind::ARMV7EM: - case ARM::ArchKind::ARMV8MMainline: - case ARM::ArchKind::ARMV8MBaseline: - return ARM::ProfileKind::M; - case ARM::ArchKind::ARMV7R: - case ARM::ArchKind::ARMV8R: - return ARM::ProfileKind::R; - case ARM::ArchKind::ARMV7A: - case ARM::ArchKind::ARMV7VE: - case ARM::ArchKind::ARMV7K: - case ARM::ArchKind::ARMV8A: - case ARM::ArchKind::ARMV8_1A: - case ARM::ArchKind::ARMV8_2A: - case ARM::ArchKind::ARMV8_3A: - case ARM::ArchKind::ARMV8_4A: - case ARM::ArchKind::ARMV8_5A: - return ARM::ProfileKind::A; - case ARM::ArchKind::ARMV2: - case ARM::ArchKind::ARMV2A: - case ARM::ArchKind::ARMV3: - case ARM::ArchKind::ARMV3M: - case ARM::ArchKind::ARMV4: - case ARM::ArchKind::ARMV4T: - case ARM::ArchKind::ARMV5T: - case ARM::ArchKind::ARMV5TE: - case ARM::ArchKind::ARMV5TEJ: - case ARM::ArchKind::ARMV6: - case ARM::ArchKind::ARMV6K: - case ARM::ArchKind::ARMV6T2: - case ARM::ArchKind::ARMV6KZ: - case ARM::ArchKind::ARMV7S: - case ARM::ArchKind::IWMMXT: - case ARM::ArchKind::IWMMXT2: - case ARM::ArchKind::XSCALE: - case ARM::ArchKind::INVALID: - return ARM::ProfileKind::INVALID; - } - llvm_unreachable("Unhandled architecture"); -} - -// Version number (ex. v7 = 7). -unsigned llvm::ARM::parseArchVersion(StringRef Arch) { - Arch = getCanonicalArchName(Arch); - switch (parseArch(Arch)) { - case ARM::ArchKind::ARMV2: - case ARM::ArchKind::ARMV2A: - return 2; - case ARM::ArchKind::ARMV3: - case ARM::ArchKind::ARMV3M: - return 3; - case ARM::ArchKind::ARMV4: - case ARM::ArchKind::ARMV4T: - return 4; - case ARM::ArchKind::ARMV5T: - case ARM::ArchKind::ARMV5TE: - case ARM::ArchKind::IWMMXT: - case ARM::ArchKind::IWMMXT2: - case ARM::ArchKind::XSCALE: - case ARM::ArchKind::ARMV5TEJ: - return 5; - case ARM::ArchKind::ARMV6: - case ARM::ArchKind::ARMV6K: - case ARM::ArchKind::ARMV6T2: - case ARM::ArchKind::ARMV6KZ: - case ARM::ArchKind::ARMV6M: - return 6; - case ARM::ArchKind::ARMV7A: - case ARM::ArchKind::ARMV7VE: - case ARM::ArchKind::ARMV7R: - case ARM::ArchKind::ARMV7M: - case ARM::ArchKind::ARMV7S: - case ARM::ArchKind::ARMV7EM: - case ARM::ArchKind::ARMV7K: - return 7; - case ARM::ArchKind::ARMV8A: - case ARM::ArchKind::ARMV8_1A: - case ARM::ArchKind::ARMV8_2A: - case ARM::ArchKind::ARMV8_3A: - case ARM::ArchKind::ARMV8_4A: - case ARM::ArchKind::ARMV8_5A: - case ARM::ArchKind::ARMV8R: - case ARM::ArchKind::ARMV8MBaseline: - case ARM::ArchKind::ARMV8MMainline: - return 8; - case ARM::ArchKind::INVALID: - return 0; - } - llvm_unreachable("Unhandled architecture"); -} - -StringRef llvm::ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) { - StringRef ArchName = - CPU.empty() ? TT.getArchName() : ARM::getArchName(ARM::parseCPUArch(CPU)); - - if (TT.isOSBinFormatMachO()) { - if (TT.getEnvironment() == Triple::EABI || - TT.getOS() == Triple::UnknownOS || - llvm::ARM::parseArchProfile(ArchName) == ARM::ProfileKind::M) - return "aapcs"; - if (TT.isWatchABI()) - return "aapcs16"; - return "apcs-gnu"; - } else if (TT.isOSWindows()) - // FIXME: this is invalid for WindowsCE. - return "aapcs"; - - // Select the default based on the platform. - switch (TT.getEnvironment()) { - case Triple::Android: - case Triple::GNUEABI: - case Triple::GNUEABIHF: - case Triple::MuslEABI: - case Triple::MuslEABIHF: - return "aapcs-linux"; - case Triple::EABIHF: - case Triple::EABI: - return "aapcs"; - default: - if (TT.isOSNetBSD()) - return "apcs-gnu"; - if (TT.isOSOpenBSD()) - return "aapcs-linux"; - return "aapcs"; - } -} - -StringRef llvm::AArch64::getCanonicalArchName(StringRef Arch) { - return ARM::getCanonicalArchName(Arch); -} - -unsigned llvm::AArch64::parseFPU(StringRef FPU) { - return ARM::parseFPU(FPU); -} - -// Allows partial match, ex. "v8a" matches "armv8a". -AArch64::ArchKind AArch64::parseArch(StringRef Arch) { - Arch = getCanonicalArchName(Arch); - if (checkArchVersion(Arch) < 8) - return ArchKind::INVALID; - - StringRef Syn = getArchSynonym(Arch); - for (const auto A : AArch64ARCHNames) { - if (A.getName().endswith(Syn)) - return A.ID; - } - return ArchKind::INVALID; -} - -AArch64::ArchExtKind llvm::AArch64::parseArchExt(StringRef ArchExt) { - for (const auto A : AArch64ARCHExtNames) { - if (ArchExt == A.getName()) - return static_cast(A.ID); - } - return AArch64::AEK_INVALID; -} - -AArch64::ArchKind llvm::AArch64::parseCPUArch(StringRef CPU) { - for (const auto C : AArch64CPUNames) { - if (CPU == C.getName()) - return C.ArchID; - } - return ArchKind::INVALID; -} - -// ARM, Thumb, AArch64 -ARM::ISAKind AArch64::parseArchISA(StringRef Arch) { - return ARM::parseArchISA(Arch); -} - -// Little/Big endian -ARM::EndianKind AArch64::parseArchEndian(StringRef Arch) { - return ARM::parseArchEndian(Arch); -} - -// Profile A/R/M -ARM::ProfileKind AArch64::parseArchProfile(StringRef Arch) { - return ARM::parseArchProfile(Arch); -} - -// Version number (ex. v8 = 8). -unsigned llvm::AArch64::parseArchVersion(StringRef Arch) { - return ARM::parseArchVersion(Arch); -} - -bool llvm::AArch64::isX18ReservedByDefault(const Triple &TT) { - return TT.isAndroid() || TT.isOSDarwin() || TT.isOSFuchsia() || - TT.isOSWindows(); -} - -namespace { - struct GPUInfo { StringLiteral Name; StringLiteral CanonicalName; diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index c53c878e9df1217b6758ff323facc3fc56eba816..4471fd05181eaf58bd4390d5c110169cd68acada 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -210,6 +210,7 @@ StringRef Triple::getOSTypeName(OSType Kind) { case Contiki: return "contiki"; case AMDPAL: return "amdpal"; case HermitCore: return "hermit"; + case Hurd: return "hurd"; } llvm_unreachable("Invalid OSType"); @@ -508,6 +509,7 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("contiki", Triple::Contiki) .StartsWith("amdpal", Triple::AMDPAL) .StartsWith("hermit", Triple::HermitCore) + .StartsWith("hurd", Triple::Hurd) .Default(Triple::UnknownOS); } diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index 02b7c2579c9c5bf13fd55ac2933ca7f75471b09b..d7cc0d627d09e894f6fb380bf18946576c526bd9 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -38,6 +38,8 @@ #ifdef __APPLE__ #include #include +#elif defined(__DragonFly__) +#include #endif // Both stdio.h and cstdio are included via different paths and @@ -54,7 +56,7 @@ #include #if !defined(__APPLE__) && !defined(__OpenBSD__) && !defined(__FreeBSD__) && \ - !defined(__linux__) + !defined(__linux__) && !defined(__FreeBSD_kernel__) #include #define STATVFS statvfs #define FSTATVFS fstatvfs @@ -83,7 +85,7 @@ #define STATVFS_F_FRSIZE(vfs) static_cast(vfs.f_bsize) #endif -#if defined(__NetBSD__) || defined(__GNU__) +#if defined(__NetBSD__) || defined(__DragonFly__) || defined(__GNU__) #define STATVFS_F_FLAG(vfs) (vfs).f_flag #else #define STATVFS_F_FLAG(vfs) (vfs).f_flags @@ -227,11 +229,11 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) { } TimePoint<> basic_file_status::getLastAccessedTime() const { - return toTimePoint(fs_st_atime); + return toTimePoint(fs_st_atime, fs_st_atime_nsec); } TimePoint<> basic_file_status::getLastModificationTime() const { - return toTimePoint(fs_st_mtime); + return toTimePoint(fs_st_mtime, fs_st_mtime_nsec); } UniqueID file_status::getUniqueID() const { @@ -548,6 +550,18 @@ static void expandTildeExpr(SmallVectorImpl &Path) { llvm::sys::path::append(Path, Storage); } + +void expand_tilde(const Twine &path, SmallVectorImpl &dest) { + dest.clear(); + if (path.isTriviallyEmpty()) + return; + + path.toVector(dest); + expandTildeExpr(dest); + + return; +} + static file_type typeForMode(mode_t Mode) { if (S_ISDIR(Mode)) return file_type::directory_file; @@ -577,11 +591,22 @@ static std::error_code fillStatus(int StatRet, const struct stat &Status, return EC; } + uint32_t atime_nsec, mtime_nsec; +#if defined(HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC) + atime_nsec = Status.st_atimespec.tv_nsec; + mtime_nsec = Status.st_mtimespec.tv_nsec; +#elif defined(HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC) + atime_nsec = Status.st_atim.tv_nsec; + mtime_nsec = Status.st_mtim.tv_nsec; +#else + atime_nsec = mtime_nsec = 0; +#endif + perms Perms = static_cast(Status.st_mode) & all_perms; Result = file_status(typeForMode(Status.st_mode), Perms, Status.st_dev, - Status.st_nlink, Status.st_ino, Status.st_atime, - Status.st_mtime, Status.st_uid, Status.st_gid, - Status.st_size); + Status.st_nlink, Status.st_ino, + Status.st_atime, atime_nsec, Status.st_mtime, mtime_nsec, + Status.st_uid, Status.st_gid, Status.st_size); return std::error_code(); } diff --git a/lib/Support/VirtualFileSystem.cpp b/lib/Support/VirtualFileSystem.cpp index e8b0435b9cdf12fe5d43603ecea5adf23b8b3a24..24581944fa81ef82a22f25b14f42658fc8150b44 100644 --- a/lib/Support/VirtualFileSystem.cpp +++ b/lib/Support/VirtualFileSystem.cpp @@ -1164,14 +1164,14 @@ private: /// Looks up the path [Start, End) in \p From, possibly /// recursing into the contents of \p From if it is a directory. ErrorOr lookupPath(sys::path::const_iterator Start, - sys::path::const_iterator End, Entry *From); + sys::path::const_iterator End, Entry *From) const; /// Get the status of a given an \c Entry. ErrorOr status(const Twine &Path, Entry *E); public: /// Looks up \p Path in \c Roots. - ErrorOr lookupPath(const Twine &Path); + ErrorOr lookupPath(const Twine &Path) const; /// Parses \p Buffer, which is expected to be in YAML format and /// returns a virtual file system representing its contents. @@ -1183,6 +1183,9 @@ public: ErrorOr status(const Twine &Path) override; ErrorOr> openFileForRead(const Twine &Path) override; + std::error_code getRealPath(const Twine &Path, + SmallVectorImpl &Output) const override; + llvm::ErrorOr getCurrentWorkingDirectory() const override { return ExternalFS->getCurrentWorkingDirectory(); } @@ -1726,7 +1729,7 @@ RedirectingFileSystem::create(std::unique_ptr Buffer, return FS.release(); } -ErrorOr RedirectingFileSystem::lookupPath(const Twine &Path_) { +ErrorOr RedirectingFileSystem::lookupPath(const Twine &Path_) const { SmallString<256> Path; Path_.toVector(Path); @@ -1757,7 +1760,8 @@ ErrorOr RedirectingFileSystem::lookupPath(const Twine &Path_) { ErrorOr RedirectingFileSystem::lookupPath(sys::path::const_iterator Start, - sys::path::const_iterator End, Entry *From) { + sys::path::const_iterator End, + Entry *From) const { #ifndef _WIN32 assert(!isTraversalComponent(*Start) && !isTraversalComponent(From->getName()) && @@ -1890,6 +1894,27 @@ RedirectingFileSystem::openFileForRead(const Twine &Path) { llvm::make_unique(std::move(*Result), S)); } +std::error_code +RedirectingFileSystem::getRealPath(const Twine &Path, + SmallVectorImpl &Output) const { + ErrorOr Result = lookupPath(Path); + if (!Result) { + if (IsFallthrough && + Result.getError() == llvm::errc::no_such_file_or_directory) { + return ExternalFS->getRealPath(Path, Output); + } + return Result.getError(); + } + + if (auto *F = dyn_cast(*Result)) { + return ExternalFS->getRealPath(F->getExternalContentsPath(), Output); + } + // Even if there is a directory entry, fall back to ExternalFS if allowed, + // because directories don't have a single external contents path. + return IsFallthrough ? ExternalFS->getRealPath(Path, Output) + : llvm::errc::invalid_argument; +} + IntrusiveRefCntPtr vfs::getVFSFromYAML(std::unique_ptr Buffer, SourceMgr::DiagHandlerTy DiagHandler, diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc index 45d73ae3dfed29a98ec00444c05492e1354a5eef..d34aa763124c9aaced0f36459df1564979b44ba7 100644 --- a/lib/Support/Windows/Path.inc +++ b/lib/Support/Windows/Path.inc @@ -416,7 +416,7 @@ static std::error_code rename_internal(HANDLE FromHandle, const Twine &To, *reinterpret_cast(RenameInfoBuf.data()); RenameInfo.ReplaceIfExists = ReplaceIfExists; RenameInfo.RootDirectory = 0; - RenameInfo.FileNameLength = ToWide.size(); + RenameInfo.FileNameLength = ToWide.size() * sizeof(wchar_t); std::copy(ToWide.begin(), ToWide.end(), &RenameInfo.FileName[0]); SetLastError(ERROR_SUCCESS); @@ -1253,6 +1253,17 @@ static void expandTildeExpr(SmallVectorImpl &Path) { Path.insert(Path.begin() + 1, HomeDir.begin() + 1, HomeDir.end()); } +void expand_tilde(const Twine &path, SmallVectorImpl &dest) { + dest.clear(); + if (path.isTriviallyEmpty()) + return; + + path.toVector(dest); + expandTildeExpr(dest); + + return; +} + std::error_code real_path(const Twine &path, SmallVectorImpl &dest, bool expand_tilde) { dest.clear(); diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp index f8492c96bab69e681d4fb163c60dd07de09bda5d..b9bbee7883c61b5c9f29c72f0e55aefd6e1c71f6 100644 --- a/lib/Support/YAMLTraits.cpp +++ b/lib/Support/YAMLTraits.cpp @@ -341,11 +341,25 @@ void Input::scalarString(StringRef &S, QuotingType) { void Input::blockScalarString(StringRef &S) { scalarString(S, QuotingType::None); } +void Input::scalarTag(std::string &Tag) { + Tag = CurrentNode->_node->getVerbatimTag(); +} + void Input::setError(HNode *hnode, const Twine &message) { assert(hnode && "HNode must not be NULL"); setError(hnode->_node, message); } +NodeKind Input::getNodeKind() { + if (isa(CurrentNode)) + return NodeKind::Scalar; + else if (isa(CurrentNode)) + return NodeKind::Map; + else if (isa(CurrentNode)) + return NodeKind::Sequence; + llvm_unreachable("Unsupported node kind"); +} + void Input::setError(Node *node, const Twine &message) { Strm->printError(node, message); EC = make_error_code(errc::invalid_argument); @@ -436,9 +450,11 @@ bool Output::mapTag(StringRef Tag, bool Use) { // If this tag is being written inside a sequence we should write the start // of the sequence before writing the tag, otherwise the tag won't be // attached to the element in the sequence, but rather the sequence itself. - bool SequenceElement = - StateStack.size() > 1 && (StateStack[StateStack.size() - 2] == inSeq || - StateStack[StateStack.size() - 2] == inFlowSeq); + bool SequenceElement = false; + if (StateStack.size() > 1) { + auto &E = StateStack[StateStack.size() - 2]; + SequenceElement = inSeqAnyElement(E) || inFlowSeqAnyElement(E); + } if (SequenceElement && StateStack.back() == inMapFirstKey) { newLineCheck(); } else { @@ -461,6 +477,9 @@ bool Output::mapTag(StringRef Tag, bool Use) { } void Output::endMapping() { + // If we did not map anything, we should explicitly emit an empty map + if (StateStack.back() == inMapFirstKey) + output("{}"); StateStack.pop_back(); } @@ -524,12 +543,15 @@ void Output::endDocuments() { } unsigned Output::beginSequence() { - StateStack.push_back(inSeq); + StateStack.push_back(inSeqFirstElement); NeedsNewLine = true; return 0; } void Output::endSequence() { + // If we did not emit anything, we should explicitly emit an empty sequence + if (StateStack.back() == inSeqFirstElement) + output("[]"); StateStack.pop_back(); } @@ -538,10 +560,17 @@ bool Output::preflightElement(unsigned, void *&) { } void Output::postflightElement(void *) { + if (StateStack.back() == inSeqFirstElement) { + StateStack.pop_back(); + StateStack.push_back(inSeqOtherElement); + } else if (StateStack.back() == inFlowSeqFirstElement) { + StateStack.pop_back(); + StateStack.push_back(inFlowSeqOtherElement); + } } unsigned Output::beginFlowSequence() { - StateStack.push_back(inFlowSeq); + StateStack.push_back(inFlowSeqFirstElement); newLineCheck(); ColumnAtFlowStart = Column; output("[ "); @@ -680,6 +709,14 @@ void Output::blockScalarString(StringRef &S) { } } +void Output::scalarTag(std::string &Tag) { + if (Tag.empty()) + return; + newLineCheck(); + output(Tag); + output(" "); +} + void Output::setError(const Twine &message) { } @@ -693,7 +730,7 @@ bool Output::canElideEmptySequence() { return true; if (StateStack.back() != inMapFirstKey) return true; - return (StateStack[StateStack.size()-2] != inSeq); + return !inSeqAnyElement(StateStack[StateStack.size() - 2]); } void Output::output(StringRef s) { @@ -703,9 +740,8 @@ void Output::output(StringRef s) { void Output::outputUpToEndOfLine(StringRef s) { output(s); - if (StateStack.empty() || (StateStack.back() != inFlowSeq && - StateStack.back() != inFlowMapFirstKey && - StateStack.back() != inFlowMapOtherKey)) + if (StateStack.empty() || (!inFlowSeqAnyElement(StateStack.back()) && + !inFlowMapAnyKey(StateStack.back()))) NeedsNewLine = true; } @@ -725,16 +761,20 @@ void Output::newLineCheck() { outputNewLine(); - assert(StateStack.size() > 0); + if (StateStack.size() == 0) + return; + unsigned Indent = StateStack.size() - 1; bool OutputDash = false; - if (StateStack.back() == inSeq) { + if (StateStack.back() == inSeqFirstElement || + StateStack.back() == inSeqOtherElement) { OutputDash = true; - } else if ((StateStack.size() > 1) && ((StateStack.back() == inMapFirstKey) || - (StateStack.back() == inFlowSeq) || - (StateStack.back() == inFlowMapFirstKey)) && - (StateStack[StateStack.size() - 2] == inSeq)) { + } else if ((StateStack.size() > 1) && + ((StateStack.back() == inMapFirstKey) || + inFlowSeqAnyElement(StateStack.back()) || + (StateStack.back() == inFlowMapFirstKey)) && + inSeqAnyElement(StateStack[StateStack.size() - 2])) { --Indent; OutputDash = true; } @@ -772,6 +812,24 @@ void Output::flowKey(StringRef Key) { output(": "); } +NodeKind Output::getNodeKind() { report_fatal_error("invalid call"); } + +bool Output::inSeqAnyElement(InState State) { + return State == inSeqFirstElement || State == inSeqOtherElement; +} + +bool Output::inFlowSeqAnyElement(InState State) { + return State == inFlowSeqFirstElement || State == inFlowSeqOtherElement; +} + +bool Output::inMapAnyKey(InState State) { + return State == inMapFirstKey || State == inMapOtherKey; +} + +bool Output::inFlowMapAnyKey(InState State) { + return State == inFlowMapFirstKey || State == inFlowMapOtherKey; +} + //===----------------------------------------------------------------------===// // traits for built-in types //===----------------------------------------------------------------------===// diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp index 3a070162608971d2521790833386e2be5e4328d1..df4088ad2f867ec9787a9a20a39e17812a2f4464 100644 --- a/lib/TableGen/Main.cpp +++ b/lib/TableGen/Main.cpp @@ -46,6 +46,10 @@ static cl::list IncludeDirs("I", cl::desc("Directory of include files"), cl::value_desc("directory"), cl::Prefix); +static cl::list +MacroNames("D", cl::desc("Name of the macro to be defined"), + cl::value_desc("macro name"), cl::Prefix); + static int reportError(const char *ProgName, Twine Msg) { errs() << ProgName << ": " << Msg; errs().flush(); @@ -91,7 +95,7 @@ int llvm::TableGenMain(char *argv0, TableGenMainFn *MainFn) { // it later. SrcMgr.setIncludeDirs(IncludeDirs); - TGParser Parser(SrcMgr, Records); + TGParser Parser(SrcMgr, MacroNames, Records); if (Parser.ParseFile()) return 1; diff --git a/lib/TableGen/TGLexer.cpp b/lib/TableGen/TGLexer.cpp index 652be6e8dbbf3bb392e0b4c8ca5ceb4b3c7cf4bf..16aeee561075c54a2f103d8fc185305bdc1a55e6 100644 --- a/lib/TableGen/TGLexer.cpp +++ b/lib/TableGen/TGLexer.cpp @@ -19,6 +19,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/TableGen/Error.h" +#include #include #include #include @@ -28,11 +29,35 @@ using namespace llvm; -TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { +namespace { +// A list of supported preprocessing directives with their +// internal token kinds and names. +struct { + tgtok::TokKind Kind; + const char *Word; +} PreprocessorDirs[] = { + { tgtok::Ifdef, "ifdef" }, + { tgtok::Else, "else" }, + { tgtok::Endif, "endif" }, + { tgtok::Define, "define" } +}; +} // end anonymous namespace + +TGLexer::TGLexer(SourceMgr &SM, ArrayRef Macros) : SrcMgr(SM) { CurBuffer = SrcMgr.getMainFileID(); CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); CurPtr = CurBuf.begin(); TokStart = nullptr; + + // Pretend that we enter the "top-level" include file. + PrepIncludeStack.push_back( + make_unique>()); + + // Put all macros defined in the command line into the DefinedMacros set. + std::for_each(Macros.begin(), Macros.end(), + [this](const std::string &MacroName) { + DefinedMacros.insert(MacroName); + }); } SMLoc TGLexer::getLoc() const { @@ -41,11 +66,42 @@ SMLoc TGLexer::getLoc() const { /// ReturnError - Set the error to the specified string at the specified /// location. This is defined to always return tgtok::Error. -tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { +tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) { PrintError(Loc, Msg); return tgtok::Error; } +tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { + return ReturnError(SMLoc::getFromPointer(Loc), Msg); +} + +bool TGLexer::processEOF() { + SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); + if (ParentIncludeLoc != SMLoc()) { + // If prepExitInclude() detects a problem with the preprocessing + // control stack, it will return false. Pretend that we reached + // the final EOF and stop lexing more tokens by returning false + // to LexToken(). + if (!prepExitInclude(false)) + return false; + + CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); + CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); + CurPtr = ParentIncludeLoc.getPointer(); + // Make sure TokStart points into the parent file's buffer. + // LexToken() assigns to it before calling getNextChar(), + // so it is pointing into the included file now. + TokStart = CurPtr; + return true; + } + + // Pretend that we exit the "top-level" include file. + // Note that in case of an error (e.g. control stack imbalance) + // the routine will issue a fatal error. + prepExitInclude(true); + return false; +} + int TGLexer::getNextChar() { char CurChar = *CurPtr++; switch (CurChar) { @@ -57,16 +113,6 @@ int TGLexer::getNextChar() { if (CurPtr-1 != CurBuf.end()) return 0; // Just whitespace. - // If this is the end of an included file, pop the parent file off the - // include stack. - SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); - if (ParentIncludeLoc != SMLoc()) { - CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); - CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); - CurPtr = ParentIncludeLoc.getPointer(); - return getNextChar(); - } - // Otherwise, return end of file. --CurPtr; // Another call to lex will return EOF again. return EOF; @@ -83,11 +129,11 @@ int TGLexer::getNextChar() { } } -int TGLexer::peekNextChar(int Index) { +int TGLexer::peekNextChar(int Index) const { return *(CurPtr + Index); } -tgtok::TokKind TGLexer::LexToken() { +tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) { TokStart = CurPtr; // This always consumes at least one character. int CurChar = getNextChar(); @@ -100,7 +146,18 @@ tgtok::TokKind TGLexer::LexToken() { // Unknown character, emit an error. return ReturnError(TokStart, "Unexpected character"); - case EOF: return tgtok::Eof; + case EOF: + // Lex next token, if we just left an include file. + // Note that leaving an include file means that the next + // symbol is located at the end of 'include "..."' + // construct, so LexToken() is called with default + // false parameter. + if (processEOF()) + return LexToken(); + + // Return EOF denoting the end of lexing. + return tgtok::Eof; + case ':': return tgtok::colon; case ';': return tgtok::semi; case '.': return tgtok::period; @@ -114,15 +171,27 @@ tgtok::TokKind TGLexer::LexToken() { case ')': return tgtok::r_paren; case '=': return tgtok::equal; case '?': return tgtok::question; - case '#': return tgtok::paste; + case '#': + if (FileOrLineStart) { + tgtok::TokKind Kind = prepIsDirective(); + if (Kind != tgtok::Error) + return lexPreprocessor(Kind); + } + + return tgtok::paste; + + case '\r': + PrintFatalError("getNextChar() must never return '\r'"); + return tgtok::Error; case 0: case ' ': case '\t': - case '\n': - case '\r': // Ignore whitespace. - return LexToken(); + return LexToken(FileOrLineStart); + case '\n': + // Ignore whitespace, and identify the new line. + return LexToken(true); case '/': // If this is the start of a // comment, skip until the end of the line or // the end of the buffer. @@ -133,7 +202,7 @@ tgtok::TokKind TGLexer::LexToken() { return tgtok::Error; } else // Otherwise, this is an error. return ReturnError(TokStart, "Unexpected character"); - return LexToken(); + return LexToken(FileOrLineStart); case '-': case '+': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { @@ -249,10 +318,10 @@ tgtok::TokKind TGLexer::LexVarName() { } tgtok::TokKind TGLexer::LexIdentifier() { - // The first letter is [a-zA-Z_#]. + // The first letter is [a-zA-Z_]. const char *IdentStart = TokStart; - // Match the rest of the identifier regex: [0-9a-zA-Z_#]* + // Match the rest of the identifier regex: [0-9a-zA-Z_]* while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') ++CurPtr; @@ -322,6 +391,9 @@ bool TGLexer::LexInclude() { // Save the line number and lex buffer of the includer. CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); CurPtr = CurBuf.begin(); + + PrepIncludeStack.push_back( + make_unique>()); return false; } @@ -496,3 +568,444 @@ tgtok::TokKind TGLexer::LexExclaim() { return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); } + +bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) { + // Report an error, if preprocessor control stack for the current + // file is not empty. + if (!PrepIncludeStack.back()->empty()) { + prepReportPreprocessorStackError(); + + return false; + } + + // Pop the preprocessing controls from the include stack. + if (PrepIncludeStack.empty()) { + PrintFatalError("Preprocessor include stack is empty"); + } + + PrepIncludeStack.pop_back(); + + if (IncludeStackMustBeEmpty) { + if (!PrepIncludeStack.empty()) + PrintFatalError("Preprocessor include stack is not empty"); + } else { + if (PrepIncludeStack.empty()) + PrintFatalError("Preprocessor include stack is empty"); + } + + return true; +} + +tgtok::TokKind TGLexer::prepIsDirective() const { + for (unsigned ID = 0; ID < llvm::array_lengthof(PreprocessorDirs); ++ID) { + int NextChar = *CurPtr; + bool Match = true; + unsigned I = 0; + for (; I < strlen(PreprocessorDirs[ID].Word); ++I) { + if (NextChar != PreprocessorDirs[ID].Word[I]) { + Match = false; + break; + } + + NextChar = peekNextChar(I + 1); + } + + // Check for whitespace after the directive. If there is no whitespace, + // then we do not recognize it as a preprocessing directive. + if (Match) { + tgtok::TokKind Kind = PreprocessorDirs[ID].Kind; + + // New line and EOF may follow only #else/#endif. It will be reported + // as an error for #ifdef/#define after the call to prepLexMacroName(). + if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF || + NextChar == '\n' || + // It looks like TableGen does not support '\r' as the actual + // carriage return, e.g. getNextChar() treats a single '\r' + // as '\n'. So we do the same here. + NextChar == '\r') + return Kind; + + // Allow comments after some directives, e.g.: + // #else// OR #else/**/ + // #endif// OR #endif/**/ + // + // Note that we do allow comments after #ifdef/#define here, e.g. + // #ifdef/**/ AND #ifdef// + // #define/**/ AND #define// + // + // These cases will be reported as incorrect after calling + // prepLexMacroName(). We could have supported C-style comments + // after #ifdef/#define, but this would complicate the code + // for little benefit. + if (NextChar == '/') { + NextChar = peekNextChar(I + 1); + + if (NextChar == '*' || NextChar == '/') + return Kind; + + // Pretend that we do not recognize the directive. + } + } + } + + return tgtok::Error; +} + +bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) { + TokStart = CurPtr; + + for (unsigned ID = 0; ID < llvm::array_lengthof(PreprocessorDirs); ++ID) + if (PreprocessorDirs[ID].Kind == Kind) { + // Advance CurPtr to the end of the preprocessing word. + CurPtr += strlen(PreprocessorDirs[ID].Word); + return true; + } + + PrintFatalError("Unsupported preprocessing token in " + "prepEatPreprocessorDirective()"); + return false; +} + +tgtok::TokKind TGLexer::lexPreprocessor( + tgtok::TokKind Kind, bool ReturnNextLiveToken) { + + // We must be looking at a preprocessing directive. Eat it! + if (!prepEatPreprocessorDirective(Kind)) + PrintFatalError("lexPreprocessor() called for unknown " + "preprocessor directive"); + + if (Kind == tgtok::Ifdef) { + StringRef MacroName = prepLexMacroName(); + if (MacroName.empty()) + return ReturnError(TokStart, "Expected macro name after #ifdef"); + + bool MacroIsDefined = DefinedMacros.count(MacroName) != 0; + + // Regardless of whether we are processing tokens or not, + // we put the #ifdef control on stack. + PrepIncludeStack.back()->push_back( + {Kind, MacroIsDefined, SMLoc::getFromPointer(TokStart)}); + + if (!prepSkipDirectiveEnd()) + return ReturnError(CurPtr, + "Only comments are supported after #ifdef NAME"); + + // If we were not processing tokens before this #ifdef, + // then just return back to the lines skipping code. + if (!ReturnNextLiveToken) + return Kind; + + // If we were processing tokens before this #ifdef, + // and the macro is defined, then just return the next token. + if (MacroIsDefined) + return LexToken(); + + // We were processing tokens before this #ifdef, and the macro + // is not defined, so we have to start skipping the lines. + // If the skipping is successful, it will return the token following + // either #else or #endif corresponding to this #ifdef. + if (prepSkipRegion(ReturnNextLiveToken)) + return LexToken(); + + return tgtok::Error; + } else if (Kind == tgtok::Else) { + // Check if this #else is correct before calling prepSkipDirectiveEnd(), + // which will move CurPtr away from the beginning of #else. + if (PrepIncludeStack.back()->empty()) + return ReturnError(TokStart, "#else without #ifdef"); + + PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back(); + + if (IfdefEntry.Kind != tgtok::Ifdef) { + PrintError(TokStart, "double #else"); + return ReturnError(IfdefEntry.SrcPos, "Previous #else is here"); + } + + // Replace the corresponding #ifdef's control with its negation + // on the control stack. + PrepIncludeStack.back()->pop_back(); + PrepIncludeStack.back()->push_back( + {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)}); + + if (!prepSkipDirectiveEnd()) + return ReturnError(CurPtr, "Only comments are supported after #else"); + + // If we were processing tokens before this #else, + // we have to start skipping lines until the matching #endif. + if (ReturnNextLiveToken) { + if (prepSkipRegion(ReturnNextLiveToken)) + return LexToken(); + + return tgtok::Error; + } + + // Return to the lines skipping code. + return Kind; + } else if (Kind == tgtok::Endif) { + // Check if this #endif is correct before calling prepSkipDirectiveEnd(), + // which will move CurPtr away from the beginning of #endif. + if (PrepIncludeStack.back()->empty()) + return ReturnError(TokStart, "#endif without #ifdef"); + + auto &IfdefOrElseEntry = PrepIncludeStack.back()->back(); + + if (IfdefOrElseEntry.Kind != tgtok::Ifdef && + IfdefOrElseEntry.Kind != tgtok::Else) { + PrintFatalError("Invalid preprocessor control on the stack"); + return tgtok::Error; + } + + if (!prepSkipDirectiveEnd()) + return ReturnError(CurPtr, "Only comments are supported after #endif"); + + PrepIncludeStack.back()->pop_back(); + + // If we were processing tokens before this #endif, then + // we should continue it. + if (ReturnNextLiveToken) { + return LexToken(); + } + + // Return to the lines skipping code. + return Kind; + } else if (Kind == tgtok::Define) { + StringRef MacroName = prepLexMacroName(); + if (MacroName.empty()) + return ReturnError(TokStart, "Expected macro name after #define"); + + if (!DefinedMacros.insert(MacroName).second) + PrintWarning(getLoc(), + "Duplicate definition of macro: " + Twine(MacroName)); + + if (!prepSkipDirectiveEnd()) + return ReturnError(CurPtr, + "Only comments are supported after #define NAME"); + + if (!ReturnNextLiveToken) { + PrintFatalError("#define must be ignored during the lines skipping"); + return tgtok::Error; + } + + return LexToken(); + } + + PrintFatalError("Preprocessing directive is not supported"); + return tgtok::Error; +} + +bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) { + if (!MustNeverBeFalse) + PrintFatalError("Invalid recursion."); + + do { + // Skip all symbols to the line end. + prepSkipToLineEnd(); + + // Find the first non-whitespace symbol in the next line(s). + if (!prepSkipLineBegin()) + return false; + + // If the first non-blank/comment symbol on the line is '#', + // it may be a start of preprocessing directive. + // + // If it is not '#' just go to the next line. + if (*CurPtr == '#') + ++CurPtr; + else + continue; + + tgtok::TokKind Kind = prepIsDirective(); + + // If we did not find a preprocessing directive or it is #define, + // then just skip to the next line. We do not have to do anything + // for #define in the line-skipping mode. + if (Kind == tgtok::Error || Kind == tgtok::Define) + continue; + + tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false); + + // If lexPreprocessor() encountered an error during lexing this + // preprocessor idiom, then return false to the calling lexPreprocessor(). + // This will force tgtok::Error to be returned to the tokens processing. + if (ProcessedKind == tgtok::Error) + return false; + + if (Kind != ProcessedKind) + PrintFatalError("prepIsDirective() and lexPreprocessor() " + "returned different token kinds"); + + // If this preprocessing directive enables tokens processing, + // then return to the lexPreprocessor() and get to the next token. + // We can move from line-skipping mode to processing tokens only + // due to #else or #endif. + if (prepIsProcessingEnabled()) { + if (Kind != tgtok::Else && Kind != tgtok::Endif) { + PrintFatalError("Tokens processing was enabled by an unexpected " + "preprocessing directive"); + return false; + } + + return true; + } + } while (CurPtr != CurBuf.end()); + + // We have reached the end of the file, but never left the lines-skipping + // mode. This means there is no matching #endif. + prepReportPreprocessorStackError(); + return false; +} + +StringRef TGLexer::prepLexMacroName() { + // Skip whitespaces between the preprocessing directive and the macro name. + while (*CurPtr == ' ' || *CurPtr == '\t') + ++CurPtr; + + TokStart = CurPtr; + // Macro names start with [a-zA-Z_]. + if (*CurPtr != '_' && !isalpha(*CurPtr)) + return ""; + + // Match the rest of the identifier regex: [0-9a-zA-Z_]* + while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') + ++CurPtr; + + return StringRef(TokStart, CurPtr - TokStart); +} + +bool TGLexer::prepSkipLineBegin() { + while (CurPtr != CurBuf.end()) { + switch (*CurPtr) { + case ' ': + case '\t': + case '\n': + case '\r': + break; + + case '/': { + int NextChar = peekNextChar(1); + if (NextChar == '*') { + // Skip C-style comment. + // Note that we do not care about skipping the C++-style comments. + // If the line contains "//", it may not contain any processable + // preprocessing directive. Just return CurPtr pointing to + // the first '/' in this case. We also do not care about + // incorrect symbols after the first '/' - we are in lines-skipping + // mode, so incorrect code is allowed to some extent. + + // Set TokStart to the beginning of the comment to enable proper + // diagnostic printing in case of error in SkipCComment(). + TokStart = CurPtr; + + // CurPtr must point to '*' before call to SkipCComment(). + ++CurPtr; + if (SkipCComment()) + return false; + } else { + // CurPtr points to the non-whitespace '/'. + return true; + } + + // We must not increment CurPtr after the comment was lexed. + continue; + } + + default: + return true; + } + + ++CurPtr; + } + + // We have reached the end of the file. Return to the lines skipping + // code, and allow it to handle the EOF as needed. + return true; +} + +bool TGLexer::prepSkipDirectiveEnd() { + while (CurPtr != CurBuf.end()) { + switch (*CurPtr) { + case ' ': + case '\t': + break; + + case '\n': + case '\r': + return true; + + case '/': { + int NextChar = peekNextChar(1); + if (NextChar == '/') { + // Skip C++-style comment. + // We may just return true now, but let's skip to the line/buffer end + // to simplify the method specification. + ++CurPtr; + SkipBCPLComment(); + } else if (NextChar == '*') { + // When we are skipping C-style comment at the end of a preprocessing + // directive, we can skip several lines. If any meaningful TD token + // follows the end of the C-style comment on the same line, it will + // be considered as an invalid usage of TD token. + // For example, we want to forbid usages like this one: + // #define MACRO class Class {} + // But with C-style comments we also disallow the following: + // #define MACRO /* This macro is used + // to ... */ class Class {} + // One can argue that this should be allowed, but it does not seem + // to be worth of the complication. Moreover, this matches + // the C preprocessor behavior. + + // Set TokStart to the beginning of the comment to enable proper + // diagnostic printer in case of error in SkipCComment(). + TokStart = CurPtr; + ++CurPtr; + if (SkipCComment()) + return false; + } else { + TokStart = CurPtr; + PrintError(CurPtr, "Unexpected character"); + return false; + } + + // We must not increment CurPtr after the comment was lexed. + continue; + } + + default: + // Do not allow any non-whitespaces after the directive. + TokStart = CurPtr; + return false; + } + + ++CurPtr; + } + + return true; +} + +void TGLexer::prepSkipToLineEnd() { + while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) + ++CurPtr; +} + +bool TGLexer::prepIsProcessingEnabled() { + for (auto I = PrepIncludeStack.back()->rbegin(), + E = PrepIncludeStack.back()->rend(); + I != E; ++I) { + if (!I->IsDefined) + return false; + } + + return true; +} + +void TGLexer::prepReportPreprocessorStackError() { + if (PrepIncludeStack.back()->empty()) + PrintFatalError("prepReportPreprocessorStackError() called with " + "empty control stack"); + + auto &PrepControl = PrepIncludeStack.back()->back(); + PrintError(CurBuf.end(), "Reached EOF without matching #endif"); + PrintError(PrepControl.SrcPos, "The latest preprocessor control is here"); + + TokStart = CurPtr; +} diff --git a/lib/TableGen/TGLexer.h b/lib/TableGen/TGLexer.h index 2c80743e3a6840068a6636ecb6e9f09ea520ef44..e9980b36b97bb0bfd29a643a6b9289bcbb4aff1f 100644 --- a/lib/TableGen/TGLexer.h +++ b/lib/TableGen/TGLexer.h @@ -14,11 +14,14 @@ #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H #define LLVM_LIB_TABLEGEN_TGLEXER_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/SMLoc.h" #include #include +#include #include namespace llvm { @@ -59,7 +62,11 @@ namespace tgtok { BinaryIntVal, // String valued tokens. - Id, StrVal, VarName, CodeFragment + Id, StrVal, VarName, CodeFragment, + + // Preprocessing tokens for internal usage by the lexer. + // They are never returned as a result of Lex(). + Ifdef, Else, Endif, Define }; } @@ -87,10 +94,10 @@ private: DependenciesMapTy Dependencies; public: - TGLexer(SourceMgr &SrcMgr); + TGLexer(SourceMgr &SrcMgr, ArrayRef Macros); tgtok::TokKind Lex() { - return CurCode = LexToken(); + return CurCode = LexToken(CurPtr == CurBuf.begin()); } const DependenciesMapTy &getDependencies() const { @@ -119,12 +126,13 @@ public: private: /// LexToken - Read the next token and return its code. - tgtok::TokKind LexToken(); + tgtok::TokKind LexToken(bool FileOrLineStart = false); + tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); int getNextChar(); - int peekNextChar(int Index); + int peekNextChar(int Index) const; void SkipBCPLComment(); bool SkipCComment(); tgtok::TokKind LexIdentifier(); @@ -134,6 +142,231 @@ private: tgtok::TokKind LexNumber(); tgtok::TokKind LexBracket(); tgtok::TokKind LexExclaim(); + + // Process EOF encountered in LexToken(). + // If EOF is met in an include file, then the method will update + // CurPtr, CurBuf and preprocessing include stack, and return true. + // If EOF is met in the top-level file, then the method will + // update and check the preprocessing include stack, and return false. + bool processEOF(); + + // *** Structures and methods for preprocessing support *** + + // A set of macro names that are defined either via command line or + // by using: + // #define NAME + StringSet<> DefinedMacros; + + // Each of #ifdef and #else directives has a descriptor associated + // with it. + // + // An ordered list of preprocessing controls defined by #ifdef/#else + // directives that are in effect currently is called preprocessing + // control stack. It is represented as a vector of PreprocessorControlDesc's. + // + // The control stack is updated according to the following rules: + // + // For each #ifdef we add an element to the control stack. + // For each #else we replace the top element with a descriptor + // with an inverted IsDefined value. + // For each #endif we pop the top element from the control stack. + // + // When CurPtr reaches the current buffer's end, the control stack + // must be empty, i.e. #ifdef and the corresponding #endif + // must be located in the same file. + struct PreprocessorControlDesc { + // Either tgtok::Ifdef or tgtok::Else. + tgtok::TokKind Kind; + + // True, if the condition for this directive is true, false - otherwise. + // Examples: + // #ifdef NAME : true, if NAME is defined, false - otherwise. + // ... + // #else : false, if NAME is defined, true - otherwise. + bool IsDefined; + + // Pointer into CurBuf to the beginning of the preprocessing directive + // word, e.g.: + // #ifdef NAME + // ^ - SrcPos + SMLoc SrcPos; + }; + + // We want to disallow code like this: + // file1.td: + // #define NAME + // #ifdef NAME + // include "file2.td" + // EOF + // file2.td: + // #endif + // EOF + // + // To do this, we clear the preprocessing control stack on entry + // to each of the included file. PrepIncludeStack is used to store + // preprocessing control stacks for the current file and all its + // parent files. The back() element is the preprocessing control + // stack for the current file. + std::vector>> + PrepIncludeStack; + + // Validate that the current preprocessing control stack is empty, + // since we are about to exit a file, and pop the include stack. + // + // If IncludeStackMustBeEmpty is true, the include stack must be empty + // after the popping, otherwise, the include stack must not be empty + // after the popping. Basically, the include stack must be empty + // only if we exit the "top-level" file (i.e. finish lexing). + // + // The method returns false, if the current preprocessing control stack + // is not empty (e.g. there is an unterminated #ifdef/#else), + // true - otherwise. + bool prepExitInclude(bool IncludeStackMustBeEmpty); + + // Look ahead for a preprocessing directive starting from CurPtr. The caller + // must only call this method, if *(CurPtr - 1) is '#'. If the method matches + // a preprocessing directive word followed by a whitespace, then it returns + // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. + // + // CurPtr is not adjusted by this method. + tgtok::TokKind prepIsDirective() const; + + // Given a preprocessing token kind, adjusts CurPtr to the end + // of the preprocessing directive word. Returns true, unless + // an unsupported token kind is passed in. + // + // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() + // to avoid adjusting CurPtr before we are sure that '#' is followed + // by a preprocessing directive. If it is not, then we fall back to + // tgtok::paste interpretation of '#'. + bool prepEatPreprocessorDirective(tgtok::TokKind Kind); + + // The main "exit" point from the token parsing to preprocessor. + // + // The method is called for CurPtr, when prepIsDirective() returns + // true. The first parameter matches the result of prepIsDirective(), + // denoting the actual preprocessor directive to be processed. + // + // If the preprocessing directive disables the tokens processing, e.g.: + // #ifdef NAME // NAME is undefined + // then lexPreprocessor() enters the lines-skipping mode. + // In this mode, it does not parse any tokens, because the code under + // the #ifdef may not even be a correct tablegen code. The preprocessor + // looks for lines containing other preprocessing directives, which + // may be prepended with whitespaces and C-style comments. If the line + // does not contain a preprocessing directive, it is skipped completely. + // Otherwise, the preprocessing directive is processed by recursively + // calling lexPreprocessor(). The processing of the encountered + // preprocessing directives includes updating preprocessing control stack + // and adding new macros into DefinedMacros set. + // + // The second parameter controls whether lexPreprocessor() is called from + // LexToken() (true) or recursively from lexPreprocessor() (false). + // + // If ReturnNextLiveToken is true, the method returns the next + // LEX token following the current directive or following the end + // of the disabled preprocessing region corresponding to this directive. + // If ReturnNextLiveToken is false, the method returns the first parameter, + // unless there were errors encountered in the disabled preprocessing + // region - in this case, it returns tgtok::Error. + tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, + bool ReturnNextLiveToken = true); + + // Worker method for lexPreprocessor() to skip lines after some + // preprocessing directive up to the buffer end or to the directive + // that re-enables token processing. The method returns true + // upon processing the next directive that re-enables tokens + // processing. False is returned if an error was encountered. + // + // Note that prepSkipRegion() calls lexPreprocessor() to process + // encountered preprocessing directives. In this case, the second + // parameter to lexPreprocessor() is set to false. Being passed + // false ReturnNextLiveToken, lexPreprocessor() must never call + // prepSkipRegion(). We assert this by passing ReturnNextLiveToken + // to prepSkipRegion() and checking that it is never set to false. + bool prepSkipRegion(bool MustNeverBeFalse); + + // Lex name of the macro after either #ifdef or #define. We could have used + // LexIdentifier(), but it has special handling of "include" word, which + // could result in awkward diagnostic errors. Consider: + // ---- + // #ifdef include + // class ... + // ---- + // LexIdentifier() will engage LexInclude(), which will complain about + // missing file with name "class". Instead, prepLexMacroName() will treat + // "include" as a normal macro name. + // + // On entry, CurPtr points to the end of a preprocessing directive word. + // The method allows for whitespaces between the preprocessing directive + // and the macro name. The allowed whitespaces are ' ' and '\t'. + // + // If the first non-whitespace symbol after the preprocessing directive + // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then + // the method updates TokStart to the position of the first non-whitespace + // symbol, sets CurPtr to the position of the macro name's last symbol, + // and returns a string reference to the macro name. Otherwise, + // TokStart is set to the first non-whitespace symbol after the preprocessing + // directive, and the method returns an empty string reference. + // + // In all cases, TokStart may be used to point to the word following + // the preprocessing directive. + StringRef prepLexMacroName(); + + // Skip any whitespaces starting from CurPtr. The method is used + // only in the lines-skipping mode to find the first non-whitespace + // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' + // and '\r'. The method skips C-style comments as well, because + // it is used to find the beginning of the preprocessing directive. + // If we do not handle C-style comments the following code would + // result in incorrect detection of a preprocessing directive: + // /* + // #ifdef NAME + // */ + // As long as we skip C-style comments, the following code is correctly + // recognized as a preprocessing directive: + // /* first line comment + // second line comment */ #ifdef NAME + // + // The method returns true upon reaching the first non-whitespace symbol + // or EOF, CurPtr is set to point to this symbol. The method returns false, + // if an error occured during skipping of a C-style comment. + bool prepSkipLineBegin(); + + // Skip any whitespaces or comments after a preprocessing directive. + // The method returns true upon reaching either end of the line + // or end of the file. If there is a multiline C-style comment + // after the preprocessing directive, the method skips + // the comment, so the final CurPtr may point to one of the next lines. + // The method returns false, if an error occured during skipping + // C- or C++-style comment, or a non-whitespace symbol appears + // after the preprocessing directive. + // + // The method maybe called both during lines-skipping and tokens + // processing. It actually verifies that only whitespaces or/and + // comments follow a preprocessing directive. + // + // After the execution of this mehod, CurPtr points either to new line + // symbol, buffer end or non-whitespace symbol following the preprocesing + // directive. + bool prepSkipDirectiveEnd(); + + // Skip all symbols to the end of the line/file. + // The method adjusts CurPtr, so that it points to either new line + // symbol in the current line or the buffer end. + void prepSkipToLineEnd(); + + // Return true, if the current preprocessor control stack is such that + // we should allow lexer to process the next token, false - otherwise. + // + // In particular, the method returns true, if all the #ifdef/#else + // controls on the stack have their IsDefined member set to true. + bool prepIsProcessingEnabled(); + + // Report an error, if we reach EOF with non-empty preprocessing control + // stack. This means there is no matching #endif for the previous + // #ifdef/#else. + void prepReportPreprocessorStackError(); }; } // end namespace llvm diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h index 0a28b3a03aa19cc90b5e4d82a3750eecbc59d468..e3849043513b58106ecb71d8d4768d1121a2e20d 100644 --- a/lib/TableGen/TGParser.h +++ b/lib/TableGen/TGParser.h @@ -115,8 +115,9 @@ class TGParser { }; public: - TGParser(SourceMgr &SrcMgr, RecordKeeper &records) - : Lex(SrcMgr), CurMultiClass(nullptr), Records(records) {} + TGParser(SourceMgr &SrcMgr, ArrayRef Macros, + RecordKeeper &records) + : Lex(SrcMgr, Macros), CurMultiClass(nullptr), Records(records) {} /// ParseFile - Main entrypoint for parsing a tblgen file. These parser /// routines return true on error, or false on success. diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 2f0d0bf346d6db04127fc2857bf25c2c7d3f72a3..c36d9354f3ba6f234fab670da883f96ec374a37d 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -39,6 +39,7 @@ FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, CodeGenOpt::Level OptLevel); FunctionPass *createAArch64StorePairSuppressPass(); FunctionPass *createAArch64ExpandPseudoPass(); +FunctionPass *createAArch64SpeculationHardeningPass(); FunctionPass *createAArch64LoadStoreOptimizationPass(); FunctionPass *createAArch64SIMDInstrOptPass(); ModulePass *createAArch64PromoteConstantPass(); @@ -68,6 +69,7 @@ void initializeAArch64ConditionalComparesPass(PassRegistry&); void initializeAArch64ConditionOptimizerPass(PassRegistry&); void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); void initializeAArch64ExpandPseudoPass(PassRegistry&); +void initializeAArch64SpeculationHardeningPass(PassRegistry&); void initializeAArch64LoadStoreOptPass(PassRegistry&); void initializeAArch64SIMDInstrOptPass(PassRegistry&); void initializeAArch64PreLegalizerCombinerPass(PassRegistry&); diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 9d596a1821c8de72fb6b0c54ff67c8c74a3aaed7..a13f6c848816f79c29ac3b5301d29cd57b464dda 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -65,6 +65,18 @@ def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true", def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true", "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">; +def FeaturePAN : SubtargetFeature< + "pan", "HasPAN", "true", + "Enables ARM v8.1 Privileged Access-Never extension">; + +def FeatureLOR : SubtargetFeature< + "lor", "HasLOR", "true", + "Enables ARM v8.1 Limited Ordering Regions extension">; + +def FeatureVH : SubtargetFeature< + "vh", "HasVH", "true", + "Enables ARM v8.1 Virtual Host extension">; + def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", "Enable ARMv8 PMUv3 Performance Monitors extension">; @@ -77,6 +89,18 @@ def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true", def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", "Enable Statistical Profiling extension">; +def FeaturePAN_RWV : SubtargetFeature< + "pan-rwv", "HasPAN_RWV", "true", + "Enable v8.2 PAN s1e1R and s1e1W Variants", + [FeaturePAN]>; + +// UAO PState +def FeaturePsUAO : SubtargetFeature< "uaops", "HasPsUAO", "true", + "Enable v8.2 UAO PState">; + +def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP", + "true", "Enable v8.2 data Cache Clean to Point of Persistence" >; + def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", "Enable Scalable Vector Extension (SVE) instructions">; @@ -125,11 +149,11 @@ def FeaturePredictableSelectIsExpensive : SubtargetFeature< def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move", "CustomAsCheapAsMove", "true", - "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">; + "Use custom handling of cheap instructions">; def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move", "ExynosAsCheapAsMove", "true", - "Use Exynos specific code in TargetInstrInfo::isAsCheapAsAMove()", + "Use Exynos specific handling of cheap instructions", [FeatureCustomCheapAsMoveHandling]>; def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler", @@ -195,6 +219,66 @@ def FeatureDotProd : SubtargetFeature< "dotprod", "HasDotProd", "true", "Enable dot product support">; +def FeaturePA : SubtargetFeature< + "pa", "HasPA", "true", + "Enable v8.3-A Pointer Authentication enchancement">; + +def FeatureJS : SubtargetFeature< + "jsconv", "HasJS", "true", + "Enable v8.3-A JavaScript FP conversion enchancement", + [FeatureFPARMv8]>; + +def FeatureCCIDX : SubtargetFeature< + "ccidx", "HasCCIDX", "true", + "Enable v8.3-A Extend of the CCSIDR number of sets">; + +def FeatureComplxNum : SubtargetFeature< + "complxnum", "HasComplxNum", "true", + "Enable v8.3-A Floating-point complex number support", + [FeatureNEON]>; + +def FeatureNV : SubtargetFeature< + "nv", "HasNV", "true", + "Enable v8.4-A Nested Virtualization Enchancement">; + +def FeatureRASv8_4 : SubtargetFeature< + "rasv8_4", "HasRASv8_4", "true", + "Enable v8.4-A Reliability, Availability and Serviceability extension", + [FeatureRAS]>; + +def FeatureMPAM : SubtargetFeature< + "mpam", "HasMPAM", "true", + "Enable v8.4-A Memory system Partitioning and Monitoring extension">; + +def FeatureDIT : SubtargetFeature< + "dit", "HasDIT", "true", + "Enable v8.4-A Data Independent Timing instructions">; + +def FeatureTRACEV8_4 : SubtargetFeature< + "tracev8.4", "HasTRACEV8_4", "true", + "Enable v8.4-A Trace extension">; + +def FeatureAM : SubtargetFeature< + "am", "HasAM", "true", + "Enable v8.4-A Activity Monitors extension">; + +def FeatureSEL2 : SubtargetFeature< + "sel2", "HasSEL2", "true", + "Enable v8.4-A Secure Exception Level 2 extension">; + +def FeatureTLB_RMI : SubtargetFeature< + "tlb-rmi", "HasTLB_RMI", "true", + "Enable v8.4-A TLB Range and Maintenance Instructions">; + +def FeatureFMI : SubtargetFeature< + "fmi", "HasFMI", "true", + "Enable v8.4-A Flag Manipulation Instructions">; + +// 8.4 RCPC enchancements: LDAPR & STLR instructions with Immediate Offset +def FeatureRCPC_IMMO : SubtargetFeature<"rcpc-immo", "HasRCPC_IMMO", "true", + "Enable v8.4-A RCPC instructions with Immediate Offsets", + [FeatureRCPC]>; + def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", "NegativeImmediates", "false", "Convert immediates and instructions " @@ -222,6 +306,9 @@ def FeatureFRInt3264 : SubtargetFeature<"fptoint", "HasFRInt3264", "true", def FeatureSpecRestrict : SubtargetFeature<"specrestrict", "HasSpecRestrict", "true", "Enable architectural speculation restriction" >; +def FeatureSSBS : SubtargetFeature<"ssbs", "HasSSBS", + "true", "Enable Speculative Store Bypass Safe bit" >; + def FeatureSpecCtrl : SubtargetFeature<"specctrl", "HasSpecCtrl", "true", "Enable speculation control barrier" >; @@ -229,7 +316,7 @@ def FeaturePredCtrl : SubtargetFeature<"predctrl", "HasPredCtrl", "true", "Enable execution and data prediction invalidation instructions" >; def FeatureCacheDeepPersist : SubtargetFeature<"ccdp", "HasCCDP", - "true", "Enable Cache Clean to Point of Deep Persistence" >; + "true", "Enable v8.5 Cache Clean to Point of Deep Persistence" >; def FeatureBranchTargetId : SubtargetFeature<"bti", "HasBTI", "true", "Enable Branch Target Identification" >; @@ -245,21 +332,27 @@ def FeatureMTE : SubtargetFeature<"mte", "HasMTE", // def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", - "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM]>; + "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM, + FeaturePAN, FeatureLOR, FeatureVH]>; def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", - "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>; + "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO, + FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>; def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true", - "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC]>; + "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC, FeaturePA, + FeatureJS, FeatureCCIDX, FeatureComplxNum]>; def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true", - "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd]>; + "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd, + FeatureNV, FeatureRASv8_4, FeatureMPAM, FeatureDIT, + FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI, + FeatureFMI, FeatureRCPC_IMMO]>; def HasV8_5aOps : SubtargetFeature< "v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions", [HasV8_4aOps, FeatureAltFPCmp, FeatureFRInt3264, FeatureSpecRestrict, - FeatureSpecCtrl, FeaturePredCtrl, FeatureCacheDeepPersist, + FeatureSSBS, FeatureSpecCtrl, FeaturePredCtrl, FeatureCacheDeepPersist, FeatureBranchTargetId] >; @@ -277,6 +370,8 @@ include "AArch64CallingConvention.td" include "AArch64Schedule.td" include "AArch64InstrInfo.td" +include "AArch64SchedPredicates.td" +include "AArch64SchedPredExynos.td" def AArch64InstrInfo : InstrInfo; @@ -546,6 +641,21 @@ def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily", FeaturePredictableSelectIsExpensive, FeatureNEON]>; +def ProcTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110", + "HiSilicon TS-V110 processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureSPE, + FeatureFullFP16, + FeatureFP16FML, + FeatureDotProd]>; + def : ProcessorModel<"generic", NoSchedModel, [ FeatureFPARMv8, FeatureFuseAES, @@ -578,6 +688,8 @@ def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>; def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>; // Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan. def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>; +// FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57. +def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>; //===----------------------------------------------------------------------===// // Assembly parser @@ -626,3 +738,9 @@ def AArch64 : Target { let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; let AllowRegisterRenaming = 1; } + +//===----------------------------------------------------------------------===// +// Pfm Counters +//===----------------------------------------------------------------------===// + +include "AArch64PfmCounters.td" diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index 1ff0392c0f2692c599dcb7aee580228c95907ff8..828fbb2de7ee7600e9349bfbacd7fe8a6207ac84 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/COFF.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -109,12 +110,33 @@ public: AU.setPreservesAll(); } - bool runOnMachineFunction(MachineFunction &F) override { - AArch64FI = F.getInfo(); - STI = static_cast(&F.getSubtarget()); - bool Result = AsmPrinter::runOnMachineFunction(F); + bool runOnMachineFunction(MachineFunction &MF) override { + AArch64FI = MF.getInfo(); + STI = static_cast(&MF.getSubtarget()); + + SetupMachineFunction(MF); + + if (STI->isTargetCOFF()) { + bool Internal = MF.getFunction().hasInternalLinkage(); + COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC + : COFF::IMAGE_SYM_CLASS_EXTERNAL; + int Type = + COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; + + OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); + OutStreamer->EmitCOFFSymbolStorageClass(Scl); + OutStreamer->EmitCOFFSymbolType(Type); + OutStreamer->EndCOFFSymbolDef(); + } + + // Emit the rest of the function body. + EmitFunctionBody(); + + // Emit the XRay table for this function. emitXRayTable(); - return Result; + + // We didn't modify anything. + return false; } private: @@ -217,7 +239,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { // linker can safely perform dead code stripping. Since LLVM never // generates code that does this, it is always safe to set. OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); - SM.serializeToStackMapSection(); + emitStackMaps(SM); } } diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 2f6cb4c8670a2883673a1ada454613af69348127..5db941e9dac72b651ded275d04ec4b91bf31b01b 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -123,7 +123,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[ // Vararg functions on windows pass floats in integer registers def CC_AArch64_Win64_VarArg : CallingConv<[ - CCIfType<[f16, f32], CCPromoteToType>, + CCIfType<[f16, f32], CCPromoteToType>, CCIfType<[f64], CCBitConvertToType>, CCDelegateTo ]>; diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index dfc08a12f51343b4524f6978fa111362d7050015..47550cabb9f0dac8a899c02d2af7f42715e0a2b8 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -2016,8 +2016,9 @@ bool AArch64FastISel::selectLoad(const Instruction *I) { if (RetVT == MVT::i64 && VT <= MVT::i32) { if (WantZExt) { // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG). - std::prev(FuncInfo.InsertPt)->eraseFromParent(); - ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg(); + MachineBasicBlock::iterator I(std::prev(FuncInfo.InsertPt)); + ResultReg = std::prev(I)->getOperand(0).getReg(); + removeDeadCode(I, std::next(I)); } else ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg, /*IsKill=*/true, @@ -2038,7 +2039,8 @@ bool AArch64FastISel::selectLoad(const Instruction *I) { break; } } - MI->eraseFromParent(); + MachineBasicBlock::iterator I(MI); + removeDeadCode(I, std::next(I)); MI = nullptr; if (Reg) MI = MRI.getUniqueVRegDef(Reg); @@ -2256,6 +2258,13 @@ static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { /// Try to emit a combined compare-and-branch instruction. bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { + // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions + // will not be produced, as they are conditional branch instructions that do + // not set flags. + if (FuncInfo.MF->getFunction().hasFnAttribute( + Attribute::SpeculativeLoadHardening)) + return false; + assert(isa(BI->getCondition()) && "Expected cmp instruction"); const CmpInst *CI = cast(BI->getCondition()); CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); @@ -4508,7 +4517,8 @@ bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT, MI->getOperand(1).getSubReg() == AArch64::sub_32) && "Expected copy instruction"); Reg = MI->getOperand(1).getReg(); - MI->eraseFromParent(); + MachineBasicBlock::iterator I(MI); + removeDeadCode(I, std::next(I)); } updateValueMap(I, Reg); return true; diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 9c85001481d18598b86c60e28d5a883b38d871e4..800dd449b49f2d5fb84077e6c93549eb19290584 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -204,6 +204,11 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + // Win64 EH requires a frame pointer if funclets are present, as the locals + // are accessed off the frame pointer in both the parent function and the + // funclets. + if (MF.hasEHFunclets()) + return true; // Retain behavior of always omitting the FP for leaf functions when possible. if (MFI.hasCalls() && MF.getTarget().Options.DisableFramePointerElim(MF)) return true; @@ -585,10 +590,12 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc, bool NeedsWinCFI, bool InProlog = true) { // Ignore instructions that do not operate on SP, i.e. shadow call stack - // instructions. + // instructions and associated CFI instruction. while (MBBI->getOpcode() == AArch64::STRXpost || - MBBI->getOpcode() == AArch64::LDRXpre) { - assert(MBBI->getOperand(0).getReg() != AArch64::SP); + MBBI->getOpcode() == AArch64::LDRXpre || + MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) { + if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION) + assert(MBBI->getOperand(0).getReg() != AArch64::SP); ++MBBI; } unsigned NewOpc; @@ -685,9 +692,11 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, unsigned Opc = MI.getOpcode(); // Ignore instructions that do not operate on SP, i.e. shadow call stack - // instructions. - if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre) { - assert(MI.getOperand(0).getReg() != AArch64::SP); + // instructions and associated CFI instruction. + if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre || + Opc == AArch64::CFI_INSTRUCTION) { + if (Opc != AArch64::CFI_INSTRUCTION) + assert(MI.getOperand(0).getReg() != AArch64::SP); return; } @@ -774,6 +783,12 @@ static bool ShouldSignWithAKey(MachineFunction &MF) { return Key.equals_lower("a_key"); } +static bool needsWinCFI(const MachineFunction &MF) { + const Function &F = MF.getFunction(); + return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && + F.needsUnwindTableEntry(); +} + void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -787,9 +802,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, bool needsFrameMoves = (MMI.hasDebugInfo() || F.needsUnwindTableEntry()) && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool HasFP = hasFP(MF); - bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && - F.needsUnwindTableEntry(); + bool NeedsWinCFI = needsWinCFI(MF); MF.setHasWinCFI(NeedsWinCFI); + bool IsFunclet = MBB.isEHFuncletEntry(); + // At this point, we're going to decide whether or not the function uses a // redzone. In most cases, the function doesn't have a redzone so let's // assume that's false and set it to true in the case that there's a redzone. @@ -804,6 +820,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MBB, MBBI, DL, TII->get(ShouldSignWithAKey(MF) ? AArch64::PACIASP : AArch64::PACIBSP)) .setMIFlag(MachineInstr::FrameSetup); + + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } // All calls are tail calls in GHC calling conv, and functions have no @@ -811,7 +833,14 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; - int NumBytes = (int)MFI.getStackSize(); + // getStackSize() includes all the locals in its size calculation. We don't + // include these locals when computing the stack size of a funclet, as they + // are allocated in the parent's stack frame and accessed via the frame + // pointer from the funclet. We only save the callee saved registers in the + // funclet, which are really the callee saved registers of the parent + // function, including the funclet. + int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF) + : (int)MFI.getStackSize(); if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); // All of the stack allocation is for locals. @@ -847,7 +876,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + // Var args are accounted for in the containing function, so don't + // include them for funclets. + unsigned FixedObject = (IsWin64 && !IsFunclet) ? + alignTo(AFI->getVarArgsGPRSize(), 16) : 0; auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; // All of the remaining stack allocations are for locals. @@ -875,6 +907,16 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, ++MBBI; } + // The code below is not applicable to funclets. We have emitted all the SEH + // opcodes that we needed to emit. The FP and BP belong to the containing + // function. + if (IsFunclet) { + if (NeedsWinCFI) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) + .setMIFlag(MachineInstr::FrameSetup); + return; + } + if (HasFP) { // Only set up FP if we actually need to. Frame pointer is fp = // sp - fixedobject - 16. @@ -1165,6 +1207,16 @@ static void InsertReturnAddressAuth(MachineFunction &MF, } } +static bool isFuncletReturnInstr(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::CATCHRET: + case AArch64::CLEANUPRET: + return true; + } +} + void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); @@ -1173,8 +1225,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool IsTailCallReturn = false; - bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && - MF.getFunction().needsUnwindTableEntry(); + bool NeedsWinCFI = needsWinCFI(MF); + bool IsFunclet = false; if (MBB.end() != MBBI) { DL = MBBI->getDebugLoc(); @@ -1182,9 +1234,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi || RetOpcode == AArch64::TCRETURNri || RetOpcode == AArch64::TCRETURNriBTI; + IsFunclet = isFuncletReturnInstr(*MBBI); } - int NumBytes = MFI.getStackSize(); + int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF) + : MFI.getStackSize(); AArch64FunctionInfo *AFI = MF.getInfo(); // All calls are tail calls in GHC calling conv, and functions have no @@ -1241,10 +1295,19 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); - unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + // Var args are accounted for in the containing function, so don't + // include them for funclets. + unsigned FixedObject = + (IsWin64 && !IsFunclet) ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; uint64_t AfterCSRPopSize = ArgumentPopSize; auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; + // We cannot rely on the local stack size set in emitPrologue if the function + // has funclets, as funclets have different local stack size requirements, and + // the current value set in emitPrologue may be that of the containing + // function. + if (MF.hasEHFunclets()) + AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); // Assume we can't combine the last pop with the sp restore. @@ -1340,7 +1403,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // FIXME: Rather than doing the math here, we should instead just use // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. - if (MFI.hasVarSizedObjects() || AFI->isStackRealigned()) + if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, -AFI->getCalleeSavedStackSize() + 16, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); @@ -1445,6 +1508,14 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, // being in range for direct access. If the FPOffset is positive, // that'll always be best, as the SP will be even further away. UseFP = true; + } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) { + // Funclets access the locals contained in the parent's stack frame + // via the frame pointer, so we have to use the FP in the parent + // function. + assert( + Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) && + "Funclets should only be present on Win64"); + UseFP = true; } else { // We have the choice between FP and (SP or BP). if (FPOffsetFits && PreferFP) // If FP is the best fit, use it. @@ -1538,8 +1609,7 @@ static void computeCalleeSaveRegisterPairs( if (CSI.empty()) return; - bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && - MF.getFunction().needsUnwindTableEntry(); + bool NeedsWinCFI = needsWinCFI(MF); AArch64FunctionInfo *AFI = MF.getInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); CallingConv::ID CC = MF.getFunction().getCallingConv(); @@ -1652,8 +1722,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && - MF.getFunction().needsUnwindTableEntry(); + bool NeedsWinCFI = needsWinCFI(MF); DebugLoc DL; SmallVector RegPairs; @@ -1675,6 +1744,23 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); + if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) { + // Emit a CFI instruction that causes 8 to be subtracted from the value of + // x18 when unwinding past this frame. + static const char CFIInst[] = { + dwarf::DW_CFA_val_expression, + 18, // register + 2, // length + static_cast(unsigned(dwarf::DW_OP_breg18)), + static_cast(-8) & 0x7f, // addend (sleb128) + }; + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createEscape(nullptr, CFIInst)); + BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + } + // This instruction also makes x18 live-in to the entry block. MBB.addLiveIn(AArch64::X18); } @@ -1765,8 +1851,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); DebugLoc DL; SmallVector RegPairs; - bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && - MF.getFunction().needsUnwindTableEntry(); + bool NeedsWinCFI = needsWinCFI(MF); if (MI != MBB.end()) DL = MI->getDebugLoc(); @@ -1998,3 +2083,69 @@ bool AArch64FrameLowering::enableStackSlotScavenging( const AArch64FunctionInfo *AFI = MF.getInfo(); return AFI->hasCalleeSaveStackFreeSpace(); } + +void AArch64FrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const { + // If this function isn't doing Win64-style C++ EH, we don't need to do + // anything. + if (!MF.hasEHFunclets()) + return; + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); + + MachineBasicBlock &MBB = MF.front(); + auto MBBI = MBB.begin(); + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + ++MBBI; + + if (MBBI->isTerminator()) + return; + + // Create an UnwindHelp object. + int UnwindHelpFI = + MFI.CreateStackObject(/*size*/8, /*alignment*/16, false); + EHInfo.UnwindHelpFrameIdx = UnwindHelpFI; + // We need to store -2 into the UnwindHelp object at the start of the + // function. + DebugLoc DL; + RS->enterBasicBlock(MBB); + unsigned DstReg = RS->scavengeRegister(&AArch64::GPR64RegClass, MBBI, 0); + BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2); + BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi)) + .addReg(DstReg, getKillRegState(true)) + .addFrameIndex(UnwindHelpFI) + .addImm(0); +} + +/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before +/// the update. This is easily retrieved as it is exactly the offset that is set +/// in processFunctionBeforeFrameFinalized. +int AArch64FrameLowering::getFrameIndexReferencePreferSP( + const MachineFunction &MF, int FI, unsigned &FrameReg, + bool IgnoreSPUpdates) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is " + << MFI.getObjectOffset(FI) << "\n"); + FrameReg = AArch64::SP; + return MFI.getObjectOffset(FI); +} + +/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve +/// the parent's frame pointer +unsigned AArch64FrameLowering::getWinEHParentFrameOffset( + const MachineFunction &MF) const { + return 0; +} + +/// Funclets only need to account for space for the callee saved registers, +/// as the locals are accounted for in the parent's stack frame. +unsigned AArch64FrameLowering::getWinEHFuncletFrameSize( + const MachineFunction &MF) const { + // This is the size of the pushed CSRs. + unsigned CSSize = + MF.getInfo()->getCalleeSavedStackSize(); + // This is the amount of stack a funclet needs to allocate. + return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(), + getStackAlignment()); +} diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 104e52b5f1f367ce8cda652aa81ef6f4f1349340..0d0385acf46e6155e27acb7793a007bfdc46994f 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -69,6 +69,17 @@ public: bool enableStackSlotScavenging(const MachineFunction &MF) const override; + void processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS) const override; + + unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; + + unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; + + int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, + unsigned &FrameReg, + bool IgnoreSPUpdates) const override; + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, unsigned StackBumpBytes) const; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index d5d6d5ca23e1de93a117e76ecc773272983b2f80..cc10c9688e14f38ee3d99790334c245b2be4850a 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -714,8 +714,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { - setOperationAction(ISD::MULHS, VT, Custom); - setOperationAction(ISD::MULHU, VT, Custom); + setOperationAction(ISD::MULHS, VT, Legal); + setOperationAction(ISD::MULHU, VT, Legal); } else { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); @@ -1273,6 +1273,20 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, return EndBB; } +MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( + MachineInstr &MI, MachineBasicBlock *BB) const { + assert(!isAsynchronousEHPersonality(classifyEHPersonality( + BB->getParent()->getFunction().getPersonalityFn())) && + "SEH does not use catchret!"); + return BB; +} + +MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad( + MachineInstr &MI, MachineBasicBlock *BB) const { + MI.eraseFromParent(); + return BB; +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { @@ -1288,6 +1302,11 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); + + case AArch64::CATCHRET: + return EmitLoweredCatchRet(MI, BB); + case AArch64::CATCHPAD: + return EmitLoweredCatchPad(MI, BB); } } @@ -1501,6 +1520,11 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ? Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); + } else if (isCMN(LHS, CC)) { + // As we are looking for EQ/NE compares, the operands can be commuted ; can + // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ? + Opcode = AArch64ISD::ADDS; + LHS = LHS.getOperand(1); } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { // Similarly, (CMP (and X, Y), 0) can be implemented with a TST @@ -1527,33 +1551,44 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// ccmp B, inv(CB), CA /// check for CB flags /// -/// In general we can create code for arbitrary "... (and (and A B) C)" -/// sequences. We can also implement some "or" expressions, because "(or A B)" -/// is equivalent to "not (and (not A) (not B))" and we can implement some -/// negation operations: -/// We can negate the results of a single comparison by inverting the flags -/// used when the predicate fails and inverting the flags tested in the next -/// instruction; We can also negate the results of the whole previous -/// conditional compare sequence by inverting the flags tested in the next -/// instruction. However there is no way to negate the result of a partial -/// sequence. +/// This naturally lets us implement chains of AND operations with SETCC +/// operands. And we can even implement some other situations by transforming +/// them: +/// - We can implement (NEG SETCC) i.e. negating a single comparison by +/// negating the flags used in a CCMP/FCCMP operations. +/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations +/// by negating the flags we test for afterwards. i.e. +/// NEG (CMP CCMP CCCMP ...) can be implemented. +/// - Note that we can only ever negate all previously processed results. +/// What we can not implement by flipping the flags to test is a negation +/// of two sub-trees (because the negation affects all sub-trees emitted so +/// far, so the 2nd sub-tree we emit would also affect the first). +/// With those tools we can implement some OR operations: +/// - (OR (SETCC A) (SETCC B)) can be implemented via: +/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B))) +/// - After transforming OR to NEG/AND combinations we may be able to use NEG +/// elimination rules from earlier to implement the whole thing as a +/// CCMP/FCCMP chain. /// -/// Therefore on encountering an "or" expression we can negate the subtree on -/// one side and have to be able to push the negate to the leafs of the subtree -/// on the other side (see also the comments in code). As complete example: -/// "or (or (setCA (cmp A)) (setCB (cmp B))) -/// (and (setCC (cmp C)) (setCD (cmp D)))" -/// is transformed to -/// "not (and (not (and (setCC (cmp C)) (setCC (cmp D)))) -/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" -/// and implemented as: +/// As complete example: +/// or (or (setCA (cmp A)) (setCB (cmp B))) +/// (and (setCC (cmp C)) (setCD (cmp D)))" +/// can be reassociated to: +/// or (and (setCC (cmp C)) setCD (cmp D)) +// (or (setCA (cmp A)) (setCB (cmp B))) +/// can be transformed to: +/// not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) +/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" +/// which can be implemented as: /// cmp C /// ccmp D, inv(CD), CC /// ccmp A, CA, inv(CD) /// ccmp B, CB, inv(CA) /// check for CB flags -/// A counterexample is "or (and A B) (and C D)" which cannot be implemented -/// by conditional compare sequences. +/// +/// A counterexample is "or (and A B) (and C D)" which translates to +/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we +/// can only implement 1 of the inner (not) operations, but not both! /// @{ /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. @@ -1593,9 +1628,20 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be /// expressed as a conjunction. See \ref AArch64CCMP. -/// \param CanNegate Set to true if we can also emit the negation of the -/// tree as a conjunction. +/// \param CanNegate Set to true if we can negate the whole sub-tree just by +/// changing the conditions on the SETCC tests. +/// (this means we can call emitConjunctionRec() with +/// Negate==true on this sub-tree) +/// \param MustBeFirst Set to true if this subtree needs to be negated and we +/// cannot do the negation naturally. We are required to +/// emit the subtree first in this case. +/// \param WillNegate Is true if are called when the result of this +/// subexpression must be negated. This happens when the +/// outer expression is an OR. We can use this fact to know +/// that we have a double negation (or (or ...) ...) that +/// can be implemented for free. static bool canEmitConjunction(const SDValue Val, bool &CanNegate, + bool &MustBeFirst, bool WillNegate, unsigned Depth = 0) { if (!Val.hasOneUse()) return false; @@ -1604,42 +1650,44 @@ static bool canEmitConjunction(const SDValue Val, bool &CanNegate, if (Val->getOperand(0).getValueType() == MVT::f128) return false; CanNegate = true; + MustBeFirst = false; return true; } // Protect against exponential runtime and stack overflow. if (Depth > 6) return false; if (Opcode == ISD::AND || Opcode == ISD::OR) { + bool IsOR = Opcode == ISD::OR; SDValue O0 = Val->getOperand(0); SDValue O1 = Val->getOperand(1); bool CanNegateL; - if (!canEmitConjunction(O0, CanNegateL, Depth+1)) + bool MustBeFirstL; + if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1)) return false; bool CanNegateR; - if (!canEmitConjunction(O1, CanNegateR, Depth+1)) + bool MustBeFirstR; + if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1)) return false; - if (Opcode == ISD::OR) { - // For an OR expression we need to be able to negate at least one side or - // we cannot do the transformation at all. + if (MustBeFirstL && MustBeFirstR) + return false; + + if (IsOR) { + // For an OR expression we need to be able to naturally negate at least + // one side or we cannot do the transformation at all. if (!CanNegateL && !CanNegateR) return false; - // However if we can negate x and y, then we can change - // (not (or x y)) - // into - // (and (not x) (not y)) - // to eliminate the outer negation. - CanNegate = CanNegateL && CanNegateR; + // If we the result of the OR will be negated and we can naturally negate + // the leafs, then this sub-tree as a whole negates naturally. + CanNegate = WillNegate && CanNegateL && CanNegateR; + // If we cannot naturally negate the whole sub-tree, then this must be + // emitted first. + MustBeFirst = !CanNegate; } else { - // If the operands are OR expressions then we finally need to negate their - // outputs, we can only do that for the operand with emitted last by - // negating OutCC, not for both operands. - bool NeedsNegOutL = O0->getOpcode() == ISD::OR; - bool NeedsNegOutR = O1->getOpcode() == ISD::OR; - if (NeedsNegOutL && NeedsNegOutR) - return false; - // We cannot negate an AND operation. + assert(Opcode == ISD::AND && "Must be OR or AND"); + // We cannot naturally negate an AND operation. CanNegate = false; + MustBeFirst = MustBeFirstL || MustBeFirstR; } return true; } @@ -1652,10 +1700,8 @@ static bool canEmitConjunction(const SDValue Val, bool &CanNegate, /// and conditional compare operations. @returns an NZCV flags producing node /// and sets @p OutCC to the flags that should be tested or returns SDValue() if /// transformation was not possible. -/// On recursive invocations @p PushNegate may be set to true to have negation -/// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate -/// for the comparisons in the current subtree; @p Depth limits the search -/// depth to avoid stack overflow. +/// \p Negate is true if we want this sub-tree being negated just by changing +/// SETCC conditions. static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate) { @@ -1697,61 +1743,69 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, DAG); } - assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) && - "Valid conjunction/disjunction tree"); - - // Check if both sides can be transformed. - SDValue LHS = Val->getOperand(0); - SDValue RHS = Val->getOperand(1); + assert(Val->hasOneUse() && "Valid conjunction/disjunction tree"); - // In case of an OR we need to negate our operands and the result. - // (A v B) <=> not(not(A) ^ not(B)) - bool NegateOpsAndResult = Opcode == ISD::OR; - // We can negate the results of all previous operations by inverting the - // predicate flags giving us a free negation for one side. The other side - // must be negatable by itself. - if (NegateOpsAndResult) { - // See which side we can negate. - bool CanNegateL; - bool isValidL = canEmitConjunction(LHS, CanNegateL); - assert(isValidL && "Valid conjunction/disjunction tree"); - (void)isValidL; + bool IsOR = Opcode == ISD::OR; -#ifndef NDEBUG - bool CanNegateR; - bool isValidR = canEmitConjunction(RHS, CanNegateR); - assert(isValidR && "Valid conjunction/disjunction tree"); - assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree"); -#endif + SDValue LHS = Val->getOperand(0); + bool CanNegateL; + bool MustBeFirstL; + bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR); + assert(ValidL && "Valid conjunction/disjunction tree"); + (void)ValidL; - // Order the side which we cannot negate to RHS so we can emit it first. - if (!CanNegateL) + SDValue RHS = Val->getOperand(1); + bool CanNegateR; + bool MustBeFirstR; + bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR); + assert(ValidR && "Valid conjunction/disjunction tree"); + (void)ValidR; + + // Swap sub-tree that must come first to the right side. + if (MustBeFirstL) { + assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); + std::swap(LHS, RHS); + std::swap(CanNegateL, CanNegateR); + std::swap(MustBeFirstL, MustBeFirstR); + } + + bool NegateR; + bool NegateAfterR; + bool NegateL; + bool NegateAfterAll; + if (Opcode == ISD::OR) { + // Swap the sub-tree that we can negate naturally to the left. + if (!CanNegateL) { + assert(CanNegateR && "at least one side must be negatable"); + assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); + assert(!Negate); std::swap(LHS, RHS); + NegateR = false; + NegateAfterR = true; + } else { + // Negate the left sub-tree if possible, otherwise negate the result. + NegateR = CanNegateR; + NegateAfterR = !CanNegateR; + } + NegateL = true; + NegateAfterAll = !Negate; } else { - bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; - assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) && - "Valid conjunction/disjunction tree"); - // Order the side where we need to negate the output flags to RHS so it - // gets emitted first. - if (NeedsNegOutL) - std::swap(LHS, RHS); + assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree"); + assert(!Negate && "Valid conjunction/disjunction tree"); + + NegateL = false; + NegateR = false; + NegateAfterR = false; + NegateAfterAll = false; } - // Emit RHS. If we want to negate the tree we only need to push a negate - // through if we are already in a PushNegate case, otherwise we can negate - // the "flags to test" afterwards. + // Emit sub-trees. AArch64CC::CondCode RHSCC; - SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, Negate, - CCOp, Predicate); - if (NegateOpsAndResult && !Negate) + SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate); + if (NegateAfterR) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); - // Emit LHS. We may need to negate it. - SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, - NegateOpsAndResult, CmpR, - RHSCC); - // If we transformed an OR to and AND then we have to negate the result - // (or absorb the Negate parameter). - if (NegateOpsAndResult && !Negate) + SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC); + if (NegateAfterAll) OutCC = AArch64CC::getInvertedCondCode(OutCC); return CmpL; } @@ -1763,7 +1817,8 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC) { bool DummyCanNegate; - if (!canEmitConjunction(Val, DummyCanNegate)) + bool DummyMustBeFirst; + if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false)) return SDValue(); return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); @@ -2651,66 +2706,6 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } -// Lower vector multiply high (ISD::MULHS and ISD::MULHU). -static SDValue LowerMULH(SDValue Op, SelectionDAG &DAG) { - // Multiplications are only custom-lowered for 128-bit vectors so that - // {S,U}MULL{2} can be detected. Otherwise v2i64 multiplications are not - // legal. - EVT VT = Op.getValueType(); - assert(VT.is128BitVector() && VT.isInteger() && - "unexpected type for custom-lowering ISD::MULH{U,S}"); - - SDValue V0 = Op.getOperand(0); - SDValue V1 = Op.getOperand(1); - - SDLoc DL(Op); - - EVT ExtractVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); - - // We turn (V0 mulhs/mulhu V1) to: - // - // (uzp2 (smull (extract_subvector (ExtractVT V128:V0, (i64 0)), - // (extract_subvector (ExtractVT V128:V1, (i64 0))))), - // (smull (extract_subvector (ExtractVT V128:V0, (i64 VMull2Idx)), - // (extract_subvector (ExtractVT V128:V2, (i64 VMull2Idx)))))) - // - // Where ExtractVT is a subvector with half number of elements, and - // VMullIdx2 is the index of the middle element (the high part). - // - // The vector hight part extract and multiply will be matched against - // {S,U}MULL{v16i8_v8i16,v8i16_v4i32,v4i32_v2i64} which in turn will - // issue a {s}mull2 instruction. - // - // This basically multiply the lower subvector with '{s,u}mull', the high - // subvector with '{s,u}mull2', and shuffle both results high part in - // resulting vector. - unsigned Mull2VectorIdx = VT.getVectorNumElements () / 2; - SDValue VMullIdx = DAG.getConstant(0, DL, MVT::i64); - SDValue VMull2Idx = DAG.getConstant(Mull2VectorIdx, DL, MVT::i64); - - SDValue VMullV0 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMullIdx); - SDValue VMullV1 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMullIdx); - - SDValue VMull2V0 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMull2Idx); - SDValue VMull2V1 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMull2Idx); - - unsigned MullOpc = Op.getOpcode() == ISD::MULHS ? AArch64ISD::SMULL - : AArch64ISD::UMULL; - - EVT MullVT = ExtractVT.widenIntegerVectorElementType(*DAG.getContext()); - SDValue Mull = DAG.getNode(MullOpc, DL, MullVT, VMullV0, VMullV1); - SDValue Mull2 = DAG.getNode(MullOpc, DL, MullVT, VMull2V0, VMull2V1); - - Mull = DAG.getNode(ISD::BITCAST, DL, VT, Mull); - Mull2 = DAG.getNode(ISD::BITCAST, DL, VT, Mull2); - - return DAG.getNode(AArch64ISD::UZP2, DL, VT, Mull, Mull2); -} - SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); @@ -2913,9 +2908,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); - case ISD::MULHS: - case ISD::MULHU: - return LowerMULH(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::STORE: @@ -4351,6 +4343,13 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(4); SDLoc dl(Op); + MachineFunction &MF = DAG.getMachineFunction(); + // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions + // will not be produced, as they are conditional branch instructions that do + // not set flags. + bool ProduceNonFlagSettingCondBr = + !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); + // Handle f128 first, since lowering it will result in comparing the return // value of a libcall against zero, which is just what the rest of LowerBR_CC // is expecting to deal with. @@ -4393,7 +4392,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // If the RHS of the comparison is zero, we can potentially fold this // to a specialized branch. const ConstantSDNode *RHSC = dyn_cast(RHS); - if (RHSC && RHSC->getZExtValue() == 0) { + if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) { if (CC == ISD::SETEQ) { // See if we can use a TBZ to fold in an AND as well. // TBZ has a smaller branch displacement than CBZ. If the offset is @@ -4436,7 +4435,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { } } if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && - LHS.getOpcode() != ISD::AND) { + LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. @@ -7241,7 +7240,10 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, return DAG.getUNDEF(VT); } - if (isOnlyLowElement) { + // Convert BUILD_VECTOR where all elements but the lowest are undef into + // SCALAR_TO_VECTOR, except for when we have a single-element constant vector + // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR. + if (isOnlyLowElement && !(NumElts == 1 && isa(Value))) { LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " "SCALAR_TO_VECTOR node\n"); return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); @@ -7823,7 +7825,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); if (ShouldInvert) - return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); + Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); return Cmp; } @@ -8084,6 +8086,10 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { + // TODO: This may be worth removing. Check regression tests for diffs. + if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT)) + return false; + // If we're reducing the load width in order to avoid having to use an extra // instruction to do extension then it's probably a good idea. if (ExtTy != ISD::NON_EXTLOAD) @@ -10808,6 +10814,13 @@ SDValue performCONDCombine(SDNode *N, static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions + // will not be produced, as they are conditional branch instructions that do + // not set flags. + if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) + return SDValue(); + if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) N = NV.getNode(); SDValue Chain = N->getOperand(0); @@ -11668,6 +11681,39 @@ Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { return TargetLowering::getIRStackGuard(IRB); } +void AArch64TargetLowering::insertSSPDeclarations(Module &M) const { + // MSVC CRT provides functionalities for stack protection. + if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) { + // MSVC CRT has a global variable holding security cookie. + M.getOrInsertGlobal("__security_cookie", + Type::getInt8PtrTy(M.getContext())); + + // MSVC CRT has a function to validate security cookie. + auto *SecurityCheckCookie = cast( + M.getOrInsertFunction("__security_check_cookie", + Type::getVoidTy(M.getContext()), + Type::getInt8PtrTy(M.getContext()))); + SecurityCheckCookie->setCallingConv(CallingConv::Win64); + SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); + return; + } + TargetLowering::insertSSPDeclarations(M); +} + +Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const { + // MSVC CRT has a global variable holding security cookie. + if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) + return M.getGlobalVariable("__security_cookie"); + return TargetLowering::getSDagStackGuard(M); +} + +Value *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const { + // MSVC CRT has a function to validate security cookie. + if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) + return M.getFunction("__security_check_cookie"); + return TargetLowering::getSSPStackGuardCheck(M); +} + Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in @@ -11772,3 +11818,8 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { MF.getFrameInfo().computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } + +// Unlike X86, we let frame lowering assign offsets to all catch objects. +bool AArch64TargetLowering::needsFixedCatchObjects() const { + return false; +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 7ee3b82a4ac12a7c3f8133122917028b844672b5..ffc4cc3ef53474064faf487c305a37f442005574 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -302,6 +302,12 @@ public: MachineBasicBlock *EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -401,6 +407,10 @@ public: /// returns the address of that location. Otherwise, returns nullptr. Value *getIRStackGuard(IRBuilder<> &IRB) const override; + void insertSSPDeclarations(Module &M) const override; + Value *getSDagStackGuard(const Module &M) const override; + Value *getSSPStackGuardCheck(const Module &M) const override; + /// If the target has a standard location for the unsafe stack pointer, /// returns the address of that location. Otherwise, returns nullptr. Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; @@ -517,6 +527,8 @@ public: bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; + /// Used for exception handling on Win64. + bool needsFixedCatchObjects() const override; private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index ab90ea3f74ad63032a3fe95ed3f7597241c04b2b..9061ed4f9f54a6f275685ba66656cdd6072fe629 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -9989,9 +9989,10 @@ class BaseSIMDThreeSameVectorComplex size, bits<3> opcode, let Inst{4-0} = Rd; } +//8.3 CompNum - Floating-point complex number support multiclass SIMDThreeSameVectorComplexHSD opcode, Operand rottype, string asm, SDPatternOperator OpNode>{ - let Predicates = [HasV8_3a, HasNEON, HasFullFP16] in { + let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDThreeSameVectorComplex<0, U, 0b01, opcode, V64, rottype, asm, ".4h", [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd), @@ -10007,7 +10008,7 @@ multiclass SIMDThreeSameVectorComplexHSD opcode, Operand rottype, (rottype i32:$rot)))]>; } - let Predicates = [HasV8_3a, HasNEON] in { + let Predicates = [HasComplxNum, HasNEON] in { def v2f32 : BaseSIMDThreeSameVectorComplex<0, U, 0b10, opcode, V64, rottype, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), @@ -10063,7 +10064,7 @@ class BaseSIMDThreeSameVectorTiedComplex size, multiclass SIMDThreeSameVectorTiedComplexHSD opcode, Operand rottype, string asm, SDPatternOperator OpNode> { - let Predicates = [HasV8_3a, HasNEON, HasFullFP16] in { + let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b01, opcode, V64, rottype, asm, ".4h", [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd), @@ -10079,7 +10080,7 @@ multiclass SIMDThreeSameVectorTiedComplexHSD opcode, (rottype i32:$rot)))]>; } - let Predicates = [HasV8_3a, HasNEON] in { + let Predicates = [HasComplxNum, HasNEON] in { def v2f32 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b10, opcode, V64, rottype, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), @@ -10145,7 +10146,7 @@ class BaseSIMDIndexedTiedComplex size, // classes. multiclass SIMDIndexedTiedComplexHSD { - let Predicates = [HasV8_3a,HasNEON,HasFullFP16] in { + let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { def v4f16_indexed : BaseSIMDIndexedTiedComplex<0, 1, 0, 0b01, opc1, opc2, V64, V64, V128, VectorIndexD, rottype, asm, ".4h", ".4h", ".4h", ".h", []> { @@ -10161,9 +10162,9 @@ multiclass SIMDIndexedTiedComplexHSD { @@ -10171,7 +10172,7 @@ multiclass SIMDIndexedTiedComplexHSD cl::desc("Restrict range of Bcc instructions (DEBUG)")); AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) - : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), + : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, + AArch64::CATCHRET), RI(STI.getTargetTriple()), Subtarget(STI) {} /// GetInstSize - Return the number of bytes of code the specified @@ -704,10 +705,10 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { // Secondly, check cases specific to sub-targets. if (Subtarget.hasExynosCheapAsMoveHandling()) { - if (isExynosResetFast(MI) || isExynosShiftExtFast(MI)) + if (isExynosCheapAsMove(MI)) return true; - else - return MI.isAsCheapAsAMove(); + + return MI.isAsCheapAsAMove(); } // Finally, check generic cases. @@ -758,211 +759,6 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { llvm_unreachable("Unknown opcode to check as cheap as a move!"); } -bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) { - unsigned Reg, Imm, Shift; - - switch (MI.getOpcode()) { - default: - return false; - - // MOV Rd, SP - case AArch64::ADDWri: - case AArch64::ADDXri: - if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm()) - return false; - - Reg = MI.getOperand(1).getReg(); - Imm = MI.getOperand(2).getImm(); - return ((Reg == AArch64::WSP || Reg == AArch64::SP) && Imm == 0); - - // Literal - case AArch64::ADR: - case AArch64::ADRP: - return true; - - // MOVI Vd, #0 - case AArch64::MOVID: - case AArch64::MOVIv8b_ns: - case AArch64::MOVIv2d_ns: - case AArch64::MOVIv16b_ns: - Imm = MI.getOperand(1).getImm(); - return (Imm == 0); - - // MOVI Vd, #0 - case AArch64::MOVIv2i32: - case AArch64::MOVIv4i16: - case AArch64::MOVIv4i32: - case AArch64::MOVIv8i16: - Imm = MI.getOperand(1).getImm(); - Shift = MI.getOperand(2).getImm(); - return (Imm == 0 && Shift == 0); - - // MOV Rd, Imm - case AArch64::MOVNWi: - case AArch64::MOVNXi: - - // MOV Rd, Imm - case AArch64::MOVZWi: - case AArch64::MOVZXi: - return true; - - // MOV Rd, Imm - case AArch64::ORRWri: - case AArch64::ORRXri: - if (!MI.getOperand(1).isReg()) - return false; - - Reg = MI.getOperand(1).getReg(); - Imm = MI.getOperand(2).getImm(); - return ((Reg == AArch64::WZR || Reg == AArch64::XZR) && Imm == 0); - - // MOV Rd, Rm - case AArch64::ORRWrs: - case AArch64::ORRXrs: - if (!MI.getOperand(1).isReg()) - return false; - - Reg = MI.getOperand(1).getReg(); - Imm = MI.getOperand(3).getImm(); - Shift = AArch64_AM::getShiftValue(Imm); - return ((Reg == AArch64::WZR || Reg == AArch64::XZR) && Shift == 0); - } -} - -bool AArch64InstrInfo::isExynosLdStExtFast(const MachineInstr &MI) { - unsigned Imm; - AArch64_AM::ShiftExtendType Ext; - - switch (MI.getOpcode()) { - default: - return false; - - // WriteLD - case AArch64::PRFMroW: - case AArch64::PRFMroX: - - // WriteLDIdx - case AArch64::LDRBBroW: - case AArch64::LDRBBroX: - case AArch64::LDRHHroW: - case AArch64::LDRHHroX: - case AArch64::LDRSBWroW: - case AArch64::LDRSBWroX: - case AArch64::LDRSBXroW: - case AArch64::LDRSBXroX: - case AArch64::LDRSHWroW: - case AArch64::LDRSHWroX: - case AArch64::LDRSHXroW: - case AArch64::LDRSHXroX: - case AArch64::LDRSWroW: - case AArch64::LDRSWroX: - case AArch64::LDRWroW: - case AArch64::LDRWroX: - case AArch64::LDRXroW: - case AArch64::LDRXroX: - - case AArch64::LDRBroW: - case AArch64::LDRBroX: - case AArch64::LDRDroW: - case AArch64::LDRDroX: - case AArch64::LDRHroW: - case AArch64::LDRHroX: - case AArch64::LDRSroW: - case AArch64::LDRSroX: - - // WriteSTIdx - case AArch64::STRBBroW: - case AArch64::STRBBroX: - case AArch64::STRHHroW: - case AArch64::STRHHroX: - case AArch64::STRWroW: - case AArch64::STRWroX: - case AArch64::STRXroW: - case AArch64::STRXroX: - - case AArch64::STRBroW: - case AArch64::STRBroX: - case AArch64::STRDroW: - case AArch64::STRDroX: - case AArch64::STRHroW: - case AArch64::STRHroX: - case AArch64::STRSroW: - case AArch64::STRSroX: - Imm = MI.getOperand(3).getImm(); - Ext = AArch64_AM::getMemExtendType(Imm); - return (Ext == AArch64_AM::SXTX || Ext == AArch64_AM::UXTX); - } -} - -bool AArch64InstrInfo::isExynosShiftExtFast(const MachineInstr &MI) { - unsigned Imm, Shift; - AArch64_AM::ShiftExtendType Ext; - - switch (MI.getOpcode()) { - default: - return false; - - // WriteI - case AArch64::ADDSWri: - case AArch64::ADDSXri: - case AArch64::ADDWri: - case AArch64::ADDXri: - case AArch64::SUBSWri: - case AArch64::SUBSXri: - case AArch64::SUBWri: - case AArch64::SUBXri: - return true; - - // WriteISReg - case AArch64::ADDSWrs: - case AArch64::ADDSXrs: - case AArch64::ADDWrs: - case AArch64::ADDXrs: - case AArch64::ANDSWrs: - case AArch64::ANDSXrs: - case AArch64::ANDWrs: - case AArch64::ANDXrs: - case AArch64::BICSWrs: - case AArch64::BICSXrs: - case AArch64::BICWrs: - case AArch64::BICXrs: - case AArch64::EONWrs: - case AArch64::EONXrs: - case AArch64::EORWrs: - case AArch64::EORXrs: - case AArch64::ORNWrs: - case AArch64::ORNXrs: - case AArch64::ORRWrs: - case AArch64::ORRXrs: - case AArch64::SUBSWrs: - case AArch64::SUBSXrs: - case AArch64::SUBWrs: - case AArch64::SUBXrs: - Imm = MI.getOperand(3).getImm(); - Shift = AArch64_AM::getShiftValue(Imm); - Ext = AArch64_AM::getShiftType(Imm); - return (Shift == 0 || (Shift <= 3 && Ext == AArch64_AM::LSL)); - - // WriteIEReg - case AArch64::ADDSWrx: - case AArch64::ADDSXrx: - case AArch64::ADDSXrx64: - case AArch64::ADDWrx: - case AArch64::ADDXrx: - case AArch64::ADDXrx64: - case AArch64::SUBSWrx: - case AArch64::SUBSXrx: - case AArch64::SUBSXrx64: - case AArch64::SUBWrx: - case AArch64::SUBXrx: - case AArch64::SUBXrx64: - Imm = MI.getOperand(3).getImm(); - Shift = AArch64_AM::getArithShiftValue(Imm); - Ext = AArch64_AM::getArithExtendType(Imm); - return (Shift == 0 || (Shift <= 3 && Ext == AArch64_AM::UXTX)); - } -} - bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { switch (MI.getOpcode()) { default: @@ -1134,7 +930,7 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const { const TargetRegisterInfo *TRI = &getRegisterInfo(); - unsigned BaseRegA = 0, BaseRegB = 0; + MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; int64_t OffsetA = 0, OffsetB = 0; unsigned WidthA = 0, WidthB = 0; @@ -1145,14 +941,14 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; - // Retrieve the base register, offset from the base register and width. Width + // Retrieve the base, offset from the base and width. Width // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If - // base registers are identical, and the offset of a lower memory access + + // base are identical, and the offset of a lower memory access + // the width doesn't overlap the offset of a higher memory access, // then the memory accesses are different. - if (getMemOpBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) && - getMemOpBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) { - if (BaseRegA == BaseRegB) { + if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) && + getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) { + if (BaseOpA->isIdenticalTo(*BaseOpB)) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; @@ -1168,6 +964,13 @@ bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, const MachineFunction &MF) const { if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) return true; + switch (MI.getOpcode()) { + case AArch64::DSB: + case AArch64::ISB: + // DSB and ISB also are scheduling barriers. + return true; + default:; + } return isSEHInstruction(MI); } @@ -1657,11 +1460,36 @@ bool AArch64InstrInfo::substituteCmpToZero( } bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD) + if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && + MI.getOpcode() != AArch64::CATCHRET) return false; MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); + + if (MI.getOpcode() == AArch64::CATCHRET) { + // Skip to the first instruction before the epilog. + const TargetInstrInfo *TII = + MBB.getParent()->getSubtarget().getInstrInfo(); + MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); + auto MBBI = MachineBasicBlock::iterator(MI); + MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI); + while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) && + FirstEpilogSEH != MBB.begin()) + FirstEpilogSEH = std::prev(FirstEpilogSEH); + if (FirstEpilogSEH != MBB.begin()) + FirstEpilogSEH = std::next(FirstEpilogSEH); + BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP)) + .addReg(AArch64::X0, RegState::Define) + .addMBB(TargetMBB); + BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri)) + .addReg(AArch64::X0, RegState::Define) + .addReg(AArch64::X0) + .addMBB(TargetMBB) + .addImm(0); + return true; + } + unsigned Reg = MI.getOperand(0).getReg(); const GlobalValue *GV = cast((*MI.memoperands_begin())->getValue()); @@ -1714,71 +1542,6 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } -/// Return true if this is this instruction has a non-zero immediate -bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - break; - case AArch64::ADDSWrs: - case AArch64::ADDSXrs: - case AArch64::ADDWrs: - case AArch64::ADDXrs: - case AArch64::ANDSWrs: - case AArch64::ANDSXrs: - case AArch64::ANDWrs: - case AArch64::ANDXrs: - case AArch64::BICSWrs: - case AArch64::BICSXrs: - case AArch64::BICWrs: - case AArch64::BICXrs: - case AArch64::EONWrs: - case AArch64::EONXrs: - case AArch64::EORWrs: - case AArch64::EORXrs: - case AArch64::ORNWrs: - case AArch64::ORNXrs: - case AArch64::ORRWrs: - case AArch64::ORRXrs: - case AArch64::SUBSWrs: - case AArch64::SUBSXrs: - case AArch64::SUBWrs: - case AArch64::SUBXrs: - if (MI.getOperand(3).isImm()) { - unsigned val = MI.getOperand(3).getImm(); - return (val != 0); - } - break; - } - return false; -} - -/// Return true if this is this instruction has a non-zero immediate -bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - break; - case AArch64::ADDSWrx: - case AArch64::ADDSXrx: - case AArch64::ADDSXrx64: - case AArch64::ADDWrx: - case AArch64::ADDXrx: - case AArch64::ADDXrx64: - case AArch64::SUBSWrx: - case AArch64::SUBSXrx: - case AArch64::SUBSXrx64: - case AArch64::SUBWrx: - case AArch64::SUBXrx: - case AArch64::SUBXrx64: - if (MI.getOperand(3).isImm()) { - unsigned val = MI.getOperand(3).getImm(); - return (val != 0); - } - break; - } - - return false; -} - // Return true if this instruction simply sets its single destination register // to zero. This is equivalent to a register rename of the zero-register. bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) { @@ -1901,67 +1664,6 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, return 0; } -/// Return true if this is load/store scales or extends its register offset. -/// This refers to scaling a dynamic index as opposed to scaled immediates. -/// MI should be a memory op that allows scaled addressing. -bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - break; - case AArch64::LDRBBroW: - case AArch64::LDRBroW: - case AArch64::LDRDroW: - case AArch64::LDRHHroW: - case AArch64::LDRHroW: - case AArch64::LDRQroW: - case AArch64::LDRSBWroW: - case AArch64::LDRSBXroW: - case AArch64::LDRSHWroW: - case AArch64::LDRSHXroW: - case AArch64::LDRSWroW: - case AArch64::LDRSroW: - case AArch64::LDRWroW: - case AArch64::LDRXroW: - case AArch64::STRBBroW: - case AArch64::STRBroW: - case AArch64::STRDroW: - case AArch64::STRHHroW: - case AArch64::STRHroW: - case AArch64::STRQroW: - case AArch64::STRSroW: - case AArch64::STRWroW: - case AArch64::STRXroW: - case AArch64::LDRBBroX: - case AArch64::LDRBroX: - case AArch64::LDRDroX: - case AArch64::LDRHHroX: - case AArch64::LDRHroX: - case AArch64::LDRQroX: - case AArch64::LDRSBWroX: - case AArch64::LDRSBXroX: - case AArch64::LDRSHWroX: - case AArch64::LDRSHXroX: - case AArch64::LDRSWroX: - case AArch64::LDRSroX: - case AArch64::LDRWroX: - case AArch64::LDRXroX: - case AArch64::STRBBroX: - case AArch64::STRBroX: - case AArch64::STRDroX: - case AArch64::STRHHroX: - case AArch64::STRHroX: - case AArch64::STRQroX: - case AArch64::STRSroX: - case AArch64::STRWroX: - case AArch64::STRXroX: - - unsigned Val = MI.getOperand(3).getImm(); - AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val); - return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val); - } - return false; -} - /// Check all MachineMemOperands for a hint to suppress pairing. bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { @@ -2135,17 +1837,21 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const { if (MI.hasOrderedMemoryRef()) return false; - // Make sure this is a reg+imm (as opposed to an address reloc). - assert(MI.getOperand(1).isReg() && "Expected a reg operand."); + // Make sure this is a reg/fi+imm (as opposed to an address reloc). + assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) && + "Expected a reg or frame index operand."); if (!MI.getOperand(2).isImm()) return false; // Can't merge/pair if the instruction modifies the base register. // e.g., ldr x0, [x0] - unsigned BaseReg = MI.getOperand(1).getReg(); - const TargetRegisterInfo *TRI = &getRegisterInfo(); - if (MI.modifiesRegister(BaseReg, TRI)) - return false; + // This case will never occur with an FI base. + if (MI.getOperand(1).isReg()) { + unsigned BaseReg = MI.getOperand(1).getReg(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + if (MI.modifiesRegister(BaseReg, TRI)) + return false; + } // Check if this load/store has a hint to avoid pair formation. // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. @@ -2168,25 +1874,28 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const { return true; } -bool AArch64InstrInfo::getMemOpBaseRegImmOfs( - MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, - const TargetRegisterInfo *TRI) const { +bool AArch64InstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, + MachineOperand *&BaseOp, + int64_t &Offset, + const TargetRegisterInfo *TRI) const { unsigned Width; - return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI); + return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI); } -bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( - MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width, - const TargetRegisterInfo *TRI) const { +bool AArch64InstrInfo::getMemOperandWithOffsetWidth( + MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset, + unsigned &Width, const TargetRegisterInfo *TRI) const { assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); // Handle only loads/stores with base register followed by immediate offset. if (LdSt.getNumExplicitOperands() == 3) { // Non-paired instruction (e.g., ldr x1, [x0, #8]). - if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm()) + if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || + !LdSt.getOperand(2).isImm()) return false; } else if (LdSt.getNumExplicitOperands() == 4) { // Paired instruction (e.g., ldp x1, x2, [x0, #8]). - if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() || + if (!LdSt.getOperand(1).isReg() || + (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || !LdSt.getOperand(3).isImm()) return false; } else @@ -2205,13 +1914,18 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( // multiplied by the scaling factor. Unscaled instructions have scaling factor // set to 1. if (LdSt.getNumExplicitOperands() == 3) { - BaseReg = LdSt.getOperand(1).getReg(); + BaseOp = &LdSt.getOperand(1); Offset = LdSt.getOperand(2).getImm() * Scale; } else { assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); - BaseReg = LdSt.getOperand(2).getReg(); + BaseOp = &LdSt.getOperand(2); Offset = LdSt.getOperand(3).getImm() * Scale; } + + assert((BaseOp->isReg() || BaseOp->isFI()) && + "getMemOperandWithOffset only supports base " + "operands of type register or frame index."); + return true; } @@ -2366,31 +2080,33 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, return true; } -// Scale the unscaled offsets. Returns false if the unscaled offset can't be -// scaled. -static bool scaleOffset(unsigned Opc, int64_t &Offset) { - unsigned OffsetStride = 1; +static unsigned getOffsetStride(unsigned Opc) { switch (Opc) { default: - return false; + return 0; case AArch64::LDURQi: case AArch64::STURQi: - OffsetStride = 16; - break; + return 16; case AArch64::LDURXi: case AArch64::LDURDi: case AArch64::STURXi: case AArch64::STURDi: - OffsetStride = 8; - break; + return 8; case AArch64::LDURWi: case AArch64::LDURSi: case AArch64::LDURSWi: case AArch64::STURWi: case AArch64::STURSi: - OffsetStride = 4; - break; + return 4; } +} + +// Scale the unscaled offsets. Returns false if the unscaled offset can't be +// scaled. +static bool scaleOffset(unsigned Opc, int64_t &Offset) { + unsigned OffsetStride = getOffsetStride(Opc); + if (OffsetStride == 0) + return false; // If the byte-offset isn't a multiple of the stride, we can't scale this // offset. if (Offset % OffsetStride != 0) @@ -2402,6 +2118,19 @@ static bool scaleOffset(unsigned Opc, int64_t &Offset) { return true; } +// Unscale the scaled offsets. Returns false if the scaled offset can't be +// unscaled. +static bool unscaleOffset(unsigned Opc, int64_t &Offset) { + unsigned OffsetStride = getOffsetStride(Opc); + if (OffsetStride == 0) + return false; + + // Convert the "element" offset used by scaled pair load/store instructions + // into the byte-offset used by unscaled. + Offset *= OffsetStride; + return true; +} + static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { if (FirstOpc == SecondOpc) return true; @@ -2420,15 +2149,46 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { return false; } +static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, + int64_t Offset1, unsigned Opcode1, int FI2, + int64_t Offset2, unsigned Opcode2) { + // Accesses through fixed stack object frame indices may access a different + // fixed stack slot. Check that the object offsets + offsets match. + if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { + int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); + int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); + assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); + // Get the byte-offset from the object offset. + if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2)) + return false; + ObjectOffset1 += Offset1; + ObjectOffset2 += Offset2; + // Get the "element" index in the object. + if (!scaleOffset(Opcode1, ObjectOffset1) || + !scaleOffset(Opcode2, ObjectOffset2)) + return false; + return ObjectOffset1 + 1 == ObjectOffset2; + } + + return FI1 == FI2; +} + /// Detect opportunities for ldp/stp formation. /// -/// Only called for LdSt for which getMemOpBaseRegImmOfs returns true. -bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, - unsigned BaseReg1, - MachineInstr &SecondLdSt, - unsigned BaseReg2, +/// Only called for LdSt for which getMemOperandWithOffset returns true. +bool AArch64InstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1, + MachineOperand &BaseOp2, unsigned NumLoads) const { - if (BaseReg1 != BaseReg2) + MachineInstr &FirstLdSt = *BaseOp1.getParent(); + MachineInstr &SecondLdSt = *BaseOp2.getParent(); + if (BaseOp1.getType() != BaseOp2.getType()) + return false; + + assert((BaseOp1.isReg() || BaseOp1.isFI()) && + "Only base registers and frame indices are supported."); + + // Check for both base regs and base FI. + if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) return false; // Only cluster up to a single pair. @@ -2464,7 +2224,20 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, return false; // The caller should already have ordered First/SecondLdSt by offset. - assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); + // Note: except for non-equal frame index bases + if (BaseOp1.isFI()) { + assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) && + "Caller should have ordered offsets."); + + const MachineFrameInfo &MFI = + FirstLdSt.getParent()->getParent()->getFrameInfo(); + return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, + BaseOp2.getIndex(), Offset2, SecondOpc); + } + + assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && + "Caller should have ordered offsets."); + return Offset1 + 1 == Offset2; } @@ -5075,11 +4848,13 @@ enum MachineOutlinerClass { enum MachineOutlinerMBBFlags { LRUnavailableSomewhere = 0x2, - HasCalls = 0x4 + HasCalls = 0x4, + UnsafeRegsDead = 0x8 }; unsigned AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { + assert(C.LRUWasSet && "LRU wasn't set?"); MachineFunction *MF = C.getMF(); const AArch64RegisterInfo *ARI = static_cast( MF->getSubtarget().getRegisterInfo()); @@ -5102,17 +4877,22 @@ AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( std::vector &RepeatedSequenceLocs) const { - unsigned SequenceSize = std::accumulate( - RepeatedSequenceLocs[0].front(), - std::next(RepeatedSequenceLocs[0].back()), - 0, [this](unsigned Sum, const MachineInstr &MI) { - return Sum + getInstSizeInBytes(MI); - }); - - // Compute liveness information for each candidate. + outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; + unsigned SequenceSize = + std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, + [this](unsigned Sum, const MachineInstr &MI) { + return Sum + getInstSizeInBytes(MI); + }); + + // Properties about candidate MBBs that hold for all of them. + unsigned FlagsSetInAll = 0xF; + + // Compute liveness information for each candidate, and set FlagsSetInAll. const TargetRegisterInfo &TRI = getRegisterInfo(); std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), - [&TRI](outliner::Candidate &C) { C.initLRU(TRI); }); + [&FlagsSetInAll](outliner::Candidate &C) { + FlagsSetInAll &= C.Flags; + }); // According to the AArch64 Procedure Call Standard, the following are // undefined on entry/exit from a function call: @@ -5125,23 +4905,31 @@ AArch64InstrInfo::getOutliningCandidateInfo( // of these registers is live into/across it. Thus, we need to delete // those // candidates. - auto CantGuaranteeValueAcrossCall = [](outliner::Candidate &C) { + auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) { + // If the unsafe registers in this block are all dead, then we don't need + // to compute liveness here. + if (C.Flags & UnsafeRegsDead) + return false; + C.initLRU(TRI); LiveRegUnits LRU = C.LRU; return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) || !LRU.available(AArch64::NZCV)); }; - // Erase every candidate that violates the restrictions above. (It could be - // true that we have viable candidates, so it's not worth bailing out in - // the case that, say, 1 out of 20 candidates violate the restructions.) - RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(), - RepeatedSequenceLocs.end(), - CantGuaranteeValueAcrossCall), - RepeatedSequenceLocs.end()); + // Are there any candidates where those registers are live? + if (!(FlagsSetInAll & UnsafeRegsDead)) { + // Erase every candidate that violates the restrictions above. (It could be + // true that we have viable candidates, so it's not worth bailing out in + // the case that, say, 1 out of 20 candidates violate the restructions.) + RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(), + RepeatedSequenceLocs.end(), + CantGuaranteeValueAcrossCall), + RepeatedSequenceLocs.end()); - // If the sequence is empty, we're done. - if (RepeatedSequenceLocs.empty()) - return outliner::OutlinedFunction(); + // If the sequence doesn't have enough candidates left, then we're done. + if (RepeatedSequenceLocs.size() < 2) + return outliner::OutlinedFunction(); + } // At this point, we have only "safe" candidates to outline. Figure out // frame + call instruction information. @@ -5162,6 +4950,60 @@ AArch64InstrInfo::getOutliningCandidateInfo( return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement"); }); + // Returns true if an instructions is safe to fix up, false otherwise. + auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) { + if (MI.isCall()) + return true; + + if (!MI.modifiesRegister(AArch64::SP, &TRI) && + !MI.readsRegister(AArch64::SP, &TRI)) + return true; + + // Any modification of SP will break our code to save/restore LR. + // FIXME: We could handle some instructions which add a constant + // offset to SP, with a bit more work. + if (MI.modifiesRegister(AArch64::SP, &TRI)) + return false; + + // At this point, we have a stack instruction that we might need to + // fix up. We'll handle it if it's a load or store. + if (MI.mayLoadOrStore()) { + MachineOperand *Base; // Filled with the base operand of MI. + int64_t Offset; // Filled with the offset of MI. + + // Does it allow us to offset the base operand and is the base the + // register SP? + if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() || + Base->getReg() != AArch64::SP) + return false; + + // Find the minimum/maximum offset for this instruction and check + // if fixing it up would be in range. + int64_t MinOffset, + MaxOffset; // Unscaled offsets for the instruction. + unsigned Scale; // The scale to multiply the offsets by. + unsigned DummyWidth; + getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); + + Offset += 16; // Update the offset to what it would be if we outlined. + if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale) + return false; + + // It's in range, so we can outline it. + return true; + } + + // FIXME: Add handling for instructions like "add x0, sp, #8". + + // We can't fix it up, so don't outline it. + return false; + }; + + // True if it's possible to fix up each stack instruction in this sequence. + // Important for frames/call variants that modify the stack. + bool AllStackInstrsSafe = std::all_of( + FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup); + // If the last instruction in any candidate is a terminator, then we should // tail call all of the candidates. if (RepeatedSequenceLocs[0].back()->isTerminator()) { @@ -5178,57 +5020,94 @@ AArch64InstrInfo::getOutliningCandidateInfo( SetCandidateCallInfo(MachineOutlinerThunk, 4); } - // Make sure that LR isn't live on entry to this candidate. The only - // instructions that use LR that could possibly appear in a repeated sequence - // are calls. Therefore, we only have to check and see if LR is dead on entry - // to (or exit from) some candidate. - else if (std::all_of(RepeatedSequenceLocs.begin(), - RepeatedSequenceLocs.end(), - [](outliner::Candidate &C) { - return C.LRU.available(AArch64::LR); - })) { - FrameID = MachineOutlinerNoLRSave; - NumBytesToCreateFrame = 4; - SetCandidateCallInfo(MachineOutlinerNoLRSave, 4); - } - - // LR is live, so we need to save it. Decide whether it should be saved to - // the stack, or if it can be saved to a register. else { - if (all_of(RepeatedSequenceLocs, [this](outliner::Candidate &C) { - return findRegisterToSaveLRTo(C); - })) { - // Every candidate has an available callee-saved register for the save. - // We can save LR to a register. - FrameID = MachineOutlinerRegSave; - NumBytesToCreateFrame = 4; - SetCandidateCallInfo(MachineOutlinerRegSave, 12); + // We need to decide how to emit calls + frames. We can always emit the same + // frame if we don't need to save to the stack. If we have to save to the + // stack, then we need a different frame. + unsigned NumBytesNoStackCalls = 0; + std::vector CandidatesWithoutStackFixups; + + for (outliner::Candidate &C : RepeatedSequenceLocs) { + C.initLRU(TRI); + + // Is LR available? If so, we don't need a save. + if (C.LRU.available(AArch64::LR)) { + NumBytesNoStackCalls += 4; + C.setCallInfo(MachineOutlinerNoLRSave, 4); + CandidatesWithoutStackFixups.push_back(C); + } + + // Is an unused register available? If so, we won't modify the stack, so + // we can outline with the same frame type as those that don't save LR. + else if (findRegisterToSaveLRTo(C)) { + NumBytesNoStackCalls += 12; + C.setCallInfo(MachineOutlinerRegSave, 12); + CandidatesWithoutStackFixups.push_back(C); + } + + // Is SP used in the sequence at all? If not, we don't have to modify + // the stack, so we are guaranteed to get the same frame. + else if (C.UsedInSequence.available(AArch64::SP)) { + NumBytesNoStackCalls += 12; + C.setCallInfo(MachineOutlinerDefault, 12); + CandidatesWithoutStackFixups.push_back(C); + } + + // If we outline this, we need to modify the stack. Pretend we don't + // outline this by saving all of its bytes. + else { + NumBytesNoStackCalls += SequenceSize; + } } - else { - // At least one candidate does not have an available callee-saved - // register. We must save LR to the stack. - FrameID = MachineOutlinerDefault; - NumBytesToCreateFrame = 4; + // If there are no places where we have to save LR, then note that we + // don't have to update the stack. Otherwise, give every candidate the + // default call type, as long as it's safe to do so. + if (!AllStackInstrsSafe || + NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { + RepeatedSequenceLocs = CandidatesWithoutStackFixups; + FrameID = MachineOutlinerNoLRSave; + } else { SetCandidateCallInfo(MachineOutlinerDefault, 12); } + + // If we dropped all of the candidates, bail out here. + if (RepeatedSequenceLocs.size() < 2) { + RepeatedSequenceLocs.clear(); + return outliner::OutlinedFunction(); + } } - // Check if the range contains a call. These require a save + restore of the - // link register. - if (std::any_of(RepeatedSequenceLocs[0].front(), - RepeatedSequenceLocs[0].back(), - [](const MachineInstr &MI) { return MI.isCall(); })) - NumBytesToCreateFrame += 8; // Save + restore the link register. + // Does every candidate's MBB contain a call? If so, then we might have a call + // in the range. + if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { + // Check if the range contains a call. These require a save + restore of the + // link register. + bool ModStackToSaveLR = false; + if (std::any_of(FirstCand.front(), FirstCand.back(), + [](const MachineInstr &MI) { return MI.isCall(); })) + ModStackToSaveLR = true; + + // Handle the last instruction separately. If this is a tail call, then the + // last instruction is a call. We don't want to save + restore in this case. + // However, it could be possible that the last instruction is a call without + // it being valid to tail call this sequence. We should consider this as + // well. + else if (FrameID != MachineOutlinerThunk && + FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) + ModStackToSaveLR = true; + + if (ModStackToSaveLR) { + // We can't fix up the stack. Bail out. + if (!AllStackInstrsSafe) { + RepeatedSequenceLocs.clear(); + return outliner::OutlinedFunction(); + } - // Handle the last instruction separately. If this is a tail call, then the - // last instruction is a call. We don't want to save + restore in this case. - // However, it could be possible that the last instruction is a call without - // it being valid to tail call this sequence. We should consider this as well. - else if (FrameID != MachineOutlinerThunk && - FrameID != MachineOutlinerTailCall && - RepeatedSequenceLocs[0].back()->isCall()) - NumBytesToCreateFrame += 8; + // Save + restore LR. + NumBytesToCreateFrame += 8; + } + } return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID); @@ -5260,29 +5139,70 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( return true; } -unsigned -AArch64InstrInfo::getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const { - unsigned Flags = 0x0; - // Check if there's a call inside this MachineBasicBlock. If there is, then - // set a flag. - if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); })) - Flags |= MachineOutlinerMBBFlags::HasCalls; - +bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, + unsigned &Flags) const { // Check if LR is available through all of the MBB. If it's not, then set // a flag. assert(MBB.getParent()->getRegInfo().tracksLiveness() && "Suitable Machine Function for outlining must track liveness"); LiveRegUnits LRU(getRegisterInfo()); - LRU.addLiveOuts(MBB); - std::for_each(MBB.rbegin(), - MBB.rend(), + std::for_each(MBB.rbegin(), MBB.rend(), [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); - if (!LRU.available(AArch64::LR)) - Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; + // Check if each of the unsafe registers are available... + bool W16AvailableInBlock = LRU.available(AArch64::W16); + bool W17AvailableInBlock = LRU.available(AArch64::W17); + bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV); + + // If all of these are dead (and not live out), we know we don't have to check + // them later. + if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock) + Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead; + + // Now, add the live outs to the set. + LRU.addLiveOuts(MBB); + + // If any of these registers is available in the MBB, but also a live out of + // the block, then we know outlining is unsafe. + if (W16AvailableInBlock && !LRU.available(AArch64::W16)) + return false; + if (W17AvailableInBlock && !LRU.available(AArch64::W17)) + return false; + if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV)) + return false; + + // Check if there's a call inside this MachineBasicBlock. If there is, then + // set a flag. + if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); })) + Flags |= MachineOutlinerMBBFlags::HasCalls; + + MachineFunction *MF = MBB.getParent(); + + // In the event that we outline, we may have to save LR. If there is an + // available register in the MBB, then we'll always save LR there. Check if + // this is true. + bool CanSaveLR = false; + const AArch64RegisterInfo *ARI = static_cast( + MF->getSubtarget().getRegisterInfo()); + + // Check if there is an available register across the sequence that we can + // use. + for (unsigned Reg : AArch64::GPR64RegClass) { + if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR && + Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) { + CanSaveLR = true; + break; + } + } - return Flags; + // Check if we have a register we can save LR to, and if LR was used + // somewhere. If both of those things are true, then we need to evaluate the + // safety of outlining stack instructions later. + if (!CanSaveLR && !LRU.available(AArch64::LR)) + Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; + + return true; } outliner::InstrType @@ -5405,108 +5325,19 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) return outliner::InstrType::Illegal; - // Does this use the stack? - if (MI.modifiesRegister(AArch64::SP, &RI) || - MI.readsRegister(AArch64::SP, &RI)) { - // True if there is no chance that any outlined candidate from this range - // could require stack fixups. That is, both - // * LR is available in the range (No save/restore around call) - // * The range doesn't include calls (No save/restore in outlined frame) - // are true. - // FIXME: This is very restrictive; the flags check the whole block, - // not just the bit we will try to outline. - bool MightNeedStackFixUp = - (Flags & (MachineOutlinerMBBFlags::LRUnavailableSomewhere | - MachineOutlinerMBBFlags::HasCalls)); - - // If this instruction is in a range where it *never* needs to be fixed - // up, then we can *always* outline it. This is true even if it's not - // possible to fix that instruction up. - // - // Why? Consider two equivalent instructions I1, I2 where both I1 and I2 - // use SP. Suppose that I1 sits within a range that definitely doesn't - // need stack fixups, while I2 sits in a range that does. - // - // First, I1 can be outlined as long as we *never* fix up the stack in - // any sequence containing it. I1 is already a safe instruction in the - // original program, so as long as we don't modify it we're good to go. - // So this leaves us with showing that outlining I2 won't break our - // program. - // - // Suppose I1 and I2 belong to equivalent candidate sequences. When we - // look at I2, we need to see if it can be fixed up. Suppose I2, (and - // thus I1) cannot be fixed up. Then I2 will be assigned an unique - // integer label; thus, I2 cannot belong to any candidate sequence (a - // contradiction). Suppose I2 can be fixed up. Then I1 can be fixed up - // as well, so we're good. Thus, I1 is always safe to outline. - // - // This gives us two things: first off, it buys us some more instructions - // for our search space by deeming stack instructions illegal only when - // they can't be fixed up AND we might have to fix them up. Second off, - // This allows us to catch tricky instructions like, say, - // %xi = ADDXri %sp, n, 0. We can't safely outline these since they might - // be paired with later SUBXris, which might *not* end up being outlined. - // If we mess with the stack to save something, then an ADDXri messes with - // it *after*, then we aren't going to restore the right something from - // the stack if we don't outline the corresponding SUBXri first. ADDXris and - // SUBXris are extremely common in prologue/epilogue code, so supporting - // them in the outliner can be a pretty big win! - if (!MightNeedStackFixUp) - return outliner::InstrType::Legal; - - // Any modification of SP will break our code to save/restore LR. - // FIXME: We could handle some instructions which add a constant offset to - // SP, with a bit more work. - if (MI.modifiesRegister(AArch64::SP, &RI)) - return outliner::InstrType::Illegal; - - // At this point, we have a stack instruction that we might need to fix - // up. We'll handle it if it's a load or store. - if (MI.mayLoadOrStore()) { - unsigned Base; // Filled with the base regiser of MI. - int64_t Offset; // Filled with the offset of MI. - unsigned DummyWidth; - - // Does it allow us to offset the base register and is the base SP? - if (!getMemOpBaseRegImmOfsWidth(MI, Base, Offset, DummyWidth, &RI) || - Base != AArch64::SP) - return outliner::InstrType::Illegal; - - // Find the minimum/maximum offset for this instruction and check if - // fixing it up would be in range. - int64_t MinOffset, MaxOffset; // Unscaled offsets for the instruction. - unsigned Scale; // The scale to multiply the offsets by. - getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); - - // TODO: We should really test what happens if an instruction overflows. - // This is tricky to test with IR tests, but when the outliner is moved - // to a MIR test, it really ought to be checked. - Offset += 16; // Update the offset to what it would be if we outlined. - if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale) - return outliner::InstrType::Illegal; - - // It's in range, so we can outline it. - return outliner::InstrType::Legal; - } - - // FIXME: Add handling for instructions like "add x0, sp, #8". - - // We can't fix it up, so don't outline it. - return outliner::InstrType::Illegal; - } - return outliner::InstrType::Legal; } void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { for (MachineInstr &MI : MBB) { - unsigned Base, Width; + MachineOperand *Base; + unsigned Width; int64_t Offset; // Is this a load or store with an immediate offset with SP as the base? if (!MI.mayLoadOrStore() || - !getMemOpBaseRegImmOfsWidth(MI, Base, Offset, Width, &RI) || - Base != AArch64::SP) + !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) || + (Base->isReg() && Base->getReg() != AArch64::SP)) continue; // It is, so we have to fix it up. @@ -5699,3 +5530,6 @@ bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( MachineFunction &MF) const { return MF.getFunction().optForMinSize(); } + +#define GET_INSTRINFO_HELPERS +#include "AArch64GenInstrInfo.inc" diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 43011dd4c3e57de43d692ee6fa164bd34ff337ea..9954669d567508d6ce83cb9d17d00dee077c1a91 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -62,14 +62,6 @@ public: unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - /// Returns true if there is a shiftable register and that the shift value - /// is non-zero. - static bool hasShiftedReg(const MachineInstr &MI); - - /// Returns true if there is an extendable register and that the extending - /// value is non-zero. - static bool hasExtendedReg(const MachineInstr &MI); - /// Does this instruction set its full destination register to zero? static bool isGPRZero(const MachineInstr &MI); @@ -79,11 +71,6 @@ public: /// Does this instruction rename an FPR without modifying bits? static bool isFPRCopy(const MachineInstr &MI); - /// Return true if this is load/store scales or extends its register offset. - /// This refers to scaling a dynamic index as opposed to scaled immediates. - /// MI should be a memory op that allows scaled addressing. - static bool isScaledAddr(const MachineInstr &MI); - /// Return true if pairing the given load or store is hinted to be /// unprofitable. static bool isLdStPairSuppressed(const MachineInstr &MI); @@ -110,13 +97,13 @@ public: /// Hint that pairing the given load or store is unprofitable. static void suppressLdStPair(MachineInstr &MI); - bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, - int64_t &Offset, - const TargetRegisterInfo *TRI) const override; + bool getMemOperandWithOffset(MachineInstr &MI, MachineOperand *&BaseOp, + int64_t &Offset, + const TargetRegisterInfo *TRI) const override; - bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg, - int64_t &Offset, unsigned &Width, - const TargetRegisterInfo *TRI) const; + bool getMemOperandWithOffsetWidth(MachineInstr &MI, MachineOperand *&BaseOp, + int64_t &Offset, unsigned &Width, + const TargetRegisterInfo *TRI) const; /// Return the immediate offset of the base register in a load/store \p LdSt. MachineOperand &getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const; @@ -128,8 +115,7 @@ public: bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width, int64_t &MinOffset, int64_t &MaxOffset) const; - bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1, - MachineInstr &SecondLdSt, unsigned BaseReg2, + bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2, unsigned NumLoads) const override; void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -246,7 +232,8 @@ public: std::vector &RepeatedSequenceLocs) const override; outliner::InstrType getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const override; - unsigned getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const override; + bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, + unsigned &Flags) const override; void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const override; MachineBasicBlock::iterator @@ -254,15 +241,6 @@ public: MachineBasicBlock::iterator &It, MachineFunction &MF, const outliner::Candidate &C) const override; bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; - /// Returns true if the instruction sets a constant value that can be - /// executed more efficiently. - static bool isExynosResetFast(const MachineInstr &MI); - /// Returns true if the load or store has an extension that can be executed - /// more efficiently. - static bool isExynosLdStExtFast(const MachineInstr &MI); - /// Returns true if the instruction has a constant shift left or extension - /// that can be executed more efficiently. - static bool isExynosShiftExtFast(const MachineInstr &MI); /// Returns true if the instruction has a shift by immediate that can be /// executed in one cycle less. static bool isFalkorShiftExtFast(const MachineInstr &MI); @@ -270,6 +248,9 @@ public: /// on Windows. static bool isSEHInstruction(const MachineInstr &MI); +#define GET_INSTRINFO_HELPER_DECLS +#include "AArch64GenInstrInfo.inc" + private: /// Sets the offsets on outlined instructions in \p MBB which use SP /// so that they will be valid post-outlining. diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 37d3967df44efcaa6affde19fe754d684f307775..ee43b55aed93508ae319f977e53c1195b8f96303 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -24,6 +24,54 @@ def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">, AssemblerPredicate<"HasV8_4aOps", "armv8.4a">; def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, AssemblerPredicate<"HasV8_5aOps", "armv8.5a">; +def HasVH : Predicate<"Subtarget->hasVH()">, + AssemblerPredicate<"FeatureVH", "vh">; + +def HasLOR : Predicate<"Subtarget->hasLOR()">, + AssemblerPredicate<"FeatureLOR", "lor">; + +def HasPA : Predicate<"Subtarget->hasPA()">, + AssemblerPredicate<"FeaturePA", "pa">; + +def HasJS : Predicate<"Subtarget->hasJS()">, + AssemblerPredicate<"FeatureJS", "jsconv">; + +def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">, + AssemblerPredicate<"FeatureCCIDX", "ccidx">; + +def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">, + AssemblerPredicate<"FeatureComplxNum", "complxnum">; + +def HasNV : Predicate<"Subtarget->hasNV()">, + AssemblerPredicate<"FeatureNV", "nv">; + +def HasRASv8_4 : Predicate<"Subtarget->hasRASv8_4()">, + AssemblerPredicate<"FeatureRASv8_4", "rasv8_4">; + +def HasMPAM : Predicate<"Subtarget->hasMPAM()">, + AssemblerPredicate<"FeatureMPAM", "mpam">; + +def HasDIT : Predicate<"Subtarget->hasDIT()">, + AssemblerPredicate<"FeatureDIT", "dit">; + +def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">, + AssemblerPredicate<"FeatureTRACEV8_4", "tracev8.4">; + +def HasAM : Predicate<"Subtarget->hasAM()">, + AssemblerPredicate<"FeatureAM", "am">; + +def HasSEL2 : Predicate<"Subtarget->hasSEL2()">, + AssemblerPredicate<"FeatureSEL2", "sel2">; + +def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">, + AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">; + +def HasFMI : Predicate<"Subtarget->hasFMI()">, + AssemblerPredicate<"FeatureFMI", "fmi">; + +def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">, + AssemblerPredicate<"FeatureRCPC_IMMO", "rcpc-immo">; + def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, @@ -510,7 +558,7 @@ def ISB : CRmSystemI { let CRm = 0b0010; let Inst{12} = 0; - let Predicates = [HasV8_4a]; + let Predicates = [HasTRACEV8_4]; } } @@ -602,7 +650,7 @@ let Uses = [LR], Defs = [LR], CRm = 0b0000 in { } // These pointer authentication isntructions require armv8.3a -let Predicates = [HasV8_3a] in { +let Predicates = [HasPA] in { multiclass SignAuth prefix, bits<3> prefix_z, string asm> { def IA : SignAuthOneData; def IB : SignAuthOneData; @@ -642,17 +690,17 @@ let Predicates = [HasV8_3a] in { defm LDRAA : AuthLoad<0, "ldraa", simm10Scaled>; defm LDRAB : AuthLoad<1, "ldrab", simm10Scaled>; - // v8.3a floating point conversion for javascript - let Predicates = [HasV8_3a, HasFPARMv8] in - def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, - "fjcvtzs", []> { - let Inst{31} = 0; - } +} -} // HasV8_3a +// v8.3a floating point conversion for javascript +let Predicates = [HasJS, HasFPARMv8] in +def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, + "fjcvtzs", []> { + let Inst{31} = 0; +} // HasJS, HasFPARMv8 // v8.4 Flag manipulation instructions -let Predicates = [HasV8_4a] in { +let Predicates = [HasFMI] in { def CFINV : SimpleSystemI<0, (ins), "cfinv", "">, Sched<[WriteSys]> { let Inst{20-5} = 0b0000001000000000; } @@ -660,7 +708,7 @@ def SETF8 : BaseFlagManipulation<0, 0, (ins GPR32:$Rn), "setf8", "{\t$Rn}">; def SETF16 : BaseFlagManipulation<0, 1, (ins GPR32:$Rn), "setf16", "{\t$Rn}">; def RMIF : FlagRotate<(ins GPR64:$Rn, uimm6:$imm, imm0_15:$mask), "rmif", "{\t$Rn, $imm, $mask}">; -} // HasV8_4a +} // HasFMI // v8.5 flag manipulation instructions let Predicates = [HasAltNZCV], Uses = [NZCV], Defs = [NZCV] in { @@ -2629,8 +2677,9 @@ defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32z, "sturb", [(truncstorei8 GPR32z:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; -// Armv8.4 LDAPR & STLR with Immediate Offset instruction -let Predicates = [HasV8_4a] in { +// Armv8.4 Weaker Release Consistency enhancements +// LDAPR & STLR with Immediate Offset instructions +let Predicates = [HasRCPC_IMMO] in { defm STLURB : BaseStoreUnscaleV84<"stlurb", 0b00, 0b00, GPR32>; defm STLURH : BaseStoreUnscaleV84<"stlurh", 0b01, 0b00, GPR32>; defm STLURW : BaseStoreUnscaleV84<"stlur", 0b10, 0b00, GPR32>; @@ -2915,7 +2964,7 @@ def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">; def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; -let Predicates = [HasV8_1a] in { +let Predicates = [HasLOR] in { // v8.1a "Limited Order Region" extension load-acquire instructions def LDLARW : LoadAcquire <0b10, 1, 1, 0, 0, GPR32, "ldlar">; def LDLARX : LoadAcquire <0b11, 1, 1, 0, 0, GPR64, "ldlar">; @@ -3175,6 +3224,20 @@ let isPseudo = 1 in { def SEH_EpilogEnd : Pseudo<(outs), (ins), []>, Sched<[]>; } +// Pseudo instructions for Windows EH +//===----------------------------------------------------------------------===// +let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + isCodeGenOnly = 1, isReturn = 1, isEHScopeReturn = 1, isPseudo = 1 in { + def CLEANUPRET : Pseudo<(outs), (ins), [(cleanupret)]>, Sched<[]>; + let usesCustomInserter = 1 in + def CATCHRET : Pseudo<(outs), (ins am_brcond:$dst, am_brcond:$src), [(catchret bb:$dst, bb:$src)]>, + Sched<[]>; +} + +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in +def CATCHPAD : Pseudo<(outs), (ins), [(catchpad)]>, Sched<[]>; + //===----------------------------------------------------------------------===// // Floating point immediate move. //===----------------------------------------------------------------------===// @@ -4094,25 +4157,6 @@ defm : Neon_mul_widen_patterns; -// Patterns for smull2/umull2. -multiclass Neon_mul_high_patterns { - def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm))), - (INST8B V128:$Rn, V128:$Rm)>; - def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm))), - (INST4H V128:$Rn, V128:$Rm)>; - def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm))), - (INST2S V128:$Rn, V128:$Rm)>; -} - -defm : Neon_mul_high_patterns; -defm : Neon_mul_high_patterns; - // Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL multiclass Neon_mulacc_widen_patterns { @@ -4335,6 +4379,12 @@ def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>; def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>; def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>; +// DUP from a 64-bit register to a 64-bit register is just a copy +def : Pat<(v1i64 (AArch64dup (i64 GPR64:$Rn))), + (COPY_TO_REGCLASS GPR64:$Rn, FPR64)>; +def : Pat<(v1f64 (AArch64dup (f64 FPR64:$Rn))), + (COPY_TO_REGCLASS FPR64:$Rn, FPR64)>; + def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))), (v2f32 (DUPv2i32lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), @@ -5970,6 +6020,41 @@ def : Pat<(i32 (trunc GPR64sp:$src)), // __builtin_trap() uses the BRK instruction on AArch64. def : Pat<(trap), (BRK 1)>; +// Multiply high patterns which multiply the lower subvector using smull/umull +// and the upper subvector with smull2/umull2. Then shuffle the high the high +// part of both results together. +def : Pat<(v16i8 (mulhs V128:$Rn, V128:$Rm)), + (UZP2v16i8 + (SMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (SMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>; +def : Pat<(v8i16 (mulhs V128:$Rn, V128:$Rm)), + (UZP2v8i16 + (SMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (SMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>; +def : Pat<(v4i32 (mulhs V128:$Rn, V128:$Rm)), + (UZP2v4i32 + (SMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (SMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>; + +def : Pat<(v16i8 (mulhu V128:$Rn, V128:$Rm)), + (UZP2v16i8 + (UMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (UMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>; +def : Pat<(v8i16 (mulhu V128:$Rn, V128:$Rm)), + (UZP2v8i16 + (UMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (UMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>; +def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)), + (UZP2v4i32 + (UMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>; + // Conversions within AdvSIMD types in the same register size are free. // But because we need a consistent lane ordering, in big endian many // conversions require one or more REV instructions. diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 46b8b4dc6486eb4ae89b6ba8b9ba15657658ae30..6cbfb6ab161b25e4ce8526fd7007fcc5e6eb78ed 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -65,6 +65,15 @@ private: bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; + // Helper to generate an equivalent of scalar_to_vector into a new register, + // returned via 'Dst'. + bool emitScalarToVector(unsigned &Dst, const LLT DstTy, + const TargetRegisterClass *DstRC, unsigned Scalar, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineRegisterInfo &MRI) const; + bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const; + ComplexRendererFns selectArithImmed(MachineOperand &Root) const; ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, @@ -667,7 +676,7 @@ void AArch64InstructionSelector::materializeLargeCMVal( MachineRegisterInfo &MRI = MF.getRegInfo(); MachineIRBuilder MIB(I); - auto MovZ = MIB.buildInstr(AArch64::MOVZXi, &AArch64::GPR64RegClass); + auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {}); MovZ->addOperand(MF, I.getOperand(1)); MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | AArch64II::MO_NC); @@ -779,16 +788,36 @@ bool AArch64InstructionSelector::select(MachineInstr &I, const unsigned CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); - if (selectCompareBranch(I, MF, MRI)) + // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z + // instructions will not be produced, as they are conditional branch + // instructions that do not set flags. + bool ProduceNonFlagSettingCondBr = + !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); + if (ProduceNonFlagSettingCondBr && selectCompareBranch(I, MF, MRI)) return true; - auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW)) - .addUse(CondReg) - .addImm(/*bit offset=*/0) - .addMBB(DestMBB); + if (ProduceNonFlagSettingCondBr) { + auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW)) + .addUse(CondReg) + .addImm(/*bit offset=*/0) + .addMBB(DestMBB); - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); + } else { + auto CMP = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri)) + .addDef(AArch64::WZR) + .addUse(CondReg) + .addImm(1); + constrainSelectedInstRegOperands(*CMP.getInstr(), TII, TRI, RBI); + auto Bcc = + BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::Bcc)) + .addImm(AArch64CC::EQ) + .addMBB(DestMBB); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Bcc.getInstr(), TII, TRI, RBI); + } } case TargetOpcode::G_BRINDIRECT: { @@ -1013,12 +1042,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I, } unsigned MemSizeInBits = MemOp.getSize() * 8; - // FIXME: PR36018: Volatile loads in some cases are incorrectly selected by - // folding with an extend. Until we have a G_SEXTLOAD solution bail out if - // we hit one. - if (Opcode == TargetOpcode::G_LOAD && MemOp.isVolatile()) - return false; - const unsigned PtrReg = I.getOperand(1).getReg(); #ifndef NDEBUG const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); @@ -1528,11 +1551,130 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); } } + case TargetOpcode::G_BUILD_VECTOR: + return selectBuildVector(I, MRI); } return false; } +bool AArch64InstructionSelector::emitScalarToVector( + unsigned &Dst, const LLT DstTy, const TargetRegisterClass *DstRC, + unsigned Scalar, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, MachineRegisterInfo &MRI) const { + Dst = MRI.createVirtualRegister(DstRC); + + unsigned UndefVec = MRI.createVirtualRegister(DstRC); + MachineInstr &UndefMI = *BuildMI(MBB, MBBI, MBBI->getDebugLoc(), + TII.get(TargetOpcode::IMPLICIT_DEF)) + .addDef(UndefVec); + + auto BuildFn = [&](unsigned SubregIndex) { + MachineInstr &InsMI = *BuildMI(MBB, MBBI, MBBI->getDebugLoc(), + TII.get(TargetOpcode::INSERT_SUBREG)) + .addDef(Dst) + .addUse(UndefVec) + .addUse(Scalar) + .addImm(SubregIndex); + constrainSelectedInstRegOperands(UndefMI, TII, TRI, RBI); + return constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); + }; + + switch (DstTy.getElementType().getSizeInBits()) { + case 32: + return BuildFn(AArch64::ssub); + case 64: + return BuildFn(AArch64::dsub); + default: + return false; + } +} + +bool AArch64InstructionSelector::selectBuildVector( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); + // Until we port more of the optimized selections, for now just use a vector + // insert sequence. + const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + const LLT EltTy = MRI.getType(I.getOperand(1).getReg()); + unsigned EltSize = EltTy.getSizeInBits(); + if (EltSize < 32 || EltSize > 64) + return false; // Don't support all element types yet. + const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); + unsigned Opc; + unsigned SubregIdx; + if (RB.getID() == AArch64::GPRRegBankID) { + if (EltSize == 32) { + Opc = AArch64::INSvi32gpr; + SubregIdx = AArch64::ssub; + } else { + Opc = AArch64::INSvi64gpr; + SubregIdx = AArch64::dsub; + } + } else { + if (EltSize == 32) { + Opc = AArch64::INSvi32lane; + SubregIdx = AArch64::ssub; + } else { + Opc = AArch64::INSvi64lane; + SubregIdx = AArch64::dsub; + } + } + + if (EltSize * DstTy.getNumElements() != 128) + return false; // Don't handle unpacked vectors yet. + + unsigned DstVec = 0; + const TargetRegisterClass *DstRC = getRegClassForTypeOnBank( + DstTy, RBI.getRegBank(AArch64::FPRRegBankID), RBI); + emitScalarToVector(DstVec, DstTy, DstRC, I.getOperand(1).getReg(), + *I.getParent(), I.getIterator(), MRI); + for (unsigned i = 2, e = DstTy.getSizeInBits() / EltSize + 1; i < e; ++i) { + unsigned InsDef; + // For the last insert re-use the dst reg of the G_BUILD_VECTOR. + if (i + 1 < e) + InsDef = MRI.createVirtualRegister(DstRC); + else + InsDef = I.getOperand(0).getReg(); + unsigned LaneIdx = i - 1; + if (RB.getID() == AArch64::FPRRegBankID) { + unsigned ImpDef = MRI.createVirtualRegister(DstRC); + MachineInstr &ImpDefMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::IMPLICIT_DEF)) + .addDef(ImpDef); + unsigned InsSubDef = MRI.createVirtualRegister(DstRC); + MachineInstr &InsSubMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::INSERT_SUBREG)) + .addDef(InsSubDef) + .addUse(ImpDef) + .addUse(I.getOperand(i).getReg()) + .addImm(SubregIdx); + MachineInstr &InsEltMI = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc)) + .addDef(InsDef) + .addUse(DstVec) + .addImm(LaneIdx) + .addUse(InsSubDef) + .addImm(0); + constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(InsSubMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(InsEltMI, TII, TRI, RBI); + DstVec = InsDef; + } else { + MachineInstr &InsMI = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc)) + .addDef(InsDef) + .addUse(DstVec) + .addImm(LaneIdx) + .addUse(I.getOperand(i).getReg()); + constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI); + DstVec = InsDef; + } + } + I.eraseFromParent(); + return true; +} + /// SelectArithImmed - Select an immediate value that can be represented as /// a 12-bit value shifted left by either 0 or 12. If so, return true with /// Val set to the 12-bit value and Shift set to the shifter operand. diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 4b5e10ac4ec503d025f2f58e58c882c2c1581ecc..5d63f0c244ea32f2f57b5adabc4f0353b3f39258 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -48,7 +48,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { const LLT v2s64 = LLT::vector(2, 64); getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalFor({p0, s1, s8, s16, s32, s64}) + .legalFor({p0, s1, s8, s16, s32, s64, v2s64}) .clampScalar(0, s1, s64) .widenScalarToNextPow2(0, 8); @@ -398,13 +398,26 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { return VecTy == v4s32 || VecTy == v2s64; }); + getActionDefinitionsBuilder(G_BUILD_VECTOR) + .legalFor({{v4s32, s32}, {v2s64, s64}}) + .clampNumElements(0, v4s32, v4s32) + .clampNumElements(0, v2s64, v2s64) + + // Deal with larger scalar types, which will be implicitly truncated. + .legalIf([=](const LegalityQuery &Query) { + return Query.Types[0].getScalarSizeInBits() < + Query.Types[1].getSizeInBits(); + }) + .minScalarSameAs(1, 0); + computeTables(); verify(*ST.getInstrInfo()); } bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { switch (MI.getOpcode()) { default: // No idea what to do. diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h index a745b0edbc6d943e0af989863005bc11d20bcec6..77e8bdc7623c9ba2a1617631cfa9eeb9eca60d9e 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.h +++ b/lib/Target/AArch64/AArch64LegalizerInfo.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H #define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" namespace llvm { @@ -28,7 +29,8 @@ public: AArch64LegalizerInfo(const AArch64Subtarget &ST); bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const override; + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const override; private: bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index 6e1824f632eea335ff1cc7c05fd2cb9fc45f1489..07295989ec8e29d888aa3f902cee13fa665fc02b 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -269,4 +269,17 @@ void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { if (lowerOperand(MO, MCOp)) OutMI.addOperand(MCOp); } + + switch (OutMI.getOpcode()) { + case AArch64::CATCHRET: + OutMI = MCInst(); + OutMI.setOpcode(AArch64::RET); + OutMI.addOperand(MCOperand::createReg(AArch64::LR)); + break; + case AArch64::CLEANUPRET: + OutMI = MCInst(); + OutMI.setOpcode(AArch64::RET); + OutMI.addOperand(MCOperand::createReg(AArch64::LR)); + break; + } } diff --git a/lib/Target/AArch64/AArch64PfmCounters.td b/lib/Target/AArch64/AArch64PfmCounters.td new file mode 100644 index 0000000000000000000000000000000000000000..16ba3e4282a0d0fd9adb232a634b95b274b77da6 --- /dev/null +++ b/lib/Target/AArch64/AArch64PfmCounters.td @@ -0,0 +1,19 @@ +//===-- AArch64PfmCounters.td - AArch64 Hardware Counters --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the available hardware counters for AArch64. +// +//===----------------------------------------------------------------------===// + +def CpuCyclesPfmCounter : PfmCounter<"CPU_CYCLES">; + +def DefaultPfmCounters : ProcPfmCounters { + let CycleCounter = CpuCyclesPfmCounter; +} +def : PfmCountersDefaultBinding; diff --git a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp index 32487b9ccc330998972efcf99e6f99e6c795eec1..5022b6221ab99029caa1e54089da7928480ce2ff 100644 --- a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ -32,11 +32,11 @@ public: AArch64PreLegalizerCombinerInfo() : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, /*LegalizerInfo*/ nullptr) {} - virtual bool combine(CombinerChangeObserver &Observer, MachineInstr &MI, + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const override; }; -bool AArch64PreLegalizerCombinerInfo::combine(CombinerChangeObserver &Observer, +bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const { CombinerHelper Helper(Observer, B); diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 0bab5c05ba6696dcb50e74a226fd257d7221cfa0..96ae45ae3d0dcd7d074fc4294ee823c3ed8a7d21 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -161,6 +161,10 @@ void AArch64RegisterInfo::UpdateCustomCallPreservedMask(MachineFunction &MF, *Mask = UpdatedMask; } +const uint32_t *AArch64RegisterInfo::getNoPreservedMask() const { + return CSR_AArch64_NoRegs_RegMask; +} + const uint32_t * AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { @@ -199,6 +203,10 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { if (hasBasePointer(MF)) markSuperRegs(Reserved, AArch64::W19); + // SLH uses register W16/X16 as the taint register. + if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) + markSuperRegs(Reserved, AArch64::W16); + assert(checkAllSuperRegsMarked(Reserved)); return Reserved; } @@ -251,14 +259,15 @@ unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; } bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - // In the presence of variable sized objects, if the fixed stack size is - // large enough that referencing from the FP won't result in things being - // in range relatively often, we can use a base pointer to allow access + // In the presence of variable sized objects or funclets, if the fixed stack + // size is large enough that referencing from the FP won't result in things + // being in range relatively often, we can use a base pointer to allow access // from the other direction like the SP normally works. + // // Furthermore, if both variable sized objects are present, and the // stack needs to be dynamically re-aligned, the base pointer is the only // reliable way to reference the locals. - if (MFI.hasVarSizedObjects()) { + if (MFI.hasVarSizedObjects() || MF.hasEHFunclets()) { if (needsStackRealignment(MF)) return true; // Conservatively estimate whether the negative offset from the frame diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h index 4653c7af59d0ec1b7c1f5c8b3ed4dbe09da34014..c4153228a7c0467fd3ede037bfd0208c1543d0f3 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/lib/Target/AArch64/AArch64RegisterInfo.h @@ -65,6 +65,9 @@ public: // normal calls, so they need a different mask to represent this. const uint32_t *getTLSCallPreservedMask() const; + // Funclets on ARM64 Windows don't preserve any registers. + const uint32_t *getNoPreservedMask() const override; + /// getThisReturnPreservedMask - Returns a call preserved mask specific to the /// case that 'returned' is on an i64 first argument if the calling convention /// is one that can (partially) model this attribute with a preserved mask diff --git a/lib/Target/AArch64/AArch64SchedExynosM1.td b/lib/Target/AArch64/AArch64SchedExynosM1.td index d566a13dc67dc1193889bac1179c2284f7c85106..7f1c2d4c7648281cc6bfb308608c68ce15476540 100644 --- a/lib/Target/AArch64/AArch64SchedExynosM1.td +++ b/lib/Target/AArch64/AArch64SchedExynosM1.td @@ -61,14 +61,6 @@ def M1UnitALU : ProcResGroup<[M1UnitA, def M1UnitNALU : ProcResGroup<[M1UnitNAL0, M1UnitNAL1]>; // All simple vector -//===----------------------------------------------------------------------===// -// Predicates. - -def M1BranchLinkPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR && - MI->getOperand(0).getReg() != AArch64::LR}]>; -def M1LdStExtPred : SchedPredicate<[{TII->isExynosLdStExtFast(*MI)}]>; -def M1ShiftExtPred : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>; - //===----------------------------------------------------------------------===// // Coarse scheduling model. @@ -86,14 +78,16 @@ def M1WriteAC : SchedWriteRes<[M1UnitALU, def M1WriteAD : SchedWriteRes<[M1UnitALU, M1UnitC]> { let Latency = 2; let NumMicroOps = 2; } -def M1WriteAX : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M1WriteAX : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M1WriteAY : SchedWriteVariant<[SchedVar, + SchedVar]>; def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; } def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; } -def M1WriteBX : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M1WriteBX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; } def M1WriteL6 : SchedWriteRes<[M1UnitL]> { let Latency = 6; } @@ -111,40 +105,27 @@ def M1WriteLD : SchedWriteRes<[M1UnitL, let ResourceCycles = [2, 1]; } def M1WriteLH : SchedWriteRes<[]> { let Latency = 5; let NumMicroOps = 0; } -def M1WriteLX : SchedWriteVariant<[SchedVar, - SchedVar]>; -def M1WriteLY : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M1WriteLX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; } def M1WriteS3 : SchedWriteRes<[M1UnitS]> { let Latency = 3; } def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } def M1WriteSA : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST]> { let Latency = 1; - let NumMicroOps = 2; } -def M1WriteSB : SchedWriteRes<[M1UnitS, M1UnitFST, M1UnitA]> { let Latency = 3; let NumMicroOps = 2; } -def M1WriteSC : SchedWriteRes<[M1UnitS, +def M1WriteSB : SchedWriteRes<[M1UnitS, M1UnitFST, M1UnitS, M1UnitFST, M1UnitA]> { let Latency = 3; let NumMicroOps = 3; } -def M1WriteSD : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitA]> { let Latency = 1; - let NumMicroOps = 2; } -def M1WriteSE : SchedWriteRes<[M1UnitS, +def M1WriteSC : SchedWriteRes<[M1UnitS, M1UnitA]> { let Latency = 2; let NumMicroOps = 2; } -def M1WriteSX : SchedWriteVariant<[SchedVar, - SchedVar]>; -def M1WriteSY : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M1WriteSX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M1ReadAdrBase : SchedReadVariant<[SchedVar, SchedVar]>; @@ -415,9 +396,9 @@ def M1WriteVSTH : SchedWriteRes<[M1UnitNALU, M1UnitS, M1UnitFST, M1UnitFST, - M1UnitFST]> { let Latency = 14; - let NumMicroOps = 4; - let ResourceCycles = [1, 7, 1, 7, 1]; } + M1UnitFST]> { let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [1, 7, 1, 7, 1]; } def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, M1UnitS, M1UnitFST, @@ -428,9 +409,17 @@ def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, M1UnitS, M1UnitFST, M1UnitFST, - M1UnitFST]> { let Latency = 17; - let NumMicroOps = 7; - let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1]; } + M1UnitFST]> { let Latency = 17; + let NumMicroOps = 7; + let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1]; } + +// Special cases. +def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } +def M1WriteCOPY : SchedWriteVariant<[SchedVar, + SchedVar]>; + +// Fast forwarding. +def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>; // Branch instructions def : InstRW<[M1WriteB1], (instrs Bcc)>; @@ -440,8 +429,11 @@ def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>; def : InstRW<[M1WriteAD], (instregex "^TBN?Z[WX]")>; // Arithmetic and logical integer instructions. -def : InstRW<[M1WriteA1], (instrs COPY)>; -def : InstRW<[M1WriteAX], (instregex ".+r[sx](64)?$")>; +def : InstRW<[M1WriteAX], (instregex ".+rx(64)?$")>; +def : InstRW<[M1WriteAY], (instregex ".+rs$")>; + +// Move instructions. +def : InstRW<[M1WriteCOPY], (instrs COPY)>; // Divide and multiply instructions. @@ -451,10 +443,20 @@ def : InstRW<[M1WriteAX], (instregex ".+r[sx](64)?$")>; def : InstRW<[M1WriteLB, WriteLDHi, WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>; -def : InstRW<[M1WriteLX, - ReadAdrBase], (instregex "^PRFMro[WX]")>; +def : InstRW<[M1WriteLC, + ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roW")>; +def : InstRW<[M1WriteL5, + ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roX")>; +def : InstRW<[M1WriteLC, + ReadAdrBase], (instrs PRFMroW)>; +def : InstRW<[M1WriteL5, + ReadAdrBase], (instrs PRFMroX)>; // Store instructions. +def : InstRW<[M1WriteSC, + ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>; +def : InstRW<[WriteST, + ReadAdrBase], (instregex "^STR(BB|HH|W|X)roX")>; // FP data instructions. def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>; @@ -488,8 +490,10 @@ def : InstRW<[WriteVLD], (instregex "^LDUR[BDHSQ]i")>; def : InstRW<[WriteVLD, WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>; def : InstRW<[WriteVLD], (instregex "^LDR[BDHSQ]ui")>; -def : InstRW<[M1WriteLY, - ReadAdrBase], (instregex "^LDR[BDHS]ro[WX]")>; +def : InstRW<[M1WriteLD, + ReadAdrBase], (instregex "^LDR[BDHS]roW")>; +def : InstRW<[WriteVLD, + ReadAdrBase], (instregex "^LDR[BDHS]roX")>; def : InstRW<[M1WriteLD, ReadAdrBase], (instregex "^LDRQro[WX]")>; def : InstRW<[WriteVLD, @@ -508,14 +512,16 @@ def : InstRW<[WriteVST], (instregex "^STUR[BDHSQ]i")>; def : InstRW<[WriteVST, WriteAdr], (instregex "^STR[BDHSQ](post|pre)")>; def : InstRW<[WriteVST], (instregex "^STR[BDHSQ]ui")>; -def : InstRW<[M1WriteSY, - ReadAdrBase], (instregex "^STR[BDHS]ro[WX]")>; -def : InstRW<[M1WriteSB, +def : InstRW<[M1WriteSA, + ReadAdrBase], (instregex "^STR[BDHS]roW")>; +def : InstRW<[WriteVST, + ReadAdrBase], (instregex "^STR[BDHS]roX")>; +def : InstRW<[M1WriteSA, ReadAdrBase], (instregex "^STRQro[WX]")>; def : InstRW<[WriteVST], (instregex "^STN?P[DSQ]i")>; def : InstRW<[WriteVST, WriteAdr], (instregex "^STP[DS](post|pre)")>; -def : InstRW<[M1WriteSC, +def : InstRW<[M1WriteSB, WriteAdr], (instregex "^STPQ(post|pre)")>; // ASIMD instructions. @@ -609,21 +615,21 @@ def : InstRW<[M1WriteVLDE], (instregex "LD1i(64)$")>; def : InstRW<[M1WriteVLDE, WriteAdr], (instregex "LD1i(64)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Rv(8b|4h|2s)$")>; -def : InstRW<[M1WriteL5, +def : InstRW<[WriteVLD], (instregex "LD1Rv(8b|4h|2s)$")>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Rv(1d)$")>; -def : InstRW<[M1WriteL5, +def : InstRW<[WriteVLD], (instregex "LD1Rv(1d)$")>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(1d)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteL5, +def : InstRW<[WriteVLD], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>; -def : InstRW<[M1WriteL5, +def : InstRW<[WriteVLD], (instregex "LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteL5, +def : InstRW<[WriteVLD], (instregex "LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>; def : InstRW<[M1WriteVLDA, @@ -831,8 +837,6 @@ def : InstRW<[M1WriteVSTI, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; // Cryptography instructions. -def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } -def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>; def : InstRW<[M1WriteAES], (instregex "^AES[DE]")>; def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AESI?MC")>; diff --git a/lib/Target/AArch64/AArch64SchedExynosM3.td b/lib/Target/AArch64/AArch64SchedExynosM3.td index e61fb611ab245bc2a2b5b52d5f9969f24dce20c9..fd19ff84f1e010d40b16fdc6917d003bc3fb985b 100644 --- a/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -103,20 +103,6 @@ def M3UnitNSHF : ProcResGroup<[M3UnitNSHF0, M3UnitNSHF1, M3UnitNSHF2]>; -//===----------------------------------------------------------------------===// -// Predicates. - -def M3BranchLinkPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR && - MI->getOperand(0).isReg() && - MI->getOperand(0).getReg() != AArch64::LR}]>; -def M3ResetPred : SchedPredicate<[{TII->isExynosResetFast(*MI)}]>; -def M3RotatePred : SchedPredicate<[{(MI->getOpcode() == AArch64::EXTRWrri || - MI->getOpcode() == AArch64::EXTRXrri) && - MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && - MI->getOperand(1).getReg() == MI->getOperand(2).getReg()}]>; -def M3LdStExtPred : SchedPredicate<[{TII->isExynosLdStExtFast(*MI)}]>; -def M3ShiftExtPred : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>; - //===----------------------------------------------------------------------===// // Coarse scheduling model. @@ -138,15 +124,23 @@ def M3WriteAD : SchedWriteRes<[M3UnitALU, let NumMicroOps = 2; } def M3WriteC1 : SchedWriteRes<[M3UnitC]> { let Latency = 1; } def M3WriteC2 : SchedWriteRes<[M3UnitC]> { let Latency = 2; } -def M3WriteAX : SchedWriteVariant<[SchedVar, - SchedVar, - SchedVar]>; -def M3WriteAY : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M3WriteAU : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar]>; +def M3WriteAV : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteAW : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteAX : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteAY : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteAZ : SchedWriteVariant<[SchedVar, + SchedVar]>; def M3WriteB1 : SchedWriteRes<[M3UnitB]> { let Latency = 1; } -def M3WriteBX : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M3WriteBX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M3WriteL4 : SchedWriteRes<[M3UnitL]> { let Latency = 4; } def M3WriteL5 : SchedWriteRes<[M3UnitL]> { let Latency = 5; } @@ -163,29 +157,23 @@ def M3WriteLC : SchedWriteRes<[M3UnitA, def M3WriteLD : SchedWriteRes<[M3UnitA, M3UnitL]> { let Latency = 4; let NumMicroOps = 2; } +def M3WriteLE : SchedWriteRes<[M3UnitA, + M3UnitL]> { let Latency = 6; + let NumMicroOps = 2; } def M3WriteLH : SchedWriteRes<[]> { let Latency = 5; let NumMicroOps = 0; } - -def M3WriteLX : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M3WriteLX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M3WriteS1 : SchedWriteRes<[M3UnitS]> { let Latency = 1; } def M3WriteSA : SchedWriteRes<[M3UnitA, M3UnitS, - M3UnitFST]> { let Latency = 2; + M3UnitFST]> { let Latency = 3; let NumMicroOps = 2; } def M3WriteSB : SchedWriteRes<[M3UnitA, - M3UnitS]> { let Latency = 1; - let NumMicroOps = 2; } -def M3WriteSC : SchedWriteRes<[M3UnitA, M3UnitS]> { let Latency = 2; let NumMicroOps = 2; } -def M3WriteSX : SchedWriteVariant<[SchedVar, - SchedVar]>; -def M3WriteSY : SchedWriteVariant<[SchedVar, - SchedVar]>; - def M3ReadAdrBase : SchedReadVariant<[SchedVar, SchedVar]>; @@ -214,9 +202,7 @@ def : WriteRes { let Latency = 4; let ResourceCycles = [2]; } // Miscellaneous instructions. -def : WriteRes { let Latency = 1; - let NumMicroOps = 2; } +def : SchedAlias; // Addressing modes. def : WriteRes { let Latency = 1; @@ -227,13 +213,13 @@ def : SchedAlias; def : SchedAlias; def : WriteRes { let Latency = 4; let NumMicroOps = 0; } -def : SchedAlias; +def : SchedAlias; // Store instructions. def : SchedAlias; def : SchedAlias; def : SchedAlias; -def : SchedAlias; +def : SchedAlias; // FP data instructions. def : WriteRes { let Latency = 2; } @@ -243,7 +229,6 @@ def : WriteRes { let Latency = 12; def : WriteRes { let Latency = 4; } // FP miscellaneous instructions. -// TODO: Conversion between register files is much different. def : WriteRes { let Latency = 3; } def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 1; } @@ -479,11 +464,15 @@ def M3WriteVSTI : SchedWriteRes<[M3UnitNALU, // Special cases. def M3WriteAES : SchedWriteRes<[M3UnitNCRY]> { let Latency = 1; } +def M3WriteCOPY : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteMOVI : SchedWriteVariant<[SchedVar, + SchedVar]>; + +// Fast forwarding. def M3ReadAES : SchedReadAdvance<1, [M3WriteAES]>; def M3ReadFMAC : SchedReadAdvance<1, [M3WriteFMAC4, M3WriteFMAC5]>; -def M3WriteMOVI : SchedWriteVariant<[SchedVar, - SchedVar]>; def M3ReadNMUL : SchedReadAdvance<1, [M3WriteNMUL3]>; // Branch instructions @@ -494,29 +483,40 @@ def : InstRW<[M3WriteC1], (instregex "^CBN?Z[WX]")>; def : InstRW<[M3WriteAD], (instregex "^TBN?Z[WX]")>; // Arithmetic and logical integer instructions. -def : InstRW<[M3WriteA1], (instrs COPY)>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|SUB)S?Xrx64")>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]$")>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|BIC|SUB)S[WX]r[sx]$")>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|EOR|ORR|SUB)[WX]ri")>; +def : InstRW<[M3WriteAZ], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|SUB)[WX]rs$")>; +def : InstRW<[M3WriteAU], (instrs ORRWrs, ORRXrs)>; +def : InstRW<[M3WriteAX], (instregex "^(ADD|SUB)S?[WX]rx(64)?$")>; +def : InstRW<[M3WriteAZ], (instregex "^(ADD|AND|BIC|SUB)S[WX]rs$")>; +def : InstRW<[M3WriteAV], (instrs ADDWri, ADDXri)>; +def : InstRW<[M3WriteAW], (instrs ORRWri, ORRXri)>; // Move instructions. -def : InstRW<[M3WriteZ0], (instrs ADR, ADRP)>; -def : InstRW<[M3WriteZ0], (instregex "^MOV[NZ][WX]i")>; +def : InstRW<[M3WriteCOPY], (instrs COPY)>; +def : InstRW<[M3WriteZ0], (instrs ADR, ADRP)>; +def : InstRW<[M3WriteZ0], (instregex "^MOV[NZ][WX]i")>; // Divide and multiply instructions. // Miscellaneous instructions. -def : InstRW<[M3WriteAY], (instrs EXTRWrri, EXTRXrri)>; // Load instructions. def : InstRW<[M3WriteLD, WriteLDHi, WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>; +def : InstRW<[M3WriteLB, + ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roW")>; def : InstRW<[M3WriteLX, - ReadAdrBase], (instregex "^PRFMro[WX]")>; + ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roX")>; +def : InstRW<[M3WriteLB, + ReadAdrBase], (instrs PRFMroW)>; +def : InstRW<[M3WriteLX, + ReadAdrBase], (instrs PRFMroX)>; // Store instructions. +def : InstRW<[M3WriteSB, + ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>; +def : InstRW<[WriteST, + ReadAdrBase], (instregex "^STR(BB|HH|W|X)roX")>; // FP data instructions. def : InstRW<[M3WriteNSHF1], (instregex "^FABS[DS]r")>; @@ -553,9 +553,11 @@ def : InstRW<[WriteVLD], (instregex "^LDUR[BDHSQ]i")>; def : InstRW<[WriteVLD, WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>; def : InstRW<[WriteVLD], (instregex "^LDR[BDHSQ]ui")>; -def : InstRW<[M3WriteLX, - ReadAdrBase], (instregex "^LDR[BDHS]ro[WX]")>; -def : InstRW<[M3WriteLB, +def : InstRW<[M3WriteLE, + ReadAdrBase], (instregex "^LDR[BDHS]roW")>; +def : InstRW<[WriteVLD, + ReadAdrBase], (instregex "^LDR[BDHS]roX")>; +def : InstRW<[M3WriteLE, ReadAdrBase], (instregex "^LDRQro[WX]")>; def : InstRW<[WriteVLD, M3WriteLH], (instregex "^LDN?P[DS]i")>; @@ -573,8 +575,10 @@ def : InstRW<[WriteVST], (instregex "^STUR[BDHSQ]i")>; def : InstRW<[WriteVST, WriteAdr], (instregex "^STR[BDHSQ](post|pre)")>; def : InstRW<[WriteVST], (instregex "^STR[BDHSQ]ui")>; -def : InstRW<[M3WriteSY, - ReadAdrBase], (instregex "^STR[BDHS]ro[WX]")>; +def : InstRW<[M3WriteSA, + ReadAdrBase], (instregex "^STR[BDHS]roW")>; +def : InstRW<[WriteVST, + ReadAdrBase], (instregex "^STR[BDHS]roX")>; def : InstRW<[M3WriteSA, ReadAdrBase], (instregex "^STRQro[WX]")>; def : InstRW<[WriteVST], (instregex "^STN?P[DSQ]i")>; diff --git a/lib/Target/AArch64/AArch64SchedPredExynos.td b/lib/Target/AArch64/AArch64SchedPredExynos.td new file mode 100644 index 0000000000000000000000000000000000000000..f8533d18022af3108248f201451a0301b66f34b0 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedPredExynos.td @@ -0,0 +1,139 @@ +//===- AArch64SchedPredExynos.td - AArch64 Sched Preds -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines scheduling predicate definitions that are used by the +// AArch64 Exynos processors. +// +//===----------------------------------------------------------------------===// + +// Auxiliary predicates. + +// Check the shift in arithmetic and logic instructions. +def ExynosCheckShift : CheckAny<[CheckShiftBy0, + CheckAll< + [CheckShiftLSL, + CheckAny< + [CheckShiftBy1, + CheckShiftBy2, + CheckShiftBy3]>]>]>; + +// Exynos predicates. + +// Identify BLR specifying the LR register as the indirect target register. +def ExynosBranchLinkLRPred : MCSchedPredicate< + CheckAll<[CheckOpcode<[BLR]>, + CheckRegOperand<0, LR>]>>; + +// Identify arithmetic and logic instructions without or with limited extension. +def ExynosExtFn : TIIPredicate< + "isExynosExtFast", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsArithExtOp.ValidOpcodes, + MCReturnStatement< + CheckAny<[CheckExtBy0, + CheckAll< + [CheckAny< + [CheckExtUXTW, + CheckExtUXTX]>, + CheckAny< + [CheckExtBy1, + CheckExtBy2, + CheckExtBy3]>]>]>>>], + MCReturnStatement>>; +def ExynosExtPred : MCSchedPredicate; + +// Identify a load or store using the register offset addressing mode +// with a scaled non-extended register. +def ExynosScaledIdxFn : TIIPredicate<"isExynosScaledAddr", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsLoadStoreRegOffsetOp.ValidOpcodes, + MCReturnStatement< + CheckAny< + [CheckMemExtSXTW, + CheckMemExtUXTW, + CheckMemScaled]>>>], + MCReturnStatement>>; +def ExynosScaledIdxPred : MCSchedPredicate; + +// Identify FP instructions. +def ExynosFPPred : MCSchedPredicate>; + +// Identify whether an instruction whose result is a long vector +// operates on the upper half of the input registers. +def ExynosLongVectorUpperFn : TIIPredicate< + "isExynosLongVectorUpper", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsLongVectorUpperOp.ValidOpcodes, + MCReturnStatement>], + MCReturnStatement>>; +def ExynosLongVectorUpperPred : MCSchedPredicate; + +// Identify 128-bit NEON instructions. +def ExynosQFormPred : MCSchedPredicate; + +// Identify instructions that reset a register efficiently. +def ExynosResetFn : TIIPredicate< + "isExynosResetFast", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + [ADR, ADRP, + MOVNWi, MOVNXi, + MOVZWi, MOVZXi], + MCReturnStatement>], + MCReturnStatement< + CheckAny< + [IsCopyIdiomFn, + IsZeroFPIdiomFn, + IsZeroIdiomFn]>>>>; +def ExynosResetPred : MCSchedPredicate; + +// Identify EXTR as the alias for ROR (immediate). +def ExynosRotateRightImmPred : MCSchedPredicate< + CheckAll<[CheckOpcode<[EXTRWrri, EXTRXrri]>, + CheckSameRegOperand<1, 2>]>>; + +// Identify arithmetic and logic instructions with limited shift. +def ExynosShiftFn : TIIPredicate< + "isExynosShiftFast", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsArithLogicShiftOp.ValidOpcodes, + MCReturnStatement>], + MCReturnStatement>>; +def ExynosShiftPred : MCSchedPredicate; + +// Identify more arithmetic and logic instructions with limited shift. +def ExynosShiftExFn : TIIPredicate< + "isExynosShiftExFast", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsArithLogicShiftOp.ValidOpcodes, + MCReturnStatement< + CheckAny< + [CheckAll< + [CheckShiftLSL, + CheckShiftBy8]>, + ExynosCheckShift]>>>], + MCReturnStatement>>; +def ExynosShiftExPred : MCSchedPredicate; + + +// Identify arithmetic and logic immediate instructions. +def ExynosCheapFn : TIIPredicate< + "isExynosCheapAsMove", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsArithLogicImmOp.ValidOpcodes, + MCReturnStatement>], + MCReturnStatement< + CheckAny< + [ExynosExtFn, ExynosResetFn, ExynosShiftFn]>>>>; diff --git a/lib/Target/AArch64/AArch64SchedPredicates.td b/lib/Target/AArch64/AArch64SchedPredicates.td new file mode 100644 index 0000000000000000000000000000000000000000..a48f1dcffaffadd70758855a6735d01ee177af32 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedPredicates.td @@ -0,0 +1,370 @@ +//===- AArch64SchedPredicates.td - AArch64 Sched Preds -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines scheduling predicate definitions that are used by the +// AArch64 subtargets. +// +//===----------------------------------------------------------------------===// + +// Function mappers. + +// Check the extension type in arithmetic instructions. +let FunctionMapper = "AArch64_AM::getArithExtendType" in { + def CheckExtUXTB : CheckImmOperand_s<3, "AArch64_AM::UXTB">; + def CheckExtUXTH : CheckImmOperand_s<3, "AArch64_AM::UXTH">; + def CheckExtUXTW : CheckImmOperand_s<3, "AArch64_AM::UXTW">; + def CheckExtUXTX : CheckImmOperand_s<3, "AArch64_AM::UXTX">; + def CheckExtSXTB : CheckImmOperand_s<3, "AArch64_AM::SXTB">; + def CheckExtSXTH : CheckImmOperand_s<3, "AArch64_AM::SXTH">; + def CheckExtSXTW : CheckImmOperand_s<3, "AArch64_AM::SXTW">; + def CheckExtSXTX : CheckImmOperand_s<3, "AArch64_AM::SXTX">; +} + +// Check for shifting in extended arithmetic instructions. +foreach I = {0-3} in { + let FunctionMapper = "AArch64_AM::getArithShiftValue" in + def CheckExtBy#I : CheckImmOperand<3, I>; +} + +// Check the extension type in the register offset addressing mode. +let FunctionMapper = "AArch64_AM::getMemExtendType" in { + def CheckMemExtUXTW : CheckImmOperand_s<3, "AArch64_AM::UXTW">; + def CheckMemExtLSL : CheckImmOperand_s<3, "AArch64_AM::UXTX">; + def CheckMemExtSXTW : CheckImmOperand_s<3, "AArch64_AM::SXTW">; + def CheckMemExtSXTX : CheckImmOperand_s<3, "AArch64_AM::SXTX">; +} + +// Check for scaling in the register offset addressing mode. +let FunctionMapper = "AArch64_AM::getMemDoShift" in +def CheckMemScaled : CheckImmOperandSimple<3>; + +// Check the shifting type in arithmetic and logic instructions. +let FunctionMapper = "AArch64_AM::getShiftType" in { + def CheckShiftLSL : CheckImmOperand_s<3, "AArch64_AM::LSL">; + def CheckShiftLSR : CheckImmOperand_s<3, "AArch64_AM::LSR">; + def CheckShiftASR : CheckImmOperand_s<3, "AArch64_AM::ASR">; + def CheckShiftROR : CheckImmOperand_s<3, "AArch64_AM::ROR">; + def CheckShiftMSL : CheckImmOperand_s<3, "AArch64_AM::MSL">; +} + +// Check for shifting in arithmetic and logic instructions. +foreach I = {0-3, 8} in { + let FunctionMapper = "AArch64_AM::getShiftValue" in + def CheckShiftBy#I : CheckImmOperand<3, I>; +} + +// Generic predicates. + +// Identify whether an instruction is the 64-bit NEON form based on its result. +def CheckDForm : CheckAll<[CheckIsRegOperand<0>, + CheckAny<[CheckRegOperand<0, D0>, + CheckRegOperand<0, D1>, + CheckRegOperand<0, D2>, + CheckRegOperand<0, D3>, + CheckRegOperand<0, D4>, + CheckRegOperand<0, D5>, + CheckRegOperand<0, D6>, + CheckRegOperand<0, D7>, + CheckRegOperand<0, D8>, + CheckRegOperand<0, D9>, + CheckRegOperand<0, D10>, + CheckRegOperand<0, D11>, + CheckRegOperand<0, D12>, + CheckRegOperand<0, D13>, + CheckRegOperand<0, D14>, + CheckRegOperand<0, D15>, + CheckRegOperand<0, D16>, + CheckRegOperand<0, D17>, + CheckRegOperand<0, D18>, + CheckRegOperand<0, D19>, + CheckRegOperand<0, D20>, + CheckRegOperand<0, D21>, + CheckRegOperand<0, D22>, + CheckRegOperand<0, D23>, + CheckRegOperand<0, D24>, + CheckRegOperand<0, D25>, + CheckRegOperand<0, D26>, + CheckRegOperand<0, D27>, + CheckRegOperand<0, D28>, + CheckRegOperand<0, D29>, + CheckRegOperand<0, D30>, + CheckRegOperand<0, D31>]>]>; + +// Identify whether an instruction is the 128-bit NEON form based on its result. +def CheckQForm : CheckAll<[CheckIsRegOperand<0>, + CheckAny<[CheckRegOperand<0, Q0>, + CheckRegOperand<0, Q1>, + CheckRegOperand<0, Q2>, + CheckRegOperand<0, Q3>, + CheckRegOperand<0, Q4>, + CheckRegOperand<0, Q5>, + CheckRegOperand<0, Q6>, + CheckRegOperand<0, Q7>, + CheckRegOperand<0, Q8>, + CheckRegOperand<0, Q9>, + CheckRegOperand<0, Q10>, + CheckRegOperand<0, Q11>, + CheckRegOperand<0, Q12>, + CheckRegOperand<0, Q13>, + CheckRegOperand<0, Q14>, + CheckRegOperand<0, Q15>, + CheckRegOperand<0, Q16>, + CheckRegOperand<0, Q17>, + CheckRegOperand<0, Q18>, + CheckRegOperand<0, Q19>, + CheckRegOperand<0, Q20>, + CheckRegOperand<0, Q21>, + CheckRegOperand<0, Q22>, + CheckRegOperand<0, Q23>, + CheckRegOperand<0, Q24>, + CheckRegOperand<0, Q25>, + CheckRegOperand<0, Q26>, + CheckRegOperand<0, Q27>, + CheckRegOperand<0, Q28>, + CheckRegOperand<0, Q29>, + CheckRegOperand<0, Q30>, + CheckRegOperand<0, Q31>]>]>; + +// Identify arithmetic instructions with extend. +def IsArithExtOp : CheckOpcode<[ADDWrx, ADDXrx, ADDSWrx, ADDSXrx, + SUBWrx, SUBXrx, SUBSWrx, SUBSXrx, + ADDXrx64, ADDSXrx64, + SUBXrx64, SUBSXrx64]>; + +// Identify arithmetic immediate instructions. +def IsArithImmOp : CheckOpcode<[ADDWri, ADDXri, ADDSWri, ADDSXri, + SUBWri, SUBXri, SUBSWri, SUBSXri]>; + +// Identify arithmetic instructions with shift. +def IsArithShiftOp : CheckOpcode<[ADDWrs, ADDXrs, ADDSWrs, ADDSXrs, + SUBWrs, SUBXrs, SUBSWrs, SUBSXrs]>; + +// Identify arithmetic instructions without shift. +def IsArithUnshiftOp : CheckOpcode<[ADDWrr, ADDXrr, ADDSWrr, ADDSXrr, + SUBWrr, SUBXrr, SUBSWrr, SUBSXrr]>; + +// Identify logic immediate instructions. +def IsLogicImmOp : CheckOpcode<[ANDWri, ANDXri, + EORWri, EORXri, + ORRWri, ORRXri]>; + +// Identify logic instructions with shift. +def IsLogicShiftOp : CheckOpcode<[ANDWrs, ANDXrs, ANDSWrs, ANDSXrs, + BICWrs, BICXrs, BICSWrs, BICSXrs, + EONWrs, EONXrs, + EORWrs, EORXrs, + ORNWrs, ORNXrs, + ORRWrs, ORRXrs]>; + +// Identify logic instructions without shift. +def IsLogicUnshiftOp : CheckOpcode<[ANDWrr, ANDXrr, ANDSWrr, ANDSXrr, + BICWrr, BICXrr, BICSWrr, BICSXrr, + EONWrr, EONXrr, + EORWrr, EORXrr, + ORNWrr, ORNXrr, + ORRWrr, ORRXrr]>; + +// Identify arithmetic and logic immediate instructions. +def IsArithLogicImmOp : CheckOpcode; + +// Identify arithmetic and logic instructions with shift. +def IsArithLogicShiftOp : CheckOpcode; + +// Identify arithmetic and logic instructions without shift. +def IsArithLogicUnshiftOp : CheckOpcode; + +// Identify whether an instruction whose result is a long vector +// operates on the upper half of the input registers. +def IsLongVectorUpperOp : CheckOpcode<[FCVTLv8i16, FCVTLv4i32, + FCVTNv8i16, FCVTNv4i32, + FCVTXNv4f32, + PMULLv16i8, PMULLv2i64, + RADDHNv8i16_v16i8, RADDHNv4i32_v8i16, RADDHNv2i64_v4i32, + RSHRNv16i8_shift, RSHRNv8i16_shift, RSHRNv4i32_shift, + RSUBHNv8i16_v16i8, RSUBHNv4i32_v8i16, RSUBHNv2i64_v4i32, + SABALv16i8_v8i16, SABALv8i16_v4i32, SABALv4i32_v2i64, + SABDLv16i8_v8i16, SABDLv8i16_v4i32, SABDLv4i32_v2i64, + SADDLv16i8_v8i16, SADDLv8i16_v4i32, SADDLv4i32_v2i64, + SADDWv16i8_v8i16, SADDWv8i16_v4i32, SADDWv4i32_v2i64, + SHLLv16i8, SHLLv8i16, SHLLv4i32, + SHRNv16i8_shift, SHRNv8i16_shift, SHRNv4i32_shift, + SMLALv16i8_v8i16, SMLALv8i16_v4i32, SMLALv4i32_v2i64, + SMLALv8i16_indexed, SMLALv4i32_indexed, + SMLSLv16i8_v8i16, SMLSLv8i16_v4i32, SMLSLv4i32_v2i64, + SMLSLv8i16_indexed, SMLSLv4i32_indexed, + SMULLv16i8_v8i16, SMULLv8i16_v4i32, SMULLv4i32_v2i64, + SMULLv8i16_indexed, SMULLv4i32_indexed, + SQDMLALv8i16_v4i32, SQDMLALv4i32_v2i64, + SQDMLALv8i16_indexed, SQDMLALv4i32_indexed, + SQDMLSLv8i16_v4i32, SQDMLSLv4i32_v2i64, + SQDMLSLv8i16_indexed, SQDMLSLv4i32_indexed, + SQDMULLv8i16_v4i32, SQDMULLv4i32_v2i64, + SQDMULLv8i16_indexed, SQDMULLv4i32_indexed, + SQRSHRNv16i8_shift, SQRSHRNv8i16_shift, SQRSHRNv4i32_shift, + SQRSHRUNv16i8_shift, SQRSHRUNv8i16_shift, SQRSHRUNv4i32_shift, + SQSHRNv16i8_shift, SQSHRNv8i16_shift, SQSHRNv4i32_shift, + SQSHRUNv16i8_shift, SQSHRUNv8i16_shift, SQSHRUNv4i32_shift, + SQXTNv16i8, SQXTNv8i16, SQXTNv4i32, + SQXTUNv16i8, SQXTUNv8i16, SQXTUNv4i32, + SSHLLv16i8_shift, SSHLLv8i16_shift, SSHLLv4i32_shift, + SSUBLv16i8_v8i16, SSUBLv8i16_v4i32, SSUBLv4i32_v2i64, + SSUBWv16i8_v8i16, SSUBWv8i16_v4i32, SSUBWv4i32_v2i64, + UABALv16i8_v8i16, UABALv8i16_v4i32, UABALv4i32_v2i64, + UABDLv16i8_v8i16, UABDLv8i16_v4i32, UABDLv4i32_v2i64, + UADDLv16i8_v8i16, UADDLv8i16_v4i32, UADDLv4i32_v2i64, + UADDWv16i8_v8i16, UADDWv8i16_v4i32, UADDWv4i32_v2i64, + UMLALv16i8_v8i16, UMLALv8i16_v4i32, UMLALv4i32_v2i64, + UMLALv8i16_indexed, UMLALv4i32_indexed, + UMLSLv16i8_v8i16, UMLSLv8i16_v4i32, UMLSLv4i32_v2i64, + UMLSLv8i16_indexed, UMLSLv4i32_indexed, + UMULLv16i8_v8i16, UMULLv8i16_v4i32, UMULLv4i32_v2i64, + UMULLv8i16_indexed, UMULLv4i32_indexed, + UQSHRNv16i8_shift, UQSHRNv8i16_shift, UQSHRNv4i32_shift, + UQXTNv16i8, UQXTNv8i16, UQXTNv4i32, + USHLLv16i8_shift, USHLLv8i16_shift, USHLLv4i32_shift, + USUBLv16i8_v8i16, USUBLv8i16_v4i32, USUBLv4i32_v2i64, + USUBWv16i8_v8i16, USUBWv8i16_v4i32, USUBWv4i32_v2i64, + XTNv16i8, XTNv8i16, XTNv4i32]>; + +// Identify whether an instruction is a load +// using the register offset addressing mode. +def IsLoadRegOffsetOp : CheckOpcode<[PRFMroW, PRFMroX, + LDRBBroW, LDRBBroX, + LDRSBWroW, LDRSBWroX, LDRSBXroW, LDRSBXroX, + LDRHHroW, LDRHHroX, + LDRSHWroW, LDRSHWroX, LDRSHXroW, LDRSHXroX, + LDRWroW, LDRWroX, + LDRSWroW, LDRSWroX, + LDRXroW, LDRXroX, + LDRBroW, LDRBroX, + LDRHroW, LDRHroX, + LDRSroW, LDRSroX, + LDRDroW, LDRDroX]>; + +// Identify whether an instruction is a load +// using the register offset addressing mode. +def IsStoreRegOffsetOp : CheckOpcode<[STRBBroW, STRBBroX, + STRHHroW, STRHHroX, + STRWroW, STRWroX, + STRXroW, STRXroX, + STRBroW, STRBroX, + STRHroW, STRHroX, + STRSroW, STRSroX, + STRDroW, STRDroX]>; + +// Identify whether an instruction is a load or +// store using the register offset addressing mode. +def IsLoadStoreRegOffsetOp : CheckOpcode; + +// Target predicates. + +// Identify an instruction that effectively transfers a register to another. +def IsCopyIdiomFn : TIIPredicate<"isCopyIdiom", + MCOpcodeSwitchStatement< + [// MOV {Rd, SP}, {SP, Rn} => + // ADD {Rd, SP}, {SP, Rn}, #0 + MCOpcodeSwitchCase< + [ADDWri, ADDXri], + MCReturnStatement< + CheckAll< + [CheckIsRegOperand<0>, + CheckIsRegOperand<1>, + CheckAny< + [CheckRegOperand<0, WSP>, + CheckRegOperand<0, SP>, + CheckRegOperand<1, WSP>, + CheckRegOperand<1, SP>]>, + CheckZeroOperand<2>]>>>, + // MOV Rd, Rm => + // ORR Rd, ZR, Rm, LSL #0 + MCOpcodeSwitchCase< + [ORRWrs, ORRXrs], + MCReturnStatement< + CheckAll< + [CheckIsRegOperand<1>, + CheckIsRegOperand<2>, + CheckAny< + [CheckRegOperand<1, WZR>, + CheckRegOperand<1, XZR>, + CheckRegOperand<2, WZR>, + CheckRegOperand<2, XZR>]>, + CheckShiftBy0]>>>], + MCReturnStatement>>; +def IsCopyIdiomPred : MCSchedPredicate; + +// Identify arithmetic instructions with an extended register. +def RegExtendedFn : TIIPredicate<"hasExtendedReg", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsArithExtOp.ValidOpcodes, + MCReturnStatement< + CheckNot>>>], + MCReturnStatement>>; +def RegExtendedPred : MCSchedPredicate; + +// Identify arithmetic and logic instructions with a shifted register. +def RegShiftedFn : TIIPredicate<"hasShiftedReg", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsArithLogicShiftOp.ValidOpcodes, + MCReturnStatement< + CheckNot>>>], + MCReturnStatement>>; +def RegShiftedPred : MCSchedPredicate; + +// Identify a load or store using the register offset addressing mode +// with an extended or scaled register. +def ScaledIdxFn : TIIPredicate<"isScaledAddr", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsLoadStoreRegOffsetOp.ValidOpcodes, + MCReturnStatement< + CheckAny<[CheckNot, + CheckMemScaled]>>>], + MCReturnStatement>>; +def ScaledIdxPred : MCSchedPredicate; + +// Identify an instruction that effectively resets a FP register to zero. +def IsZeroFPIdiomFn : TIIPredicate<"isZeroFPIdiom", + MCOpcodeSwitchStatement< + [// MOVI Vd, #0 + MCOpcodeSwitchCase< + [MOVIv8b_ns, MOVIv16b_ns, + MOVID, MOVIv2d_ns], + MCReturnStatement>>, + // MOVI Vd, #0, LSL #0 + MCOpcodeSwitchCase< + [MOVIv4i16, MOVIv8i16, + MOVIv2i32, MOVIv4i32], + MCReturnStatement< + CheckAll< + [CheckZeroOperand<1>, + CheckZeroOperand<2>]>>>], + MCReturnStatement>>; +def IsZeroFPIdiomPred : MCSchedPredicate; + +// Identify an instruction that effectively resets a GP register to zero. +def IsZeroIdiomFn : TIIPredicate<"isZeroIdiom", + MCOpcodeSwitchStatement< + [// ORR Rd, ZR, #0 + MCOpcodeSwitchCase< + [ORRWri, ORRXri], + MCReturnStatement< + CheckAll< + [CheckIsRegOperand<1>, + CheckAny< + [CheckRegOperand<1, WZR>, + CheckRegOperand<1, XZR>]>, + CheckZeroOperand<2>]>>>], + MCReturnStatement>>; +def IsZeroIdiomPred : MCSchedPredicate; diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td index ce81f48acf71282a63d6eebf5cbea23b72419b9d..f55ba4d42fce18b95b7149a7e44115ee667cd5ee 100644 --- a/lib/Target/AArch64/AArch64Schedule.td +++ b/lib/Target/AArch64/AArch64Schedule.td @@ -50,17 +50,6 @@ def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled). def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. -// Predicate for determining when a shiftable register is shifted. -def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(*MI)}]>; - -// Predicate for determining when a extendedable register is extended. -def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(*MI)}]>; - -// ScaledIdxPred is true if a WriteLDIdx operand will be -// scaled. Subtargets can use this to dynamically select resources and -// latency for WriteLDIdx and ReadAdrBase. -def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(*MI)}]>; - // Serialized two-level address load. // EXAMPLE: LOADGot def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>; diff --git a/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/lib/Target/AArch64/AArch64SpeculationHardening.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1f8ef5ee6eae58a1454e05f5cf2e3e309154fa7b --- /dev/null +++ b/lib/Target/AArch64/AArch64SpeculationHardening.cpp @@ -0,0 +1,368 @@ +//===- AArch64SpeculationHardening.cpp - Harden Against Missspeculation --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass to insert code to mitigate against side channel +// vulnerabilities that may happen under control flow miss-speculation. +// +// The pass implements tracking of control flow miss-speculation into a "taint" +// register. That taint register can then be used to mask off registers with +// sensitive data when executing under miss-speculation, a.k.a. "transient +// execution". +// This pass is aimed at mitigating against SpectreV1-style vulnarabilities. +// +// At the moment, it implements the tracking of miss-speculation of control +// flow into a taint register, but doesn't implement a mechanism yet to then +// use that taint register to mask of vulnerable data in registers (something +// for a follow-on improvement). Possible strategies to mask out vulnerable +// data that can be implemented on top of this are: +// - speculative load hardening to automatically mask of data loaded +// in registers. +// - using intrinsics to mask of data in registers as indicated by the +// programmer (see https://lwn.net/Articles/759423/). +// +// For AArch64, the following implementation choices are made below. +// Some of these are different than the implementation choices made in +// the similar pass implemented in X86SpeculativeLoadHardening.cpp, as +// the instruction set characteristics result in different trade-offs. +// - The speculation hardening is done after register allocation. With a +// relative abundance of registers, one register is reserved (X16) to be +// the taint register. X16 is expected to not clash with other register +// reservation mechanisms with very high probability because: +// . The AArch64 ABI doesn't guarantee X16 to be retained across any call. +// . The only way to request X16 to be used as a programmer is through +// inline assembly. In the rare case a function explicitly demands to +// use X16/W16, this pass falls back to hardening against speculation +// by inserting a DSB SYS/ISB barrier pair which will prevent control +// flow speculation. +// - It is easy to insert mask operations at this late stage as we have +// mask operations available that don't set flags. +// - The taint variable contains all-ones when no miss-speculation is detected, +// and contains all-zeros when miss-speculation is detected. Therefore, when +// masking, an AND instruction (which only changes the register to be masked, +// no other side effects) can easily be inserted anywhere that's needed. +// - The tracking of miss-speculation is done by using a data-flow conditional +// select instruction (CSEL) to evaluate the flags that were also used to +// make conditional branch direction decisions. Speculation of the CSEL +// instruction can be limited with a CSDB instruction - so the combination of +// CSEL + a later CSDB gives the guarantee that the flags as used in the CSEL +// aren't speculated. When conditional branch direction gets miss-speculated, +// the semantics of the inserted CSEL instruction is such that the taint +// register will contain all zero bits. +// One key requirement for this to work is that the conditional branch is +// followed by an execution of the CSEL instruction, where the CSEL +// instruction needs to use the same flags status as the conditional branch. +// This means that the conditional branches must not be implemented as one +// of the AArch64 conditional branches that do not use the flags as input +// (CB(N)Z and TB(N)Z). This is implemented by ensuring in the instruction +// selectors to not produce these instructions when speculation hardening +// is enabled. This pass will assert if it does encounter such an instruction. +// - On function call boundaries, the miss-speculation state is transferred from +// the taint register X16 to be encoded in the SP register as value 0. +// +// Future extensions/improvements could be: +// - Implement this functionality using full speculation barriers, akin to the +// x86-slh-lfence option. This may be more useful for the intrinsics-based +// approach than for the SLH approach to masking. +// Note that this pass already inserts the full speculation barriers if the +// function for some niche reason makes use of X16/W16. +// - no indirect branch misprediction gets protected/instrumented; but this +// could be done for some indirect branches, such as switch jump tables. +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Target/TargetMachine.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-speculation-hardening" + +#define AARCH64_SPECULATION_HARDENING_NAME "AArch64 speculation hardening pass" + +namespace { + +class AArch64SpeculationHardening : public MachineFunctionPass { +public: + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + + static char ID; + + AArch64SpeculationHardening() : MachineFunctionPass(ID) { + initializeAArch64SpeculationHardeningPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + + StringRef getPassName() const override { + return AARCH64_SPECULATION_HARDENING_NAME; + } + +private: + unsigned MisspeculatingTaintReg; + bool UseControlFlowSpeculationBarrier; + + bool functionUsesHardeningRegister(MachineFunction &MF) const; + bool instrumentControlFlow(MachineBasicBlock &MBB); + bool endsWithCondControlFlow(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + AArch64CC::CondCode &CondCode) const; + void insertTrackingCode(MachineBasicBlock &SplitEdgeBB, + AArch64CC::CondCode &CondCode, DebugLoc DL) const; + void insertSPToRegTaintPropagation(MachineBasicBlock *MBB, + MachineBasicBlock::iterator MBBI) const; + void insertRegToSPTaintPropagation(MachineBasicBlock *MBB, + MachineBasicBlock::iterator MBBI, + unsigned TmpReg) const; +}; + +} // end anonymous namespace + +char AArch64SpeculationHardening::ID = 0; + +INITIALIZE_PASS(AArch64SpeculationHardening, "aarch64-speculation-hardening", + AARCH64_SPECULATION_HARDENING_NAME, false, false) + +bool AArch64SpeculationHardening::endsWithCondControlFlow( + MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, + AArch64CC::CondCode &CondCode) const { + SmallVector analyzeBranchCondCode; + if (TII->analyzeBranch(MBB, TBB, FBB, analyzeBranchCondCode, false)) + return false; + + // Ignore if the BB ends in an unconditional branch/fall-through. + if (analyzeBranchCondCode.empty()) + return false; + + // If the BB ends with a single conditional branch, FBB will be set to + // nullptr (see API docs for TII->analyzeBranch). For the rest of the + // analysis we want the FBB block to be set always. + assert(TBB != nullptr); + if (FBB == nullptr) + FBB = MBB.getFallThrough(); + + // If both the true and the false condition jump to the same basic block, + // there isn't need for any protection - whether the branch is speculated + // correctly or not, we end up executing the architecturally correct code. + if (TBB == FBB) + return false; + + assert(MBB.succ_size() == 2); + // translate analyzeBranchCondCode to CondCode. + assert(analyzeBranchCondCode.size() == 1 && "unknown Cond array format"); + CondCode = AArch64CC::CondCode(analyzeBranchCondCode[0].getImm()); + return true; +} + +void AArch64SpeculationHardening::insertTrackingCode( + MachineBasicBlock &SplitEdgeBB, AArch64CC::CondCode &CondCode, + DebugLoc DL) const { + if (UseControlFlowSpeculationBarrier) { + // insert full control flow speculation barrier (DSB SYS + ISB) + BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::ISB)) + .addImm(0xf); + BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::DSB)) + .addImm(0xf); + } else { + BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::CSELXr)) + .addDef(MisspeculatingTaintReg) + .addUse(MisspeculatingTaintReg) + .addUse(AArch64::XZR) + .addImm(CondCode); + SplitEdgeBB.addLiveIn(AArch64::NZCV); + } +} + +bool AArch64SpeculationHardening::instrumentControlFlow( + MachineBasicBlock &MBB) { + LLVM_DEBUG(dbgs() << "Instrument control flow tracking on MBB: " << MBB); + + bool Modified = false; + MachineBasicBlock *TBB = nullptr; + MachineBasicBlock *FBB = nullptr; + AArch64CC::CondCode CondCode; + + if (!endsWithCondControlFlow(MBB, TBB, FBB, CondCode)) { + LLVM_DEBUG(dbgs() << "... doesn't end with CondControlFlow\n"); + } else { + // Now insert: + // "CSEL MisSpeculatingR, MisSpeculatingR, XZR, cond" on the True edge and + // "CSEL MisSpeculatingR, MisSpeculatingR, XZR, Invertcond" on the False + // edge. + AArch64CC::CondCode InvCondCode = AArch64CC::getInvertedCondCode(CondCode); + + MachineBasicBlock *SplitEdgeTBB = MBB.SplitCriticalEdge(TBB, *this); + MachineBasicBlock *SplitEdgeFBB = MBB.SplitCriticalEdge(FBB, *this); + + assert(SplitEdgeTBB != nullptr); + assert(SplitEdgeFBB != nullptr); + + DebugLoc DL; + if (MBB.instr_end() != MBB.instr_begin()) + DL = (--MBB.instr_end())->getDebugLoc(); + + insertTrackingCode(*SplitEdgeTBB, CondCode, DL); + insertTrackingCode(*SplitEdgeFBB, InvCondCode, DL); + + LLVM_DEBUG(dbgs() << "SplitEdgeTBB: " << *SplitEdgeTBB << "\n"); + LLVM_DEBUG(dbgs() << "SplitEdgeFBB: " << *SplitEdgeFBB << "\n"); + Modified = true; + } + + // Perform correct code generation around function calls and before returns. + { + SmallVector ReturnInstructions; + SmallVector CallInstructions; + + for (MachineInstr &MI : MBB) { + if (MI.isReturn()) + ReturnInstructions.push_back(&MI); + else if (MI.isCall()) + CallInstructions.push_back(&MI); + } + + Modified |= + (ReturnInstructions.size() > 0) || (CallInstructions.size() > 0); + + for (MachineInstr *Return : ReturnInstructions) + insertRegToSPTaintPropagation(Return->getParent(), Return, AArch64::X17); + for (MachineInstr *Call : CallInstructions) { + // Just after the call: + MachineBasicBlock::iterator i = Call; + i++; + insertSPToRegTaintPropagation(Call->getParent(), i); + // Just before the call: + insertRegToSPTaintPropagation(Call->getParent(), Call, AArch64::X17); + } + } + + return Modified; +} + +void AArch64SpeculationHardening::insertSPToRegTaintPropagation( + MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) const { + // If full control flow speculation barriers are used, emit a control flow + // barrier to block potential miss-speculation in flight coming in to this + // function. + if (UseControlFlowSpeculationBarrier) { + // insert full control flow speculation barrier (DSB SYS + ISB) + BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::DSB)).addImm(0xf); + BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ISB)).addImm(0xf); + return; + } + + // CMP SP, #0 === SUBS xzr, SP, #0 + BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::SUBSXri)) + .addDef(AArch64::XZR) + .addUse(AArch64::SP) + .addImm(0) + .addImm(0); // no shift + // CSETM x16, NE === CSINV x16, xzr, xzr, EQ + BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::CSINVXr)) + .addDef(MisspeculatingTaintReg) + .addUse(AArch64::XZR) + .addUse(AArch64::XZR) + .addImm(AArch64CC::EQ); +} + +void AArch64SpeculationHardening::insertRegToSPTaintPropagation( + MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI, + unsigned TmpReg) const { + // If full control flow speculation barriers are used, there will not be + // miss-speculation when returning from this function, and therefore, also + // no need to encode potential miss-speculation into the stack pointer. + if (UseControlFlowSpeculationBarrier) + return; + + // mov Xtmp, SP === ADD Xtmp, SP, #0 + BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri)) + .addDef(TmpReg) + .addUse(AArch64::SP) + .addImm(0) + .addImm(0); // no shift + // and Xtmp, Xtmp, TaintReg === AND Xtmp, Xtmp, TaintReg, #0 + BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ANDXrs)) + .addDef(TmpReg, RegState::Renamable) + .addUse(TmpReg, RegState::Kill | RegState::Renamable) + .addUse(MisspeculatingTaintReg, RegState::Kill) + .addImm(0); + // mov SP, Xtmp === ADD SP, Xtmp, #0 + BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri)) + .addDef(AArch64::SP) + .addUse(TmpReg, RegState::Kill) + .addImm(0) + .addImm(0); // no shift +} + +bool AArch64SpeculationHardening::functionUsesHardeningRegister( + MachineFunction &MF) const { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + // treat function calls specially, as the hardening register does not + // need to remain live across function calls. + if (MI.isCall()) + continue; + if (MI.readsRegister(MisspeculatingTaintReg, TRI) || + MI.modifiesRegister(MisspeculatingTaintReg, TRI)) + return true; + } + } + return false; +} + +bool AArch64SpeculationHardening::runOnMachineFunction(MachineFunction &MF) { + if (!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) + return false; + + MisspeculatingTaintReg = AArch64::X16; + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); + bool Modified = false; + + UseControlFlowSpeculationBarrier = functionUsesHardeningRegister(MF); + + // Instrument control flow speculation tracking, if requested. + LLVM_DEBUG( + dbgs() + << "***** AArch64SpeculationHardening - track control flow *****\n"); + + // 1. Add instrumentation code to function entry and exits. + SmallVector EntryBlocks; + EntryBlocks.push_back(&MF.front()); + for (const LandingPadInfo &LPI : MF.getLandingPads()) + EntryBlocks.push_back(LPI.LandingPadBlock); + for (auto Entry : EntryBlocks) + insertSPToRegTaintPropagation( + Entry, Entry->SkipPHIsLabelsAndDebug(Entry->begin())); + + // 2. Add instrumentation code to every basic block. + for (auto &MBB : MF) + Modified |= instrumentControlFlow(MBB); + + return Modified; +} + +/// \brief Returns an instance of the pseudo instruction expansion pass. +FunctionPass *llvm::createAArch64SpeculationHardeningPass() { + return new AArch64SpeculationHardening(); +} diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp index fc7b5984fe3e867da3138505d961b74525c37dff..d5643d384283340e0674c344a0e7aad498b568c7 100644 --- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -148,9 +148,11 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { for (auto &MI : MBB) { if (!isNarrowFPStore(MI)) continue; - unsigned BaseReg; + MachineOperand *BaseOp; int64_t Offset; - if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) { + if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) && + BaseOp->isReg()) { + unsigned BaseReg = BaseOp->getReg(); if (PrevBaseReg == BaseReg) { // If this block can take STPs, skip ahead to the next block. if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index 49d737bea6a6dc2c544b40106c9c4776a6beb77e..dd30d25b2b505de77c9beb0b727c62f6ed8d01e2 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -14,13 +14,13 @@ #include "AArch64Subtarget.h" #include "AArch64.h" -#include "AArch64InstrInfo.h" -#include "AArch64PBQPRegAlloc.h" -#include "AArch64TargetMachine.h" - #include "AArch64CallLowering.h" +#include "AArch64InstrInfo.h" #include "AArch64LegalizerInfo.h" +#include "AArch64PBQPRegAlloc.h" #include "AArch64RegisterBankInfo.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" @@ -148,6 +148,11 @@ void AArch64Subtarget::initializeProperties() { // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; + case TSV110: + CacheLineSize = 64; + PrefFunctionAlignment = 4; + PrefLoopAlignment = 2; + break; } } diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 8bf7c1654081e10c69f64daced9852d6cea57422..9b27e0a0e8fdf8428ac8783d9bf98f79c8ae1c17 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -56,7 +56,8 @@ public: ThunderX, ThunderXT81, ThunderXT83, - ThunderXT88 + ThunderXT88, + TSV110 }; protected: @@ -82,6 +83,33 @@ protected: bool HasFP16FML = false; bool HasSPE = false; + // ARMv8.1 extensions + bool HasVH = false; + bool HasPAN = false; + bool HasLOR = false; + + // ARMv8.2 extensions + bool HasPsUAO = false; + bool HasPAN_RWV = false; + bool HasCCPP = false; + + // ARMv8.3 extensions + bool HasPA = false; + bool HasJS = false; + bool HasCCIDX = false; + bool HasComplxNum = false; + + // ARMv8.4 extensions + bool HasNV = false; + bool HasRASv8_4 = false; + bool HasMPAM = false; + bool HasDIT = false; + bool HasTRACEV8_4 = false; + bool HasAM = false; + bool HasSEL2 = false; + bool HasTLB_RMI = false; + bool HasFMI = false; + bool HasRCPC_IMMO = false; // ARMv8.4 Crypto extensions bool HasSM4 = true; bool HasSHA3 = true; @@ -99,6 +127,7 @@ protected: bool HasFRInt3264 = false; bool HasSpecRestrict = false; bool HasSpecCtrl = false; + bool HasSSBS = false; bool HasPredCtrl = false; bool HasCCDP = false; bool HasBTI = false; @@ -327,6 +356,7 @@ public: bool hasFRInt3264() const { return HasFRInt3264; } bool hasSpecRestrict() const { return HasSpecRestrict; } bool hasSpecCtrl() const { return HasSpecCtrl; } + bool hasSSBS() const { return HasSSBS; } bool hasPredCtrl() const { return HasPredCtrl; } bool hasCCDP() const { return HasCCDP; } bool hasBTI() const { return HasBTI; } @@ -348,6 +378,30 @@ public: bool useAA() const override { return UseAA; } + bool hasVH() const { return HasVH; } + bool hasPAN() const { return HasPAN; } + bool hasLOR() const { return HasLOR; } + + bool hasPsUAO() const { return HasPsUAO; } + bool hasPAN_RWV() const { return HasPAN_RWV; } + bool hasCCPP() const { return HasCCPP; } + + bool hasPA() const { return HasPA; } + bool hasJS() const { return HasJS; } + bool hasCCIDX() const { return HasCCIDX; } + bool hasComplxNum() const { return HasComplxNum; } + + bool hasNV() const { return HasNV; } + bool hasRASv8_4() const { return HasRASv8_4; } + bool hasMPAM() const { return HasMPAM; } + bool hasDIT() const { return HasDIT; } + bool hasTRACEV8_4() const { return HasTRACEV8_4; } + bool hasAM() const { return HasAM; } + bool hasSEL2() const { return HasSEL2; } + bool hasTLB_RMI() const { return HasTLB_RMI; } + bool hasFMI() const { return HasFMI; } + bool hasRCPC_IMMO() const { return HasRCPC_IMMO; } + bool useSmallAddressing() const { switch (TLInfo.getTargetMachine().getCodeModel()) { case CodeModel::Kernel: @@ -382,6 +436,8 @@ public: bool isCallingConvWin64(CallingConv::ID CC) const { switch (CC) { case CallingConv::C: + case CallingConv::Fast: + case CallingConv::Swift: return isTargetWindows(); case CallingConv::Win64: return true; diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td index 9faa465fb9ba15a23ce3d3564a09a253a97a9beb..60d48e4d99d750e41b2d04efe20459e82ef9c91b 100644 --- a/lib/Target/AArch64/AArch64SystemOperands.td +++ b/lib/Target/AArch64/AArch64SystemOperands.td @@ -14,6 +14,25 @@ include "llvm/TableGen/SearchableTable.td" +//===----------------------------------------------------------------------===// +// Features that, for the compiler, only enable system operands and PStates +//===----------------------------------------------------------------------===// + +def HasCCPP : Predicate<"Subtarget->hasCCPP()">, + AssemblerPredicate<"FeatureCCPP", "ccpp">; + +def HasPAN : Predicate<"Subtarget->hasPAN()">, + AssemblerPredicate<"FeaturePAN", + "ARM v8.1 Privileged Access-Never extension">; + +def HasPsUAO : Predicate<"Subtarget->hasPsUAO()">, + AssemblerPredicate<"FeaturePsUAO", + "ARM v8.2 UAO PState extension (psuao)">; + +def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">, + AssemblerPredicate<"FeaturePAN_RWV", + "ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">; + //===----------------------------------------------------------------------===// // AT (address translate) instruction options. //===----------------------------------------------------------------------===// @@ -45,7 +64,7 @@ def : AT<"S12E1W", 0b100, 0b0111, 0b1000, 0b101>; def : AT<"S12E0R", 0b100, 0b0111, 0b1000, 0b110>; def : AT<"S12E0W", 0b100, 0b0111, 0b1000, 0b111>; -let Requires = [{ {AArch64::HasV8_2aOps} }] in { +let Requires = [{ {AArch64::FeaturePAN_RWV} }] in { def : AT<"S1E1RP", 0b000, 0b0111, 0b1001, 0b000>; def : AT<"S1E1WP", 0b000, 0b0111, 0b1001, 0b001>; } @@ -102,7 +121,7 @@ def : DC<"CVAU", 0b011, 0b0111, 0b1011, 0b001>; def : DC<"CIVAC", 0b011, 0b0111, 0b1110, 0b001>; def : DC<"CISW", 0b000, 0b0111, 0b1110, 0b010>; -let Requires = [{ {AArch64::HasV8_2aOps} }] in +let Requires = [{ {AArch64::FeatureCCPP} }] in def : DC<"CVAP", 0b011, 0b0111, 0b1100, 0b001>; let Requires = [{ {AArch64::FeatureCacheDeepPersist} }] in @@ -178,7 +197,7 @@ class TSB encoding> : SearchableTable{ bits<4> Encoding; let Encoding = encoding; - code Requires = [{ {AArch64::HasV8_4aOps} }]; + code Requires = [{ {AArch64::FeatureTRACEV8_4} }]; } def : TSB<"csync", 0>; @@ -314,16 +333,17 @@ def : PState<"SPSel", 0b00101>; def : PState<"DAIFSet", 0b11110>; def : PState<"DAIFClr", 0b11111>; // v8.1a "Privileged Access Never" extension-specific PStates -let Requires = [{ {AArch64::HasV8_1aOps} }] in +let Requires = [{ {AArch64::FeaturePAN} }] in def : PState<"PAN", 0b00100>; + // v8.2a "User Access Override" extension-specific PStates -let Requires = [{ {AArch64::HasV8_2aOps} }] in +let Requires = [{ {AArch64::FeaturePsUAO} }] in def : PState<"UAO", 0b00011>; // v8.4a timining insensitivity of data processing instructions -let Requires = [{ {AArch64::HasV8_4aOps} }] in +let Requires = [{ {AArch64::FeatureDIT} }] in def : PState<"DIT", 0b11010>; // v8.5a Spectre Mitigation -let Requires = [{ {AArch64::FeatureSpecRestrict} }] in +let Requires = [{ {AArch64::FeatureSSBS} }] in def : PState<"SSBS", 0b11001>; // v8.5a Memory Tagging Extension let Requires = [{ {AArch64::FeatureMTE} }] in @@ -413,8 +433,9 @@ def : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>; def : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>; def : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>; +// Armv8.4-A Translation Lookaside Buffer Instructions (TLBI) +let Requires = [{ {AArch64::FeatureTLB_RMI} }] in { // Armv8.4-A Outer Sharable TLB Maintenance instructions: -let Requires = [{ {AArch64::HasV8_4aOps} }] in { // op1 CRn CRm op2 def : TLBI<"VMALLE1OS", 0b000, 0b1000, 0b0001, 0b000, 0>; def : TLBI<"VAE1OS", 0b000, 0b1000, 0b0001, 0b001>; @@ -465,7 +486,7 @@ def : TLBI<"RVAE3IS", 0b110, 0b1000, 0b0010, 0b001>; def : TLBI<"RVALE3IS", 0b110, 0b1000, 0b0010, 0b101>; def : TLBI<"RVAE3OS", 0b110, 0b1000, 0b0101, 0b001>; def : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>; -} +} //FeatureTLB_RMI // Armv8.5-A Prediction Restriction by Context instruction options: class PRCTX crm> : SearchableTable { @@ -540,8 +561,10 @@ def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>; def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>; def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>; def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>; + +//v8.3 CCIDX - extending the CCsIDr number of sets def : ROSysReg<"CCSIDR2_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b010> { - let Requires = [{ {AArch64::HasV8_3aOps} }]; + let Requires = [{ {AArch64::FeatureCCIDX} }]; } def : ROSysReg<"CLIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b001>; def : ROSysReg<"CTR_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b001>; @@ -579,9 +602,7 @@ def : ROSysReg<"ID_AA64ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b000>; def : ROSysReg<"ID_AA64ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b001>; def : ROSysReg<"ID_AA64MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b000>; def : ROSysReg<"ID_AA64MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b001>; -def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010> { - let Requires = [{ {AArch64::HasV8_2aOps} }]; -} +def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010>; def : ROSysReg<"MVFR0_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b000>; def : ROSysReg<"MVFR1_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b001>; def : ROSysReg<"MVFR2_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b010>; @@ -651,7 +672,7 @@ def : ROSysReg<"ID_AA64ZFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b100>; // v8.1a "Limited Ordering Regions" extension-specific system register // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::HasV8_1aOps} }] in +let Requires = [{ {AArch64::FeatureLOR} }] in def : ROSysReg<"LORID_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b111>; // v8.2a "RAS extension" registers @@ -1185,21 +1206,21 @@ def : RWSysReg<"ICH_LR14_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b110>; def : RWSysReg<"ICH_LR15_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b111>; // v8.1a "Privileged Access Never" extension-specific system registers -let Requires = [{ {AArch64::HasV8_1aOps} }] in +let Requires = [{ {AArch64::FeaturePAN} }] in def : RWSysReg<"PAN", 0b11, 0b000, 0b0100, 0b0010, 0b011>; // v8.1a "Limited Ordering Regions" extension-specific system registers // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::HasV8_1aOps} }] in { +let Requires = [{ {AArch64::FeatureLOR} }] in { def : RWSysReg<"LORSA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b000>; def : RWSysReg<"LOREA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b001>; def : RWSysReg<"LORN_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b010>; def : RWSysReg<"LORC_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b011>; } -// v8.1a "Virtualization hos extensions" system registers +// v8.1a "Virtualization Host extensions" system registers // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::HasV8_1aOps} }] in { +let Requires = [{ {AArch64::FeatureVH} }] in { def : RWSysReg<"TTBR1_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b001>; def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>; def : RWSysReg<"CNTHV_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b000>; @@ -1230,7 +1251,7 @@ def : RWSysReg<"ELR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b001>; } // v8.2a registers // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::HasV8_2aOps} }] in +let Requires = [{ {AArch64::FeaturePsUAO} }] in def : RWSysReg<"UAO", 0b11, 0b000, 0b0100, 0b0010, 0b100>; // v8.2a "Statistical Profiling extension" registers @@ -1267,7 +1288,7 @@ def : RWSysReg<"VSESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b011>; // v8.3a "Pointer authentication extension" registers // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::HasV8_3aOps} }] in { +let Requires = [{ {AArch64::FeaturePA} }] in { def : RWSysReg<"APIAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b000>; def : RWSysReg<"APIAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b001>; def : RWSysReg<"APIBKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b010>; @@ -1280,8 +1301,8 @@ def : RWSysReg<"APGAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b000>; def : RWSysReg<"APGAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b001>; } -let Requires = [{ {AArch64::HasV8_4aOps} }] in { - +// v8.4 "Secure Exception Level 2 extension" +let Requires = [{ {AArch64::FeatureSEL2} }] in { // v8.4a "Virtualization secure second stage translation" registers // Op0 Op1 CRn CRm Op2 def : RWSysReg<"VSTCR_EL2" , 0b11, 0b100, 0b0010, 0b0110, 0b010>; @@ -1299,18 +1320,22 @@ def : RWSysReg<"CNTHPS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b001>; // v8.4a "Virtualization debug state" registers // Op0 Op1 CRn CRm Op2 def : RWSysReg<"SDER32_EL2", 0b11, 0b100, 0b0001, 0b0011, 0b001>; +} // FeatureSEL2 // v8.4a RAS registers -// Op0 Op1 CRn CRm Op2 +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureRASv8_4} }] in { def : RWSysReg<"ERXPFGCTL_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b101>; def : RWSysReg<"ERXPFGCDN_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b110>; def : RWSysReg<"ERXTS_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b111>; def : RWSysReg<"ERXMISC2_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b010>; def : RWSysReg<"ERXMISC3_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b011>; def : ROSysReg<"ERXPFGF_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b100>; +} // FeatureRASv8_4 // v8.4a MPAM registers // Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureMPAM} }] in { def : RWSysReg<"MPAM0_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b001>; def : RWSysReg<"MPAM1_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b000>; def : RWSysReg<"MPAM2_EL2", 0b11, 0b100, 0b1010, 0b0101, 0b000>; @@ -1327,9 +1352,11 @@ def : RWSysReg<"MPAMVPM5_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b101>; def : RWSysReg<"MPAMVPM6_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b110>; def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b111>; def : ROSysReg<"MPAMIDR_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b100>; +} //FeatureMPAM -// v8.4a Activitiy monitor registers +// v8.4a Activitiy Monitor registers // Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureAM} }] in { def : RWSysReg<"AMCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b000>; def : ROSysReg<"AMCFGR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b001>; def : ROSysReg<"AMCGCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b010>; @@ -1378,6 +1405,7 @@ def : RWSysReg<"AMEVTYPER112_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b100>; def : RWSysReg<"AMEVTYPER113_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b101>; def : RWSysReg<"AMEVTYPER114_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b110>; def : RWSysReg<"AMEVTYPER115_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b111>; +} //FeatureAM // v8.4a Trace Extension registers // @@ -1386,19 +1414,24 @@ def : RWSysReg<"AMEVTYPER115_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b111>; // but they are already defined above. // // Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureTRACEV8_4} }] in { def : RWSysReg<"TRFCR_EL1", 0b11, 0b000, 0b0001, 0b0010, 0b001>; def : RWSysReg<"TRFCR_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b001>; def : RWSysReg<"TRFCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b001>; +} //FeatureTRACEV8_4 // v8.4a Timining insensitivity of data processing instructions +// DIT: Data Independent Timing instructions // Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureDIT} }] in { def : RWSysReg<"DIT", 0b11, 0b011, 0b0100, 0b0010, 0b101>; +} //FeatureDIT // v8.4a Enhanced Support for Nested Virtualization // Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureNV} }] in { def : RWSysReg<"VNCR_EL2", 0b11, 0b100, 0b0010, 0b0010, 0b000>; - -} // HasV8_4aOps +} //FeatureNV // SVE control registers // Op0 Op1 CRn CRm Op2 @@ -1411,7 +1444,7 @@ def : RWSysReg<"ZCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b000>; // V8.5a Spectre mitigation SSBS register // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::FeatureSpecRestrict} }] in +let Requires = [{ {AArch64::FeatureSSBS} }] in def : RWSysReg<"SSBS", 0b11, 0b011, 0b0100, 0b0010, 0b110>; // v8.5a Memory Tagging Extension diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 2f3f87d02b787206cffe5026a871eedc648abdf7..4e016525f7e4e6bc1fea1dad2b68c2b58904541b 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -177,6 +177,7 @@ extern "C" void LLVMInitializeAArch64Target() { initializeFalkorHWPFFixPass(*PR); initializeFalkorMarkStridedAccessesLegacyPass(*PR); initializeLDTLSCleanupPass(*PR); + initializeAArch64SpeculationHardeningPass(*PR); } //===----------------------------------------------------------------------===// @@ -219,9 +220,9 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, return *RM; } -static CodeModel::Model getEffectiveCodeModel(const Triple &TT, - Optional CM, - bool JIT) { +static CodeModel::Model +getEffectiveAArch64CodeModel(const Triple &TT, Optional CM, + bool JIT) { if (CM) { if (*CM != CodeModel::Small && *CM != CodeModel::Tiny && *CM != CodeModel::Large) { @@ -255,7 +256,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT, Options.MCOptions, LittleEndian), TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), - getEffectiveCodeModel(TT, CM, JIT), OL), + getEffectiveAArch64CodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) { initAsmInfo(); @@ -275,8 +276,10 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, } // Enable GlobalISel at or below EnableGlobalISelAt0. - if (getOptLevel() <= EnableGlobalISelAtO) + if (getOptLevel() <= EnableGlobalISelAtO) { setGlobalISel(true); + setGlobalISelAbort(GlobalISelAbortMode::Disable); + } // AArch64 supports the MachineOutliner. setMachineOutliner(true); @@ -419,8 +422,10 @@ void AArch64PassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); // Match interleaved memory accesses to ldN/stN intrinsics. - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() != CodeGenOpt::None) { + addPass(createInterleavedLoadCombinePass()); addPass(createInterleavedAccessPass()); + } if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { // Call SeparateConstOffsetFromGEP pass to extract constants within indices @@ -546,12 +551,28 @@ void AArch64PassConfig::addPreSched2() { if (TM->getOptLevel() != CodeGenOpt::None) { if (EnableLoadStoreOpt) addPass(createAArch64LoadStoreOptimizationPass()); + } + + // The AArch64SpeculationHardeningPass destroys dominator tree and natural + // loop info, which is needed for the FalkorHWPFFixPass and also later on. + // Therefore, run the AArch64SpeculationHardeningPass before the + // FalkorHWPFFixPass to avoid recomputing dominator tree and natural loop + // info. + addPass(createAArch64SpeculationHardeningPass()); + + if (TM->getOptLevel() != CodeGenOpt::None) { if (EnableFalkorHWPFFix) addPass(createFalkorHWPFFixPass()); } } void AArch64PassConfig::addPreEmitPass() { + // Machine Block Placement might have created new opportunities when run + // at O3, where the Tail Duplication Threshold is set to 4 instructions. + // Run the load/store optimizer once more. + if (TM->getOptLevel() >= CodeGenOpt::Aggressive && EnableLoadStoreOpt) + addPass(createAArch64LoadStoreOptimizationPass()); + if (EnableA53Fix835769) addPass(createAArch64A53Fix835769()); // Relax conditional branch instructions if they're otherwise out of diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 0f291cb83824202b9b36d534ab11f447f8d8659c..edf361bc399cce8321b1b0d985f1f1563c44c082 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -175,6 +175,7 @@ private: bool parseDirectiveReq(StringRef Name, SMLoc L); bool parseDirectiveUnreq(SMLoc L); + bool parseDirectiveCFINegateRAState(); bool validateInstruction(MCInst &Inst, SMLoc &IDLoc, SmallVectorImpl &Loc); @@ -2813,28 +2814,29 @@ static const struct Extension { const char *Name; const FeatureBitset Features; } ExtensionMap[] = { - { "crc", {AArch64::FeatureCRC} }, - { "sm4", {AArch64::FeatureSM4} }, - { "sha3", {AArch64::FeatureSHA3} }, - { "sha2", {AArch64::FeatureSHA2} }, - { "aes", {AArch64::FeatureAES} }, - { "crypto", {AArch64::FeatureCrypto} }, - { "fp", {AArch64::FeatureFPARMv8} }, - { "simd", {AArch64::FeatureNEON} }, - { "ras", {AArch64::FeatureRAS} }, - { "lse", {AArch64::FeatureLSE} }, - { "predctrl", {AArch64::FeaturePredCtrl} }, - { "ccdp", {AArch64::FeatureCacheDeepPersist} }, - { "mte", {AArch64::FeatureMTE} }, - - // FIXME: Unsupported extensions - { "pan", {} }, - { "lor", {} }, - { "rdma", {} }, - { "profile", {} }, + {"crc", {AArch64::FeatureCRC}}, + {"sm4", {AArch64::FeatureSM4}}, + {"sha3", {AArch64::FeatureSHA3}}, + {"sha2", {AArch64::FeatureSHA2}}, + {"aes", {AArch64::FeatureAES}}, + {"crypto", {AArch64::FeatureCrypto}}, + {"fp", {AArch64::FeatureFPARMv8}}, + {"simd", {AArch64::FeatureNEON}}, + {"ras", {AArch64::FeatureRAS}}, + {"lse", {AArch64::FeatureLSE}}, + {"predctrl", {AArch64::FeaturePredCtrl}}, + {"ccdp", {AArch64::FeatureCacheDeepPersist}}, + {"mte", {AArch64::FeatureMTE}}, + {"tlb-rmi", {AArch64::FeatureTLB_RMI}}, + {"pan-rwv", {AArch64::FeaturePAN_RWV}}, + {"ccpp", {AArch64::FeatureCCPP}}, + // FIXME: Unsupported extensions + {"pan", {}}, + {"lor", {}}, + {"rdma", {}}, + {"profile", {}}, }; - static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { if (FBS[AArch64::HasV8_1aOps]) Str += "ARMv8.1a"; @@ -5026,6 +5028,8 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { parseDirectiveUnreq(Loc); else if (IDVal == ".inst") parseDirectiveInst(Loc); + else if (IDVal == ".cfi_negate_ra_state") + parseDirectiveCFINegateRAState(); else if (IsMachO) { if (IDVal == MCLOHDirectiveName()) parseDirectiveLOH(IDVal, Loc); @@ -5399,6 +5403,13 @@ bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) { return false; } +bool AArch64AsmParser::parseDirectiveCFINegateRAState() { + if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) + return true; + getStreamer().EmitCFINegateRAState(); + return false; +} + bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, AArch64MCExpr::VariantKind &ELFRefKind, diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 58190686c7947c2b60b350c35d202fbb2ce34121..7778882d4915f3b338f6918e3e3ee97c9e8888ce 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(LLVM AArch64GenRegisterBank.inc -gen-register-bank) tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info) tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM AArch64GenSystemOperands.inc -gen-searchable-tables) +tablegen(LLVM AArch64GenExegesis.inc -gen-exegesis) add_public_tablegen_target(AArch64CommonTableGen) @@ -51,6 +52,7 @@ add_llvm_target(AArch64CodeGen AArch64RegisterBankInfo.cpp AArch64RegisterInfo.cpp AArch64SelectionDAGInfo.cpp + AArch64SpeculationHardening.cpp AArch64StorePairSuppress.cpp AArch64Subtarget.cpp AArch64TargetMachine.cpp diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 0e486b9392316a78688c597bcbed616f34b94d54..58e4a9c9a9e9aa7bdc5a0a6b070ed57750acd63f 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -132,4 +132,7 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() { CommentString = "//"; ExceptionsType = ExceptionHandling::DwarfCFI; + // The default is dwarf, but WinEH can be enabled optionally, which requires + // WinEHEncodingType to be set. + WinEHEncodingType = WinEH::EncodingType::Itanium; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 744bb0805d2515041c3539fb6efb24e8685081ba..0f8198ba4e9b679ae5f1105c193ecf099f902656 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -16,6 +16,7 @@ #include "AArch64MCAsmInfo.h" #include "AArch64WinCOFFStreamer.h" #include "InstPrinter/AArch64InstPrinter.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCInstrAnalysis.h" @@ -31,6 +32,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define GET_INSTRINFO_MC_HELPERS #include "AArch64GenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 63f50778ccdb675983d5a4021edf8deb0fcaa540..0f22f69bd5b0a9a954183a5cc0dea29a7fa6f9b5 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -84,6 +84,7 @@ void initLLVMToCVRegMapping(MCRegisterInfo *MRI); // Defines symbolic names for the AArch64 instructions. // #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "AArch64GenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 07e5d97dff90b148fdbe23905ae9f190e075dce2..d26397a4271a0709c91925d70c636b289ec9b827 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -37,10 +37,12 @@ FunctionPass *createAMDGPUCFGStructurizerPass(); FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel); // SI Passes +FunctionPass *createGCNDPPCombinePass(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSIPeepholeSDWAPass(); FunctionPass *createSILowerI1CopiesPass(); +FunctionPass *createSIFixupVectorISelPass(); FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(); FunctionPass *createSIWholeQuadModePass(); @@ -57,6 +59,7 @@ FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); FunctionPass *createAMDGPURewriteOutArgumentsPass(); +FunctionPass *createSIModeRegisterPass(); void initializeAMDGPUDAGToDAGISelPass(PassRegistry&); @@ -92,6 +95,9 @@ extern char &AMDGPULowerKernelAttributesID; void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); extern char &AMDGPURewriteOutArgumentsID; +void initializeGCNDPPCombinePass(PassRegistry &); +extern char &GCNDPPCombineID; + void initializeR600ClauseMergePassPass(PassRegistry &); extern char &R600ClauseMergePassID; @@ -122,6 +128,9 @@ extern char &SIFixSGPRCopiesID; void initializeSIFixVGPRCopiesPass(PassRegistry &); extern char &SIFixVGPRCopiesID; +void initializeSIFixupVectorISelPass(PassRegistry &); +extern char &SIFixupVectorISelID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; @@ -187,6 +196,9 @@ extern char &SIMemoryLegalizerID; void initializeSIDebuggerInsertNopsPass(PassRegistry&); extern char &SIDebuggerInsertNopsID; +void initializeSIModeRegisterPass(PassRegistry&); +extern char &SIModeRegisterID; + void initializeSIInsertWaitcntsPass(PassRegistry&); extern char &SIInsertWaitcntsID; diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 96a8029773d0b5ce53900fd4c4625d76de127346..0aacedf24e32b84b07c189882b9abd19e6a7d028 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -11,6 +11,10 @@ include "llvm/TableGen/SearchableTable.td" include "llvm/Target/Target.td" include "AMDGPUFeatures.td" +class BoolToList { + list ret = !if(Value, [1], []); +} + //===------------------------------------------------------------===// // Subtarget Features (device properties) //===------------------------------------------------------------===// @@ -465,34 +469,41 @@ def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0, [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureCodeObjectV3]>; def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1, [FeatureSouthernIslands, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureCodeObjectV3]>; def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0, [FeatureSeaIslands, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureCodeObjectV3]>; def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1, [FeatureSeaIslands, HalfRate64Ops, FeatureLDSBankCount32, - FeatureFastFMAF32]>; + FeatureFastFMAF32, + FeatureCodeObjectV3]>; def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2, [FeatureSeaIslands, FeatureLDSBankCount16, - FeatureFastFMAF32]>; + FeatureFastFMAF32, + FeatureCodeObjectV3]>; def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3, [FeatureSeaIslands, - FeatureLDSBankCount16]>; + FeatureLDSBankCount16, + FeatureCodeObjectV3]>; def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4, [FeatureSeaIslands, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureCodeObjectV3]>; def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, [FeatureVolcanicIslands, @@ -500,39 +511,46 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, HalfRate64Ops, FeatureLDSBankCount32, FeatureXNACK, - FeatureUnpackedD16VMem]>; + FeatureUnpackedD16VMem, + FeatureCodeObjectV3]>; def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2, [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, - FeatureUnpackedD16VMem]>; + FeatureUnpackedD16VMem, + FeatureCodeObjectV3]>; def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, [FeatureVolcanicIslands, FeatureLDSBankCount32, - FeatureUnpackedD16VMem]>; + FeatureUnpackedD16VMem, + FeatureCodeObjectV3]>; def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, [FeatureVolcanicIslands, FeatureLDSBankCount16, - FeatureXNACK]>; + FeatureXNACK, + FeatureCodeObjectV3]>; def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0, [FeatureGFX9, FeatureMadMixInsts, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureCodeObjectV3]>; def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2, [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK]>; + FeatureXNACK, + FeatureCodeObjectV3]>; def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4, [FeatureGFX9, FeatureLDSBankCount32, - FeatureFmaMixInsts]>; + FeatureFmaMixInsts, + FeatureCodeObjectV3]>; def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6, [FeatureGFX9, @@ -540,13 +558,15 @@ def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6, FeatureFmaMixInsts, FeatureLDSBankCount32, FeatureDLInsts, - FeatureSRAMECC]>; + FeatureSRAMECC, + FeatureCodeObjectV3]>; def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9, [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK]>; + FeatureXNACK, + FeatureCodeObjectV3]>; //===----------------------------------------------------------------------===// // Debugger related subtarget features. @@ -744,7 +764,6 @@ def EnableLateCFGStructurize : Predicate< include "SISchedule.td" include "GCNProcessors.td" include "AMDGPUInstrInfo.td" -include "AMDGPUIntrinsics.td" include "SIIntrinsics.td" include "AMDGPURegisterInfo.td" include "AMDGPURegisterBanks.td" diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index d07c0516c27265e14e6c0dfe84a2ef72f71fff64..2ded7cdb648994026a4990431e5479dc25130b16 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -46,6 +46,7 @@ using namespace llvm; using namespace llvm::AMDGPU; +using namespace llvm::AMDGPU::HSAMD; // TODO: This should get the default rounding mode from the kernel. We just set // the default here, but this could change if the OpenCL rounding mode pragmas @@ -99,6 +100,10 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() { AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) : AsmPrinter(TM, std::move(Streamer)) { + if (IsaInfo::hasCodeObjectV3(getSTI())) + HSAMetadataStream.reset(new MetadataStreamerV3()); + else + HSAMetadataStream.reset(new MetadataStreamerV2()); } StringRef AMDGPUAsmPrinter::getPassName() const { @@ -122,9 +127,6 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS); getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget); - - if (TM.getTargetTriple().getOS() == Triple::AMDHSA) - return; } if (TM.getTargetTriple().getOS() != Triple::AMDHSA && @@ -132,11 +134,14 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { return; if (TM.getTargetTriple().getOS() == Triple::AMDHSA) - HSAMetadataStream.begin(M); + HSAMetadataStream->begin(M); if (TM.getTargetTriple().getOS() == Triple::AMDPAL) readPALMetadata(M); + if (IsaInfo::hasCodeObjectV3(getSTI())) + return; + // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2. if (TM.getTargetTriple().getOS() == Triple::AMDHSA) getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); @@ -148,37 +153,38 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { } void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - // TODO: Add metadata to code object v3. - if (IsaInfo::hasCodeObjectV3(getSTI()) && - TM.getTargetTriple().getOS() == Triple::AMDHSA) - return; - // Following code requires TargetStreamer to be present. if (!getTargetStreamer()) return; - // Emit ISA Version (NT_AMD_AMDGPU_ISA). - std::string ISAVersionString; - raw_string_ostream ISAVersionStream(ISAVersionString); - IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream); - getTargetStreamer()->EmitISAVersion(ISAVersionStream.str()); + if (!IsaInfo::hasCodeObjectV3(getSTI())) { + // Emit ISA Version (NT_AMD_AMDGPU_ISA). + std::string ISAVersionString; + raw_string_ostream ISAVersionStream(ISAVersionString); + IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream); + getTargetStreamer()->EmitISAVersion(ISAVersionStream.str()); + } // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { - HSAMetadataStream.end(); - getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata()); + HSAMetadataStream->end(); + bool Success = HSAMetadataStream->emitTo(*getTargetStreamer()); + (void)Success; + assert(Success && "Malformed HSA Metadata"); } - // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA). - if (TM.getTargetTriple().getOS() == Triple::AMDPAL) { - // Copy the PAL metadata from the map where we collected it into a vector, - // then write it as a .note. - PALMD::Metadata PALMetadataVector; - for (auto i : PALMetadataMap) { - PALMetadataVector.push_back(i.first); - PALMetadataVector.push_back(i.second); + if (!IsaInfo::hasCodeObjectV3(getSTI())) { + // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA). + if (TM.getTargetTriple().getOS() == Triple::AMDPAL) { + // Copy the PAL metadata from the map where we collected it into a vector, + // then write it as a .note. + PALMD::Metadata PALMetadataVector; + for (auto i : PALMetadataMap) { + PALMetadataVector.push_back(i.first); + PALMetadataVector.push_back(i.second); + } + getTargetStreamer()->EmitPALMetadata(PALMetadataVector); } - getTargetStreamer()->EmitPALMetadata(PALMetadataVector); } } @@ -211,11 +217,8 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { getTargetStreamer()->EmitAMDKernelCodeT(KernelCode); } - if (TM.getTargetTriple().getOS() != Triple::AMDHSA) - return; - - if (!STM.hasCodeObjectV3() && STM.isAmdHsaOS()) - HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo); + if (STM.isAmdHsaOS()) + HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); } void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { @@ -911,9 +914,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks( - getSTI(), ProgInfo.NumSGPRsForWavesPerEU); + &STM, ProgInfo.NumSGPRsForWavesPerEU); ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks( - getSTI(), ProgInfo.NumVGPRsForWavesPerEU); + &STM, ProgInfo.NumVGPRsForWavesPerEU); // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 462b5feca6a3f35c761ca139d788f88e948e35e6..167ac4b21e1e20feb3dbcc39d168e2853d21d545 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -56,7 +56,7 @@ private: SIProgramInfo CurrentProgramInfo; DenseMap CallGraphResourceInfo; - AMDGPU::HSAMD::MetadataStreamer HSAMetadataStream; + std::unique_ptr HSAMetadataStream; std::map PALMetadataMap; uint64_t getFunctionCodeSize(const MachineFunction &MF) const; diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index fadc833e0142287424eb223c7b0043778cd1f0a7..c38b0e61558b3d2a2464c776b6dfb43dcb1f3aee 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -16,6 +16,7 @@ #include "AMDGPUHSAMetadataStreamer.h" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIMachineFunctionInfo.h" #include "SIProgramInfo.h" #include "Utils/AMDGPUBaseInfo.h" @@ -36,11 +37,14 @@ static cl::opt VerifyHSAMetadata( namespace AMDGPU { namespace HSAMD { -void MetadataStreamer::dump(StringRef HSAMetadataString) const { +//===----------------------------------------------------------------------===// +// HSAMetadataStreamerV2 +//===----------------------------------------------------------------------===// +void MetadataStreamerV2::dump(StringRef HSAMetadataString) const { errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n'; } -void MetadataStreamer::verify(StringRef HSAMetadataString) const { +void MetadataStreamerV2::verify(StringRef HSAMetadataString) const { errs() << "AMDGPU HSA Metadata Parser Test: "; HSAMD::Metadata FromHSAMetadataString; @@ -63,7 +67,8 @@ void MetadataStreamer::verify(StringRef HSAMetadataString) const { } } -AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const { +AccessQualifier +MetadataStreamerV2::getAccessQualifier(StringRef AccQual) const { if (AccQual.empty()) return AccessQualifier::Unknown; @@ -74,7 +79,8 @@ AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const { .Default(AccessQualifier::Default); } -AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer( +AddressSpaceQualifier +MetadataStreamerV2::getAddressSpaceQualifier( unsigned AddressSpace) const { switch (AddressSpace) { case AMDGPUAS::PRIVATE_ADDRESS: @@ -94,8 +100,8 @@ AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer( } } -ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual, - StringRef BaseTypeName) const { +ValueKind MetadataStreamerV2::getValueKind(Type *Ty, StringRef TypeQual, + StringRef BaseTypeName) const { if (TypeQual.find("pipe") != StringRef::npos) return ValueKind::Pipe; @@ -122,7 +128,7 @@ ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual, ValueKind::ByValue); } -ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const { +ValueType MetadataStreamerV2::getValueType(Type *Ty, StringRef TypeName) const { switch (Ty->getTypeID()) { case Type::IntegerTyID: { auto Signed = !TypeName.startswith("u"); @@ -154,7 +160,7 @@ ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const { } } -std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const { +std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const { switch (Ty->getTypeID()) { case Type::IntegerTyID: { if (!Signed) @@ -191,8 +197,8 @@ std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const { } } -std::vector MetadataStreamer::getWorkGroupDimensions( - MDNode *Node) const { +std::vector +MetadataStreamerV2::getWorkGroupDimensions(MDNode *Node) const { std::vector Dims; if (Node->getNumOperands() != 3) return Dims; @@ -202,9 +208,9 @@ std::vector MetadataStreamer::getWorkGroupDimensions( return Dims; } -Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const { +Kernel::CodeProps::Metadata +MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { const GCNSubtarget &STM = MF.getSubtarget(); const SIMachineFunctionInfo &MFI = *MF.getInfo(); HSAMD::Kernel::CodeProps::Metadata HSACodeProps; @@ -231,9 +237,9 @@ Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps( return HSACodeProps; } -Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const { +Kernel::DebugProps::Metadata +MetadataStreamerV2::getHSADebugProps(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { const GCNSubtarget &STM = MF.getSubtarget(); HSAMD::Kernel::DebugProps::Metadata HSADebugProps; @@ -253,14 +259,14 @@ Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps( return HSADebugProps; } -void MetadataStreamer::emitVersion() { +void MetadataStreamerV2::emitVersion() { auto &Version = HSAMetadata.mVersion; Version.push_back(VersionMajor); Version.push_back(VersionMinor); } -void MetadataStreamer::emitPrintf(const Module &Mod) { +void MetadataStreamerV2::emitPrintf(const Module &Mod) { auto &Printf = HSAMetadata.mPrintf; auto Node = Mod.getNamedMetadata("llvm.printf.fmts"); @@ -272,7 +278,7 @@ void MetadataStreamer::emitPrintf(const Module &Mod) { Printf.push_back(cast(Op->getOperand(0))->getString()); } -void MetadataStreamer::emitKernelLanguage(const Function &Func) { +void MetadataStreamerV2::emitKernelLanguage(const Function &Func) { auto &Kernel = HSAMetadata.mKernels.back(); // TODO: What about other languages? @@ -290,7 +296,7 @@ void MetadataStreamer::emitKernelLanguage(const Function &Func) { mdconst::extract(Op0->getOperand(1))->getZExtValue()); } -void MetadataStreamer::emitKernelAttrs(const Function &Func) { +void MetadataStreamerV2::emitKernelAttrs(const Function &Func) { auto &Attrs = HSAMetadata.mKernels.back().mAttrs; if (auto Node = Func.getMetadata("reqd_work_group_size")) @@ -308,14 +314,14 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) { } } -void MetadataStreamer::emitKernelArgs(const Function &Func) { +void MetadataStreamerV2::emitKernelArgs(const Function &Func) { for (auto &Arg : Func.args()) emitKernelArg(Arg); emitHiddenKernelArgs(Func); } -void MetadataStreamer::emitKernelArg(const Argument &Arg) { +void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { auto Func = Arg.getParent(); auto ArgNo = Arg.getArgNo(); const MDNode *Node; @@ -368,12 +374,12 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) { PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual); } -void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, - ValueKind ValueKind, - unsigned PointeeAlign, - StringRef Name, - StringRef TypeName, StringRef BaseTypeName, - StringRef AccQual, StringRef TypeQual) { +void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty, + ValueKind ValueKind, + unsigned PointeeAlign, StringRef Name, + StringRef TypeName, + StringRef BaseTypeName, + StringRef AccQual, StringRef TypeQual) { HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata()); auto &Arg = HSAMetadata.mKernels.back().mArgs.back(); @@ -386,7 +392,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, Arg.mPointeeAlign = PointeeAlign; if (auto PtrTy = dyn_cast(Ty)) - Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace()); + Arg.mAddrSpaceQual = getAddressSpaceQualifier(PtrTy->getAddressSpace()); Arg.mAccQual = getAccessQualifier(AccQual); @@ -406,7 +412,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, } } -void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) { +void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) { int HiddenArgNumBytes = getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); @@ -448,12 +454,16 @@ void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) { } } -void MetadataStreamer::begin(const Module &Mod) { +bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) { + return TargetStreamer.EmitHSAMetadata(getHSAMetadata()); +} + +void MetadataStreamerV2::begin(const Module &Mod) { emitVersion(); emitPrintf(Mod); } -void MetadataStreamer::end() { +void MetadataStreamerV2::end() { std::string HSAMetadataString; if (toString(HSAMetadata, HSAMetadataString)) return; @@ -464,7 +474,8 @@ void MetadataStreamer::end() { verify(HSAMetadataString); } -void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { +void MetadataStreamerV2::emitKernel(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) { auto &Func = MF.getFunction(); if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL) return; @@ -484,6 +495,505 @@ void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo HSAMetadata.mKernels.back().mDebugProps = DebugProps; } +//===----------------------------------------------------------------------===// +// HSAMetadataStreamerV3 +//===----------------------------------------------------------------------===// + +void MetadataStreamerV3::dump(StringRef HSAMetadataString) const { + errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n'; +} + +void MetadataStreamerV3::verify(StringRef HSAMetadataString) const { + errs() << "AMDGPU HSA Metadata Parser Test: "; + + std::shared_ptr FromHSAMetadataString = + std::make_shared(); + + yaml::Input YIn(HSAMetadataString); + YIn >> FromHSAMetadataString; + if (YIn.error()) { + errs() << "FAIL\n"; + return; + } + + std::string ToHSAMetadataString; + raw_string_ostream StrOS(ToHSAMetadataString); + yaml::Output YOut(StrOS); + YOut << FromHSAMetadataString; + + errs() << (HSAMetadataString == StrOS.str() ? "PASS" : "FAIL") << '\n'; + if (HSAMetadataString != ToHSAMetadataString) { + errs() << "Original input: " << HSAMetadataString << '\n' + << "Produced output: " << StrOS.str() << '\n'; + } +} + +Optional +MetadataStreamerV3::getAccessQualifier(StringRef AccQual) const { + return StringSwitch>(AccQual) + .Case("read_only", StringRef("read_only")) + .Case("write_only", StringRef("write_only")) + .Case("read_write", StringRef("read_write")) + .Default(None); +} + +Optional +MetadataStreamerV3::getAddressSpaceQualifier(unsigned AddressSpace) const { + switch (AddressSpace) { + case AMDGPUAS::PRIVATE_ADDRESS: + return StringRef("private"); + case AMDGPUAS::GLOBAL_ADDRESS: + return StringRef("global"); + case AMDGPUAS::CONSTANT_ADDRESS: + return StringRef("constant"); + case AMDGPUAS::LOCAL_ADDRESS: + return StringRef("local"); + case AMDGPUAS::FLAT_ADDRESS: + return StringRef("generic"); + case AMDGPUAS::REGION_ADDRESS: + return StringRef("region"); + default: + return None; + } +} + +StringRef MetadataStreamerV3::getValueKind(Type *Ty, StringRef TypeQual, + StringRef BaseTypeName) const { + if (TypeQual.find("pipe") != StringRef::npos) + return "pipe"; + + return StringSwitch(BaseTypeName) + .Case("image1d_t", "image") + .Case("image1d_array_t", "image") + .Case("image1d_buffer_t", "image") + .Case("image2d_t", "image") + .Case("image2d_array_t", "image") + .Case("image2d_array_depth_t", "image") + .Case("image2d_array_msaa_t", "image") + .Case("image2d_array_msaa_depth_t", "image") + .Case("image2d_depth_t", "image") + .Case("image2d_msaa_t", "image") + .Case("image2d_msaa_depth_t", "image") + .Case("image3d_t", "image") + .Case("sampler_t", "sampler") + .Case("queue_t", "queue") + .Default(isa(Ty) + ? (Ty->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS + ? "dynamic_shared_pointer" + : "global_buffer") + : "by_value"); +} + +StringRef MetadataStreamerV3::getValueType(Type *Ty, StringRef TypeName) const { + switch (Ty->getTypeID()) { + case Type::IntegerTyID: { + auto Signed = !TypeName.startswith("u"); + switch (Ty->getIntegerBitWidth()) { + case 8: + return Signed ? "i8" : "u8"; + case 16: + return Signed ? "i16" : "u16"; + case 32: + return Signed ? "i32" : "u32"; + case 64: + return Signed ? "i64" : "u64"; + default: + return "struct"; + } + } + case Type::HalfTyID: + return "f16"; + case Type::FloatTyID: + return "f32"; + case Type::DoubleTyID: + return "f64"; + case Type::PointerTyID: + return getValueType(Ty->getPointerElementType(), TypeName); + case Type::VectorTyID: + return getValueType(Ty->getVectorElementType(), TypeName); + default: + return "struct"; + } +} + +std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const { + switch (Ty->getTypeID()) { + case Type::IntegerTyID: { + if (!Signed) + return (Twine('u') + getTypeName(Ty, true)).str(); + + auto BitWidth = Ty->getIntegerBitWidth(); + switch (BitWidth) { + case 8: + return "char"; + case 16: + return "short"; + case 32: + return "int"; + case 64: + return "long"; + default: + return (Twine('i') + Twine(BitWidth)).str(); + } + } + case Type::HalfTyID: + return "half"; + case Type::FloatTyID: + return "float"; + case Type::DoubleTyID: + return "double"; + case Type::VectorTyID: { + auto VecTy = cast(Ty); + auto ElTy = VecTy->getElementType(); + auto NumElements = VecTy->getVectorNumElements(); + return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str(); + } + default: + return "unknown"; + } +} + +std::shared_ptr +MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const { + auto Dims = std::make_shared(); + if (Node->getNumOperands() != 3) + return Dims; + + for (auto &Op : Node->operands()) + Dims->push_back(std::make_shared( + mdconst::extract(Op)->getZExtValue())); + return Dims; +} + +void MetadataStreamerV3::emitVersion() { + auto Version = std::make_shared(); + Version->push_back(std::make_shared(V3::VersionMajor)); + Version->push_back(std::make_shared(V3::VersionMinor)); + getRootMetadata("amdhsa.version") = std::move(Version); +} + +void MetadataStreamerV3::emitPrintf(const Module &Mod) { + auto Node = Mod.getNamedMetadata("llvm.printf.fmts"); + if (!Node) + return; + + auto Printf = std::make_shared(); + for (auto Op : Node->operands()) + if (Op->getNumOperands()) + Printf->push_back(std::make_shared( + cast(Op->getOperand(0))->getString())); + getRootMetadata("amdhsa.printf") = std::move(Printf); +} + +void MetadataStreamerV3::emitKernelLanguage(const Function &Func, + msgpack::MapNode &Kern) { + // TODO: What about other languages? + auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version"); + if (!Node || !Node->getNumOperands()) + return; + auto Op0 = Node->getOperand(0); + if (Op0->getNumOperands() <= 1) + return; + + Kern[".language"] = std::make_shared("OpenCL C"); + auto LanguageVersion = std::make_shared(); + LanguageVersion->push_back(std::make_shared( + mdconst::extract(Op0->getOperand(0))->getZExtValue())); + LanguageVersion->push_back(std::make_shared( + mdconst::extract(Op0->getOperand(1))->getZExtValue())); + Kern[".language_version"] = std::move(LanguageVersion); +} + +void MetadataStreamerV3::emitKernelAttrs(const Function &Func, + msgpack::MapNode &Kern) { + + if (auto Node = Func.getMetadata("reqd_work_group_size")) + Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node); + if (auto Node = Func.getMetadata("work_group_size_hint")) + Kern[".workgroup_size_hint"] = getWorkGroupDimensions(Node); + if (auto Node = Func.getMetadata("vec_type_hint")) { + Kern[".vec_type_hint"] = std::make_shared(getTypeName( + cast(Node->getOperand(0))->getType(), + mdconst::extract(Node->getOperand(1))->getZExtValue())); + } + if (Func.hasFnAttribute("runtime-handle")) { + Kern[".device_enqueue_symbol"] = std::make_shared( + Func.getFnAttribute("runtime-handle").getValueAsString().str()); + } +} + +void MetadataStreamerV3::emitKernelArgs(const Function &Func, + msgpack::MapNode &Kern) { + unsigned Offset = 0; + auto Args = std::make_shared(); + for (auto &Arg : Func.args()) + emitKernelArg(Arg, Offset, *Args); + + emitHiddenKernelArgs(Func, Offset, *Args); + + // TODO: What about other languages? + if (Func.getParent()->getNamedMetadata("opencl.ocl.version")) { + auto &DL = Func.getParent()->getDataLayout(); + auto Int64Ty = Type::getInt64Ty(Func.getContext()); + + emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, *Args); + emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, *Args); + emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, *Args); + + auto Int8PtrTy = + Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); + + // Emit "printf buffer" argument if printf is used, otherwise emit dummy + // "none" argument. + if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) + emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, *Args); + else + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args); + + // Emit "default queue" and "completion action" arguments if enqueue kernel + // is used, otherwise emit dummy "none" arguments. + if (Func.hasFnAttribute("calls-enqueue-kernel")) { + emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, *Args); + emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, *Args); + } else { + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args); + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args); + } + } + + Kern[".args"] = std::move(Args); +} + +void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset, + msgpack::ArrayNode &Args) { + auto Func = Arg.getParent(); + auto ArgNo = Arg.getArgNo(); + const MDNode *Node; + + StringRef Name; + Node = Func->getMetadata("kernel_arg_name"); + if (Node && ArgNo < Node->getNumOperands()) + Name = cast(Node->getOperand(ArgNo))->getString(); + else if (Arg.hasName()) + Name = Arg.getName(); + + StringRef TypeName; + Node = Func->getMetadata("kernel_arg_type"); + if (Node && ArgNo < Node->getNumOperands()) + TypeName = cast(Node->getOperand(ArgNo))->getString(); + + StringRef BaseTypeName; + Node = Func->getMetadata("kernel_arg_base_type"); + if (Node && ArgNo < Node->getNumOperands()) + BaseTypeName = cast(Node->getOperand(ArgNo))->getString(); + + StringRef AccQual; + if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() && + Arg.hasNoAliasAttr()) { + AccQual = "read_only"; + } else { + Node = Func->getMetadata("kernel_arg_access_qual"); + if (Node && ArgNo < Node->getNumOperands()) + AccQual = cast(Node->getOperand(ArgNo))->getString(); + } + + StringRef TypeQual; + Node = Func->getMetadata("kernel_arg_type_qual"); + if (Node && ArgNo < Node->getNumOperands()) + TypeQual = cast(Node->getOperand(ArgNo))->getString(); + + Type *Ty = Arg.getType(); + const DataLayout &DL = Func->getParent()->getDataLayout(); + + unsigned PointeeAlign = 0; + if (auto PtrTy = dyn_cast(Ty)) { + if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + PointeeAlign = Arg.getParamAlignment(); + if (PointeeAlign == 0) + PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType()); + } + } + + emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(), + getValueKind(Arg.getType(), TypeQual, BaseTypeName), Offset, + Args, PointeeAlign, Name, TypeName, BaseTypeName, AccQual, + TypeQual); +} + +void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty, + StringRef ValueKind, unsigned &Offset, + msgpack::ArrayNode &Args, + unsigned PointeeAlign, StringRef Name, + StringRef TypeName, + StringRef BaseTypeName, + StringRef AccQual, StringRef TypeQual) { + auto ArgPtr = std::make_shared(); + auto &Arg = *ArgPtr; + + if (!Name.empty()) + Arg[".name"] = std::make_shared(Name); + if (!TypeName.empty()) + Arg[".type_name"] = std::make_shared(TypeName); + auto Size = DL.getTypeAllocSize(Ty); + auto Align = DL.getABITypeAlignment(Ty); + Arg[".size"] = std::make_shared(Size); + Offset = alignTo(Offset, Align); + Arg[".offset"] = std::make_shared(Offset); + Offset += Size; + Arg[".value_kind"] = std::make_shared(ValueKind); + Arg[".value_type"] = + std::make_shared(getValueType(Ty, BaseTypeName)); + if (PointeeAlign) + Arg[".pointee_align"] = std::make_shared(PointeeAlign); + + if (auto PtrTy = dyn_cast(Ty)) + if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace())) + Arg[".address_space"] = std::make_shared(*Qualifier); + + if (auto AQ = getAccessQualifier(AccQual)) + Arg[".access"] = std::make_shared(*AQ); + + // TODO: Emit Arg[".actual_access"]. + + SmallVector SplitTypeQuals; + TypeQual.split(SplitTypeQuals, " ", -1, false); + for (StringRef Key : SplitTypeQuals) { + if (Key == "const") + Arg[".is_const"] = std::make_shared(true); + else if (Key == "restrict") + Arg[".is_restrict"] = std::make_shared(true); + else if (Key == "volatile") + Arg[".is_volatile"] = std::make_shared(true); + else if (Key == "pipe") + Arg[".is_pipe"] = std::make_shared(true); + } + + Args.push_back(std::move(ArgPtr)); +} + +void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, + unsigned &Offset, + msgpack::ArrayNode &Args) { + int HiddenArgNumBytes = + getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); + + if (!HiddenArgNumBytes) + return; + + auto &DL = Func.getParent()->getDataLayout(); + auto Int64Ty = Type::getInt64Ty(Func.getContext()); + + if (HiddenArgNumBytes >= 8) + emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, Args); + if (HiddenArgNumBytes >= 16) + emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, Args); + if (HiddenArgNumBytes >= 24) + emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, Args); + + auto Int8PtrTy = + Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); + + // Emit "printf buffer" argument if printf is used, otherwise emit dummy + // "none" argument. + if (HiddenArgNumBytes >= 32) { + if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) + emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, Args); + else + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); + } + + // Emit "default queue" and "completion action" arguments if enqueue kernel is + // used, otherwise emit dummy "none" arguments. + if (HiddenArgNumBytes >= 48) { + if (Func.hasFnAttribute("calls-enqueue-kernel")) { + emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, Args); + emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, Args); + } else { + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); + } + } +} + +std::shared_ptr +MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { + const GCNSubtarget &STM = MF.getSubtarget(); + const SIMachineFunctionInfo &MFI = *MF.getInfo(); + const Function &F = MF.getFunction(); + + auto HSAKernelProps = std::make_shared(); + auto &Kern = *HSAKernelProps; + + unsigned MaxKernArgAlign; + Kern[".kernarg_segment_size"] = std::make_shared( + STM.getKernArgSegmentSize(F, MaxKernArgAlign)); + Kern[".group_segment_fixed_size"] = + std::make_shared(ProgramInfo.LDSSize); + Kern[".private_segment_fixed_size"] = + std::make_shared(ProgramInfo.ScratchSize); + Kern[".kernarg_segment_align"] = + std::make_shared(std::max(uint32_t(4), MaxKernArgAlign)); + Kern[".wavefront_size"] = + std::make_shared(STM.getWavefrontSize()); + Kern[".sgpr_count"] = std::make_shared(ProgramInfo.NumSGPR); + Kern[".vgpr_count"] = std::make_shared(ProgramInfo.NumVGPR); + Kern[".max_flat_workgroup_size"] = + std::make_shared(MFI.getMaxFlatWorkGroupSize()); + Kern[".sgpr_spill_count"] = + std::make_shared(MFI.getNumSpilledSGPRs()); + Kern[".vgpr_spill_count"] = + std::make_shared(MFI.getNumSpilledVGPRs()); + + return HSAKernelProps; +} + +bool MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) { + return TargetStreamer.EmitHSAMetadata(getHSAMetadataRoot(), true); +} + +void MetadataStreamerV3::begin(const Module &Mod) { + emitVersion(); + emitPrintf(Mod); + getRootMetadata("amdhsa.kernels").reset(new msgpack::ArrayNode()); +} + +void MetadataStreamerV3::end() { + std::string HSAMetadataString; + raw_string_ostream StrOS(HSAMetadataString); + yaml::Output YOut(StrOS); + YOut << HSAMetadataRoot; + + if (DumpHSAMetadata) + dump(StrOS.str()); + if (VerifyHSAMetadata) + verify(StrOS.str()); +} + +void MetadataStreamerV3::emitKernel(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) { + auto &Func = MF.getFunction(); + auto KernelProps = getHSAKernelProps(MF, ProgramInfo); + + assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL || + Func.getCallingConv() == CallingConv::SPIR_KERNEL); + + auto &KernelsNode = getRootMetadata("amdhsa.kernels"); + auto Kernels = cast(KernelsNode.get()); + + { + auto &Kern = *KernelProps; + Kern[".name"] = std::make_shared(Func.getName()); + Kern[".symbol"] = std::make_shared( + (Twine(Func.getName()) + Twine(".kd")).str()); + emitKernelLanguage(Func, Kern); + emitKernelAttrs(Func, Kern); + emitKernelArgs(Func, Kern); + } + + Kernels->push_back(std::move(KernelProps)); +} + } // end namespace HSAMD } // end namespace AMDGPU } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index a1e08235a5ec4234012080b4a8253ccfd586d56d..afc09baf952d6e7c869cd9435600da4a14bf3ed6 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -19,10 +19,12 @@ #include "AMDGPU.h" #include "AMDKernelCodeT.h" #include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/MsgPackTypes.h" #include "llvm/Support/AMDGPUMetadata.h" namespace llvm { +class AMDGPUTargetStreamer; class Argument; class DataLayout; class Function; @@ -34,7 +36,92 @@ class Type; namespace AMDGPU { namespace HSAMD { -class MetadataStreamer final { +class MetadataStreamer { +public: + virtual ~MetadataStreamer(){}; + + virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0; + + virtual void begin(const Module &Mod) = 0; + + virtual void end() = 0; + + virtual void emitKernel(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) = 0; +}; + +class MetadataStreamerV3 final : public MetadataStreamer { +private: + std::shared_ptr HSAMetadataRoot = + std::make_shared(); + + void dump(StringRef HSAMetadataString) const; + + void verify(StringRef HSAMetadataString) const; + + Optional getAccessQualifier(StringRef AccQual) const; + + Optional getAddressSpaceQualifier(unsigned AddressSpace) const; + + StringRef getValueKind(Type *Ty, StringRef TypeQual, + StringRef BaseTypeName) const; + + StringRef getValueType(Type *Ty, StringRef TypeName) const; + + std::string getTypeName(Type *Ty, bool Signed) const; + + std::shared_ptr + getWorkGroupDimensions(MDNode *Node) const; + + std::shared_ptr + getHSAKernelProps(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const; + + void emitVersion(); + + void emitPrintf(const Module &Mod); + + void emitKernelLanguage(const Function &Func, msgpack::MapNode &Kern); + + void emitKernelAttrs(const Function &Func, msgpack::MapNode &Kern); + + void emitKernelArgs(const Function &Func, msgpack::MapNode &Kern); + + void emitKernelArg(const Argument &Arg, unsigned &Offset, + msgpack::ArrayNode &Args); + + void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind, + unsigned &Offset, msgpack::ArrayNode &Args, + unsigned PointeeAlign = 0, StringRef Name = "", + StringRef TypeName = "", StringRef BaseTypeName = "", + StringRef AccQual = "", StringRef TypeQual = ""); + + void emitHiddenKernelArgs(const Function &Func, unsigned &Offset, + msgpack::ArrayNode &Args); + + std::shared_ptr &getRootMetadata(StringRef Key) { + return (*cast(HSAMetadataRoot.get()))[Key]; + } + + std::shared_ptr &getHSAMetadataRoot() { + return HSAMetadataRoot; + } + +public: + MetadataStreamerV3() = default; + ~MetadataStreamerV3() = default; + + bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; + + void begin(const Module &Mod) override; + + void end() override; + + void emitKernel(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) override; +}; + +class MetadataStreamerV2 final : public MetadataStreamer { private: Metadata HSAMetadata; @@ -44,7 +131,7 @@ private: AccessQualifier getAccessQualifier(StringRef AccQual) const; - AddressSpaceQualifier getAddressSpaceQualifer(unsigned AddressSpace) const; + AddressSpaceQualifier getAddressSpaceQualifier(unsigned AddressSpace) const; ValueKind getValueKind(Type *Ty, StringRef TypeQual, StringRef BaseTypeName) const; @@ -82,19 +169,22 @@ private: void emitHiddenKernelArgs(const Function &Func); -public: - MetadataStreamer() = default; - ~MetadataStreamer() = default; - const Metadata &getHSAMetadata() const { return HSAMetadata; } - void begin(const Module &Mod); +public: + MetadataStreamerV2() = default; + ~MetadataStreamerV2() = default; + + bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; + + void begin(const Module &Mod) override; - void end(); + void end() override; - void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo); + void emitKernel(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) override; }; } // end namespace HSAMD diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index ad0a9e388af894f4ca8d2f6f7e49b8b8950726e0..f5894c9022a6e5e3c142785cb4e640f99fcaec72 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -655,8 +655,11 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { } bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, - ISD::LoadExtType, + ISD::LoadExtType ExtTy, EVT NewVT) const { + // TODO: This may be worth removing. Check regression tests for diffs. + if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT)) + return false; unsigned NewSize = NewVT.getStoreSizeInBits(); diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index b7d1575ca898fc55254532018543dddc26ec4340..282d1c118333629717d6eecbd8f4c0791b5e5193 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -135,6 +135,12 @@ def brtarget : Operand; // Misc. PatFrags //===----------------------------------------------------------------------===// +class HasOneUseUnaryOp : PatFrag< + (ops node:$src0), + (op $src0), + [{ return N->hasOneUse(); }] +>; + class HasOneUseBinOp : PatFrag< (ops node:$src0, node:$src1), (op $src0, $src1), @@ -165,6 +171,8 @@ def or_oneuse : HasOneUseBinOp; def xor_oneuse : HasOneUseBinOp; } // Properties = [SDNPCommutative, SDNPAssociative] +def not_oneuse : HasOneUseUnaryOp; + def add_oneuse : HasOneUseBinOp; def sub_oneuse : HasOneUseBinOp; @@ -796,18 +804,30 @@ class ROTRPattern : AMDGPUPat < (BIT_ALIGN $src0, $src0, $src1) >; -// This matches 16 permutations of -// max(min(x, y), min(max(x, y), z)) -class IntMed3Pat : AMDGPUPat< + SDPatternOperator max_oneuse, + ValueType vt = i32> { + + // This matches 16 permutations of + // min(max(a, b), max(min(a, b), c)) + def : AMDGPUPat < + (min (max_oneuse vt:$src0, vt:$src1), + (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)), + (med3Inst vt:$src0, vt:$src1, vt:$src2) +>; + + // This matches 16 permutations of + // max(min(x, y), min(max(x, y), z)) + def : AMDGPUPat < (max (min_oneuse vt:$src0, vt:$src1), (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), (med3Inst $src0, $src1, $src2) >; - +} + // Special conversion patterns def cvt_rpi_i32_f32 : PatFrag < diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index 896e2055cf620615788f7bb369a8289027d22423..02108ca3ddd783cbe3d355139faec0b2cad4c878 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -40,7 +40,7 @@ StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID, if (IntrID < Intrinsic::num_intrinsics) return StringRef(); - assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && + assert(IntrID < SIIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]; @@ -91,7 +91,7 @@ Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, = cast(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); AttributeList AS = - getAttributes(M->getContext(), static_cast(IntrID)); + getAttributes(M->getContext(), static_cast(IntrID)); F->setAttributes(AS); return F; } diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h index ef42f9a319af69ca3541040e0f15ef09d6e7790a..a1a094dded23d007a277f3a68152a56dbbbb0e60 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -20,7 +20,7 @@ namespace llvm { class TargetMachine; -namespace AMDGPUIntrinsic { +namespace SIIntrinsic { enum ID { last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, #define GET_INTRINSIC_ENUM_VALUES diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 877251a3ac40384ab84fc64b0e0f311355a6e194..62fe0f3a8b91aa38c3e651307940f2667f7fdbf3 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -45,6 +45,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); + const LLT CodePtr = FlatPtr; + const LLT AddrSpaces[] = { GlobalPtr, ConstantPtr, @@ -88,17 +90,31 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, // between these two scenarios. setAction({G_CONSTANT, S1}, Legal); - setAction({G_FADD, S32}, Legal); + setAction({G_FRAME_INDEX, PrivatePtr}, Legal); + + getActionDefinitionsBuilder( + { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA}) + .legalFor({S32, S64}); + + // Use actual fsub instruction + setAction({G_FSUB, S32}, Legal); + + // Must use fadd + fneg + setAction({G_FSUB, S64}, Lower); setAction({G_FCMP, S1}, Legal); setAction({G_FCMP, 1, S32}, Legal); setAction({G_FCMP, 1, S64}, Legal); - setAction({G_FMUL, S32}, Legal); - setAction({G_ZEXT, S64}, Legal); setAction({G_ZEXT, 1, S32}, Legal); + setAction({G_SEXT, S64}, Legal); + setAction({G_SEXT, 1, S32}, Legal); + + setAction({G_ANYEXT, S64}, Legal); + setAction({G_ANYEXT, 1, S32}, Legal); + setAction({G_FPTOSI, S32}, Legal); setAction({G_FPTOSI, 1, S32}, Legal); @@ -114,14 +130,28 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, setAction({G_GEP, 1, IdxTy}, Legal); } + setAction({G_BLOCK_ADDR, CodePtr}, Legal); + setAction({G_ICMP, S1}, Legal); setAction({G_ICMP, 1, S32}, Legal); + setAction({G_CTLZ, S32}, Legal); + setAction({G_CTLZ_ZERO_UNDEF, S32}, Legal); + setAction({G_CTTZ, S32}, Legal); + setAction({G_CTTZ_ZERO_UNDEF, S32}, Legal); + setAction({G_BSWAP, S32}, Legal); + setAction({G_CTPOP, S32}, Legal); + getActionDefinitionsBuilder(G_INTTOPTR) .legalIf([](const LegalityQuery &Query) { return true; }); + getActionDefinitionsBuilder(G_PTRTOINT) + .legalIf([](const LegalityQuery &Query) { + return true; + }); + getActionDefinitionsBuilder({G_LOAD, G_STORE}) .legalIf([=, &ST](const LegalityQuery &Query) { const LLT &Ty0 = Query.Types[0]; @@ -182,6 +212,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, (Ty1.getSizeInBits() % 32 == 0); }); + getActionDefinitionsBuilder(G_BUILD_VECTOR) + .legalIf([=](const LegalityQuery &Query) { + const LLT &VecTy = Query.Types[0]; + const LLT &ScalarTy = Query.Types[1]; + return VecTy.getSizeInBits() % 32 == 0 && + ScalarTy.getSizeInBits() % 32 == 0 && + VecTy.getSizeInBits() <= 512; + }); // Merge/Unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp index f37795e961e806c3ae17178e1cfb43adfb9634b0..4fc3fe0f105b0823a4a5d926ce9576950f87a74e 100644 --- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -995,8 +995,10 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M, } else { AttributeList Attr; LLVMContext &Ctx = M->getContext(); - Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::ReadOnly); - Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::NoUnwind); + Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex, + Attribute::ReadOnly); + Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex, + Attribute::NoUnwind); C = M->getOrInsertFunction(FuncName, FuncTy, Attr); } diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index fae1da9233770c32625df0147f59b3dd3848377d..743dc7a0d00b97fb3b119cb6502687fe35aaafa8 100644 --- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -122,14 +122,17 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { VectorType *VT = dyn_cast(ArgTy); bool IsV3 = VT && VT->getNumElements() == 3; + bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType(); + VectorType *V4Ty = nullptr; int64_t AlignDownOffset = alignDown(EltOffset, 4); int64_t OffsetDiff = EltOffset - AlignDownOffset; - unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset); + unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset, + KernArgBaseAlign); Value *ArgPtr; - if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types + if (DoShiftOpt) { // FIXME: Handle aggregate types // Since we don't have sub-dword scalar loads, avoid doing an extload by // loading earlier than the argument address, and extracting the relevant // bits. @@ -147,7 +150,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { } else { ArgPtr = Builder.CreateConstInBoundsGEP1_64( KernArgSegment, - AlignDownOffset, + EltOffset, Arg.getName() + ".kernarg.offset"); ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS), ArgPtr->getName() + ".cast"); @@ -198,7 +201,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { // TODO: Convert noalias arg to !noalias - if (Size < 32 && !ArgTy->isAggregateType()) { + if (DoShiftOpt) { Value *ExtractBits = OffsetDiff == 0 ? Load : Builder.CreateLShr(Load, OffsetDiff * 8); diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp index 995d9ae3907fc5c2d1126cf2cb311b9a9b0d812b..5e0b7d4290220ad0ac854b1bb71d21407628add1 100644 --- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp +++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -42,9 +42,12 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_, if (!FirstMI) return true; + const MachineBasicBlock &MBB = *FirstMI->getParent(); + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const MachineOperand *Src2 = TII.getNamedOperand(SecondMI, AMDGPU::OpName::src2); - return FirstMI->definesRegister(Src2->getReg()); + return FirstMI->definesRegister(Src2->getReg(), TRI); } default: return false; diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h index b50a2eb8e9e714aea0fec14d1a9386764707dc78..2feff14d34a15864df2e444cc71ec2ade690101a 100644 --- a/lib/Target/AMDGPU/AMDGPUPTNote.h +++ b/lib/Target/AMDGPU/AMDGPUPTNote.h @@ -23,7 +23,8 @@ namespace ElfNote { const char SectionName[] = ".note"; -const char NoteName[] = "AMD"; +const char NoteNameV2[] = "AMD"; +const char NoteNameV3[] = "AMDGPU"; // TODO: Remove this file once we drop code object v2. enum NoteType{ diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 05b714f924b938f630c72b9b42fd2690610bb959..e53c02202df5bb5c101f29502ff941a0bf7db503 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -35,7 +35,7 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) : AMDGPUGenRegisterBankInfo(), TRI(static_cast(&TRI)) { - // HACK: Until this is fully tablegen'd + // HACK: Until this is fully tablegen'd. static bool AlreadyInit = false; if (AlreadyInit) return; @@ -354,9 +354,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { LLVM_FALLTHROUGH; case AMDGPU::G_FADD: + case AMDGPU::G_FSUB: case AMDGPU::G_FPTOSI: case AMDGPU::G_FPTOUI: case AMDGPU::G_FMUL: + case AMDGPU::G_FMA: return getDefaultMappingVOP(MI); case AMDGPU::G_IMPLICIT_DEF: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -364,7 +366,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_FCONSTANT: - case AMDGPU::G_CONSTANT: { + case AMDGPU::G_CONSTANT: + case AMDGPU::G_FRAME_INDEX: + case AMDGPU::G_BLOCK_ADDR: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; @@ -402,8 +406,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); break; } + case AMDGPU::G_BITCAST: case AMDGPU::G_INTTOPTR: - case AMDGPU::G_BITCAST: { + case AMDGPU::G_PTRTOINT: + case AMDGPU::G_CTLZ: + case AMDGPU::G_CTLZ_ZERO_UNDEF: + case AMDGPU::G_CTTZ: + case AMDGPU::G_CTTZ_ZERO_UNDEF: + case AMDGPU::G_CTPOP: + case AMDGPU::G_BSWAP: + case AMDGPU::G_FABS: + case AMDGPU::G_FNEG: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); @@ -419,7 +432,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); break; } - case AMDGPU::G_ZEXT: { + case AMDGPU::G_ZEXT: + case AMDGPU::G_SEXT: + case AMDGPU::G_ANYEXT: { unsigned Dst = MI.getOperand(0).getReg(); unsigned Src = MI.getOperand(1).getReg(); unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 8b1cb23c6722af19e6fe9472e3aeae84d4ff6971..886aca42b6cb1eb3a8a1bbe9747b423b61f69906 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -581,7 +581,8 @@ public: } bool hasCodeObjectV3() const { - return CodeObjectV3; + // FIXME: Need to add code object v3 support for mesa and pal. + return isAmdHsaOS() ? CodeObjectV3 : false; } bool hasUnalignedBufferAccess() const { diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 403dace533a335181d7e0509a86ff821cd1145b9..70d365f4ad75d51734bd338cf3882868785e1251 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -106,6 +106,11 @@ static cl::opt EnableSDWAPeephole( cl::desc("Enable SDWA peepholer"), cl::init(true)); +static cl::opt EnableDPPCombine( + "amdgpu-dpp-combine", + cl::desc("Enable DPP combiner"), + cl::init(false)); + // Enable address space based alias analysis static cl::opt EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), @@ -145,6 +150,13 @@ static cl::opt EnableAtomicOptimizations( cl::init(false), cl::Hidden); +// Enable Mode register optimization +static cl::opt EnableSIModeRegisterPass( + "amdgpu-mode-register", + cl::desc("Enable mode register pass"), + cl::init(true), + cl::Hidden); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -158,9 +170,11 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeR600VectorRegMergerPass(*PR); initializeGlobalISel(*PR); initializeAMDGPUDAGToDAGISelPass(*PR); + initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); + initializeSIFixupVectorISelPass(*PR); initializeSIFoldOperandsPass(*PR); initializeSIPeepholeSDWAPass(*PR); initializeSIShrinkInstructionsPass(*PR); @@ -182,6 +196,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); initializeSIInsertWaitcntsPass(*PR); + initializeSIModeRegisterPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIInsertSkipsPass(*PR); @@ -303,12 +318,6 @@ static Reloc::Model getEffectiveRelocModel(Optional RM) { return Reloc::PIC_; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, @@ -317,7 +326,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OptLevel) : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options, getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM), OptLevel), + getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), TLOF(createTLOF(getTargetTriple())) { initAsmInfo(); } @@ -789,6 +798,8 @@ void GCNPassConfig::addMachineSSAOptimization() { // // XXX - Can we get away without running DeadMachineInstructionElim again? addPass(&SIFoldOperandsID); + if (EnableDPPCombine) + addPass(&GCNDPPCombineID); addPass(&DeadMachineInstructionElimID); addPass(&SILoadStoreOptimizerID); if (EnableSDWAPeephole) { @@ -813,6 +824,7 @@ bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(&SIFixSGPRCopiesID); addPass(createSILowerI1CopiesPass()); + addPass(createSIFixupVectorISelPass()); return false; } @@ -878,7 +890,8 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { void GCNPassConfig::addPostRegAlloc() { addPass(&SIFixVGPRCopiesID); - addPass(&SIOptimizeExecMaskingID); + if (getOptLevel() > CodeGenOpt::None) + addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); } @@ -889,6 +902,7 @@ void GCNPassConfig::addPreEmitPass() { addPass(createSIMemoryLegalizerPass()); addPass(createSIInsertWaitcntsPass()); addPass(createSIShrinkInstructionsPass()); + addPass(createSIModeRegisterPass()); // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp index e2f718bd3c34d5058c04efa6c191ff8ac940219f..7a720717cc800857a67e01da6e8423ad1e2fd2f4 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -29,3 +29,13 @@ MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal( return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM); } + +MCSection *AMDGPUTargetObjectFile::getExplicitSectionGlobal( + const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const { + // Set metadata access for the explicit section + StringRef SectionName = GO->getSection(); + if (SectionName.startswith(".AMDGPU.metadata.")) + SK = SectionKind::getMetadata(); + + return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index dd9dc1a88fc2b68148bc083b9da0560452ccc3bb..a4ae1a2c18c26bdbdace155b7923ca9be4aa83e0 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -26,6 +26,8 @@ class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { public: MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override; + MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind, + const TargetMachine &TM) const override; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index bd94193087207ab8b5f4215ebdc767b77e41f5e2..3f9af27a2e5e1d2fec59755ae1896ff42b2abf50 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -3065,9 +3065,18 @@ bool AMDGPUAsmParser::ParseDirectiveISAVersion() { } bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { + const char *AssemblerDirectiveBegin; + const char *AssemblerDirectiveEnd; + std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) = + AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI()) + ? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin, + HSAMD::V3::AssemblerDirectiveEnd) + : std::make_tuple(HSAMD::AssemblerDirectiveBegin, + HSAMD::AssemblerDirectiveEnd); + if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) { return Error(getParser().getTok().getLoc(), - (Twine(HSAMD::AssemblerDirectiveBegin) + Twine(" directive is " + (Twine(AssemblerDirectiveBegin) + Twine(" directive is " "not available on non-amdhsa OSes")).str()); } @@ -3085,7 +3094,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { if (getLexer().is(AsmToken::Identifier)) { StringRef ID = getLexer().getTok().getIdentifier(); - if (ID == AMDGPU::HSAMD::AssemblerDirectiveEnd) { + if (ID == AssemblerDirectiveEnd) { Lex(); FoundEnd = true; break; @@ -3107,8 +3116,13 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { YamlStream.flush(); - if (!getTargetStreamer().EmitHSAMetadata(HSAMetadataString)) - return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); + if (IsaInfo::hasCodeObjectV3(&getSTI())) { + if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString)) + return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); + } else { + if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString)) + return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); + } return false; } @@ -3145,6 +3159,10 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amdhsa_kernel") return ParseDirectiveAMDHSAKernel(); + + // TODO: Restructure/combine with PAL metadata directive. + if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin) + return ParseDirectiveHSAMetadata(); } else { if (IDVal == ".hsa_code_object_version") return ParseDirectiveHSACodeObjectVersion(); @@ -3160,10 +3178,10 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amd_amdgpu_isa") return ParseDirectiveISAVersion(); - } - if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin) - return ParseDirectiveHSAMetadata(); + if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin) + return ParseDirectiveHSAMetadata(); + } if (IDVal == PALMD::AssemblerDirective) return ParseDirectivePALMetadata(); @@ -5275,12 +5293,14 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } - // All DPP instructions with at least one source operand have a fake "old" - // source at the beginning that's tied to the dst operand. Handle it here. - if (Desc.getNumOperands() >= 2) - Inst.addOperand(Inst.getOperand(0)); - for (unsigned E = Operands.size(); I != E; ++I) { + auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), + MCOI::TIED_TO); + if (TiedTo != -1) { + assert((unsigned)TiedTo < Inst.getNumOperands()); + // handle tied old or src2 for MAC instructions + Inst.addOperand(Inst.getOperand(TiedTo)); + } AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index e48b73b0f1e793d95d11dc9e2b687b859f828650..51c2abeac2ffb07f0b495a81a48cbbf7113e7bd2 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -286,6 +286,12 @@ multiclass MTBUF_Pseudo_Stores { + string ret = !subst("DWORDX2", "DWORD", + !subst("DWORDX3", "DWORD", + !subst("DWORDX4", "DWORD", Op))); +} + class MUBUF_Pseudo pattern=[]> : InstSI, @@ -299,6 +305,9 @@ class MUBUF_Pseudo (NAME); + Instruction BaseOpcode = !cast(MUBUFGetBaseOpcode.ret); + let VM_CNT = 1; let EXP_CNT = 1; let MUBUF = 1; @@ -321,6 +330,7 @@ class MUBUF_Pseudo has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; + bits<4> dwords = 0; } class MUBUF_Real op, MUBUF_Pseudo ps> : @@ -394,6 +404,16 @@ class getMUBUFInsDA vdataList, ); } +class getMUBUFDwords { + string regClassAsInt = !cast(regClass); + int ret = + !if(!eq(regClassAsInt, !cast(VGPR_32)), 1, + !if(!eq(regClassAsInt, !cast(VReg_64)), 2, + !if(!eq(regClassAsInt, !cast(VReg_96)), 3, + !if(!eq(regClassAsInt, !cast(VReg_128)), 4, + 0)))); +} + class getMUBUFIns vdataList=[], bit isLds = 0> { dag ret = !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA.ret, @@ -454,6 +474,7 @@ class MUBUF_Load_Pseudo .ret; } // FIXME: tfe can't be an operand because it requires a separate @@ -517,6 +538,7 @@ class MUBUF_Store_Pseudo .ret; } multiclass MUBUF_Pseudo_Stores; defm : MUBUFScratchLoadPat_Lo16; } - -// BUFFER_LOAD_DWORD*, addr64=0 -multiclass MUBUF_Load_Dword { - - def : GCNPat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, - imm:$offset, 0, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : GCNPat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 1, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; - - def : GCNPat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 0, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : GCNPat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, - imm:$offset, 1, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; -} - -defm : MUBUF_Load_Dword ; -defm : MUBUF_Load_Dword ; -defm : MUBUF_Load_Dword ; - multiclass MUBUFStore_Atomic_Pattern { // Store follows atomic op convention so address is forst @@ -2116,3 +2090,22 @@ let SubtargetPredicate = HasPackedD16VMem in { defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>; defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>; } // End HasUnpackedD16VMem. + +def MUBUFInfoTable : GenericTable { + let FilterClass = "MUBUF_Pseudo"; + let CppTypeName = "MUBUFInfo"; + let Fields = ["Opcode", "BaseOpcode", "dwords", "has_vaddr", "has_srsrc", "has_soffset"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getMUBUFOpcodeHelper"; +} + +def getMUBUFInfoFromOpcode : SearchIndex { + let Table = MUBUFInfoTable; + let Key = ["Opcode"]; +} + +def getMUBUFInfoFromBaseOpcodeAndDwords : SearchIndex { + let Table = MUBUFInfoTable; + let Key = ["BaseOpcode", "dwords"]; +} diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 3c87dc188270364f66e0986062de9f6d6f8e706f..7d1219914823d558b2d4dd28ffcd74cc8fee1ec3 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -96,6 +96,7 @@ add_llvm_target(AMDGPUCodeGen SIAnnotateControlFlow.cpp SIDebuggerInsertNops.cpp SIFixSGPRCopies.cpp + SIFixupVectorISel.cpp SIFixVGPRCopies.cpp SIFixWWMLiveness.cpp SIFoldOperands.cpp @@ -118,6 +119,8 @@ add_llvm_target(AMDGPUCodeGen SIShrinkInstructions.cpp SIWholeQuadMode.cpp GCNILPSched.cpp + GCNDPPCombine.cpp + SIModeRegister.cpp ) add_subdirectory(AsmParser) diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 18e8b8a1c2d1a6e6fd29bc5cda3f4bdcb5f51f5f..44040d352e6a86ef5e6fa979cb6d815c4a5e8872 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -121,6 +121,11 @@ class FLAT_Real op, FLAT_Pseudo ps> : let Inst{63-56} = !if(ps.has_vdst, vdst, ?); } +class GlobalSaddrTable { + bit IsSaddr = is_saddr; + string SaddrOp = Name; +} + // TODO: Is exec allowed for saddr? The disabled value 0x7f is the // same encoding value as exec_hi, so it isn't possible to use that if // saddr is 32-bit (which isn't handled here yet). @@ -171,15 +176,19 @@ class FLAT_Store_Pseudo { let is_flat_global = 1 in { - def "" : FLAT_Load_Pseudo; - def _SADDR : FLAT_Load_Pseudo; + def "" : FLAT_Load_Pseudo, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Load_Pseudo, + GlobalSaddrTable<1, opName>; } } multiclass FLAT_Global_Store_Pseudo { let is_flat_global = 1 in { - def "" : FLAT_Store_Pseudo; - def _SADDR : FLAT_Store_Pseudo; + def "" : FLAT_Store_Pseudo, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Store_Pseudo, + GlobalSaddrTable<1, opName>; } } @@ -262,6 +271,7 @@ multiclass FLAT_Atomic_Pseudo< (outs), (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc), " $vaddr, $vdata$offset$slc">, + GlobalSaddrTable<0, opName>, AtomicNoRet { let PseudoInstr = NAME; } @@ -272,6 +282,7 @@ multiclass FLAT_Atomic_Pseudo< " $vdst, $vaddr, $vdata$offset glc$slc", [(set vt:$vdst, (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, + GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet ; } @@ -287,6 +298,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< (outs), (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc), " $vaddr, $vdata, off$offset$slc">, + GlobalSaddrTable<0, opName>, AtomicNoRet { let has_saddr = 1; let PseudoInstr = NAME; @@ -296,6 +308,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< (outs), (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), " $vaddr, $vdata, $saddr$offset$slc">, + GlobalSaddrTable<1, opName>, AtomicNoRet { let has_saddr = 1; let enabled_saddr = 1; @@ -317,6 +330,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< " $vdst, $vaddr, $vdata, off$offset glc$slc", [(set vt:$vdst, (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, + GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet { let has_saddr = 1; } @@ -325,6 +339,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< (outs vdst_rc:$vdst), (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">, + GlobalSaddrTable<1, opName#"_rtn">, AtomicNoRet { let has_saddr = 1; let enabled_saddr = 1; diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..56071d0d23744dfd0e887e8b6f13cff377cc5c9f --- /dev/null +++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -0,0 +1,446 @@ +//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0 +// operand.If any of the use instruction cannot be combined with the mov the +// whole sequence is reverted. +// +// $old = ... +// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane, +// dpp_controls..., $bound_ctrl +// $res = VALU $dpp_value, ... +// +// to +// +// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ..., +// dpp_controls..., $folded_bound_ctrl +// +// Combining rules : +// +// $bound_ctrl is DPP_BOUND_ZERO, $old is any +// $bound_ctrl is DPP_BOUND_OFF, $old is 0 +// +// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO +// $bound_ctrl is DPP_BOUND_OFF, $old is undef +// +// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF +// $bound_ctrl is DPP_BOUND_OFF, $old is foldable +// +// ->$folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Pass.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "gcn-dpp-combine" + +STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined."); + +namespace { + +class GCNDPPCombine : public MachineFunctionPass { + MachineRegisterInfo *MRI; + const SIInstrInfo *TII; + + using RegSubRegPair = TargetInstrInfo::RegSubRegPair; + + MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const; + + RegSubRegPair foldOldOpnd(MachineInstr &OrigMI, + RegSubRegPair OldOpndVGPR, + MachineOperand &OldOpndValue) const; + + MachineInstr *createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + MachineOperand *OldOpnd, + bool BoundCtrlZero) const; + + MachineInstr *createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + bool BoundCtrlZero) const; + + bool hasNoImmOrEqual(MachineInstr &MI, + unsigned OpndName, + int64_t Value, + int64_t Mask = -1) const; + + bool combineDPPMov(MachineInstr &MI) const; + +public: + static char ID; + + GCNDPPCombine() : MachineFunctionPass(ID) { + initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "GCN DPP Combine"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false) + +char GCNDPPCombine::ID = 0; + +char &llvm::GCNDPPCombineID = GCNDPPCombine::ID; + +FunctionPass *llvm::createGCNDPPCombinePass() { + return new GCNDPPCombine(); +} + +static int getDPPOp(unsigned Op) { + auto DPP32 = AMDGPU::getDPPOp32(Op); + if (DPP32 != -1) + return DPP32; + + auto E32 = AMDGPU::getVOPe32(Op); + return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1; +} + +// tracks the register operand definition and returns: +// 1. immediate operand used to initialize the register if found +// 2. nullptr if the register operand is undef +// 3. the operand itself otherwise +MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { + auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI); + if (!Def) + return nullptr; + + switch(Def->getOpcode()) { + default: break; + case AMDGPU::IMPLICIT_DEF: + return nullptr; + case AMDGPU::COPY: + case AMDGPU::V_MOV_B32_e32: { + auto &Op1 = Def->getOperand(1); + if (Op1.isImm()) + return &Op1; + break; + } + } + return &OldOpnd; +} + +MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + bool BoundCtrlZero) const { + assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() == + TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg()); + + auto OrigOp = OrigMI.getOpcode(); + auto DPPOp = getDPPOp(OrigOp); + if (DPPOp == -1) { + LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n"); + return nullptr; + } + + auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI, + OrigMI.getDebugLoc(), TII->get(DPPOp)); + bool Fail = false; + do { + auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst); + assert(Dst); + DPPInst.add(*Dst); + int NumOperands = 1; + + const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); + if (OldIdx != -1) { + assert(OldIdx == NumOperands); + assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI)); + DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg); + ++NumOperands; + } + + if (auto *Mod0 = TII->getNamedOperand(OrigMI, + AMDGPU::OpName::src0_modifiers)) { + assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, + AMDGPU::OpName::src0_modifiers)); + assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + DPPInst.addImm(Mod0->getImm()); + ++NumOperands; + } + auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); + assert(Src0); + if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) { + LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n"); + Fail = true; + break; + } + DPPInst.add(*Src0); + ++NumOperands; + + if (auto *Mod1 = TII->getNamedOperand(OrigMI, + AMDGPU::OpName::src1_modifiers)) { + assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, + AMDGPU::OpName::src1_modifiers)); + assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + DPPInst.addImm(Mod1->getImm()); + ++NumOperands; + } + if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { + LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n"); + Fail = true; + break; + } + DPPInst.add(*Src1); + ++NumOperands; + } + + if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) { + if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) { + LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n"); + Fail = true; + break; + } + DPPInst.add(*Src2); + } + + DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); + DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); + DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); + DPPInst.addImm(BoundCtrlZero ? 1 : 0); + } while (false); + + if (Fail) { + DPPInst.getInstr()->eraseFromParent(); + return nullptr; + } + LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr()); + return DPPInst.getInstr(); +} + +GCNDPPCombine::RegSubRegPair +GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI, + RegSubRegPair OldOpndVGPR, + MachineOperand &OldOpndValue) const { + assert(OldOpndValue.isImm()); + switch (OrigMI.getOpcode()) { + default: break; + case AMDGPU::V_MAX_U32_e32: + if (OldOpndValue.getImm() == std::numeric_limits::max()) + return OldOpndVGPR; + break; + case AMDGPU::V_MAX_I32_e32: + if (OldOpndValue.getImm() == std::numeric_limits::max()) + return OldOpndVGPR; + break; + case AMDGPU::V_MIN_I32_e32: + if (OldOpndValue.getImm() == std::numeric_limits::min()) + return OldOpndVGPR; + break; + + case AMDGPU::V_MUL_I32_I24_e32: + case AMDGPU::V_MUL_U32_U24_e32: + if (OldOpndValue.getImm() == 1) { + auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); + assert(Src1 && Src1->isReg()); + return getRegSubRegPair(*Src1); + } + break; + } + return RegSubRegPair(); +} + +// Cases to combine: +// $bound_ctrl is DPP_BOUND_ZERO, $old is any +// $bound_ctrl is DPP_BOUND_OFF, $old is 0 +// -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO + +// $bound_ctrl is DPP_BOUND_OFF, $old is undef +// -> $old = undef, $bound_ctrl = DPP_BOUND_OFF + +// $bound_ctrl is DPP_BOUND_OFF, $old is foldable +// -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF + +MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + MachineOperand *OldOpndValue, + bool BoundCtrlZero) const { + assert(OldOpndVGPR.Reg); + if (!BoundCtrlZero && OldOpndValue) { + assert(OldOpndValue->isImm()); + OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue); + if (!OldOpndVGPR.Reg) { + LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n"); + return nullptr; + } + } + return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero); +} + +// returns true if MI doesn't have OpndName immediate operand or the +// operand has Value +bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, + int64_t Value, int64_t Mask) const { + auto *Imm = TII->getNamedOperand(MI, OpndName); + if (!Imm) + return true; + + assert(Imm->isImm()); + return (Imm->getImm() & Mask) == Value; +} + +bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { + assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl); + assert(BCZOpnd && BCZOpnd->isImm()); + bool BoundCtrlZero = 0 != BCZOpnd->getImm(); + + LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); + + auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old); + assert(OldOpnd && OldOpnd->isReg()); + auto OldOpndVGPR = getRegSubRegPair(*OldOpnd); + auto *OldOpndValue = getOldOpndValue(*OldOpnd); + assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd); + if (OldOpndValue) { + if (BoundCtrlZero) { + OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd + OldOpndValue = nullptr; + } else { + if (!OldOpndValue->isImm()) { + LLVM_DEBUG(dbgs() << " failed: old operand isn't an imm or undef\n"); + return false; + } + if (OldOpndValue->getImm() == 0) { + OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef + OldOpndValue = nullptr; + BoundCtrlZero = true; + } + } + } + + LLVM_DEBUG(dbgs() << " old="; + if (!OldOpndValue) + dbgs() << "undef"; + else + dbgs() << OldOpndValue->getImm(); + dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n'); + + std::vector OrigMIs, DPPMIs; + if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef + OldOpndVGPR = RegSubRegPair( + MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass)); + auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg); + DPPMIs.push_back(UndefInst.getInstr()); + } + + OrigMIs.push_back(&MovMI); + bool Rollback = true; + for (auto &Use : MRI->use_nodbg_operands( + TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) { + Rollback = true; + + auto &OrigMI = *Use.getParent(); + auto OrigOp = OrigMI.getOpcode(); + if (TII->isVOP3(OrigOp)) { + if (!TII->hasVALU32BitEncoding(OrigOp)) { + LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n"); + break; + } + // check if other than abs|neg modifiers are set (opsel for example) + const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG); + if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) || + !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) || + !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) || + !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) { + LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n"); + break; + } + } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) { + LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n"); + break; + } + + LLVM_DEBUG(dbgs() << " combining: " << OrigMI); + if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { + if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR, + OldOpndValue, BoundCtrlZero)) { + DPPMIs.push_back(DPPInst); + Rollback = false; + } + } else if (OrigMI.isCommutable() && + &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + auto *BB = OrigMI.getParent(); + auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI); + BB->insert(OrigMI, NewMI); + if (TII->commuteInstruction(*NewMI)) { + LLVM_DEBUG(dbgs() << " commuted: " << *NewMI); + if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR, + OldOpndValue, BoundCtrlZero)) { + DPPMIs.push_back(DPPInst); + Rollback = false; + } + } else + LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n"); + NewMI->eraseFromParent(); + } else + LLVM_DEBUG(dbgs() << " failed: no suitable operands\n"); + if (Rollback) + break; + OrigMIs.push_back(&OrigMI); + } + + for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs)) + MI->eraseFromParent(); + + return !Rollback; +} + +bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { + auto &ST = MF.getSubtarget(); + if (!ST.hasDPP() || skipFunction(MF.getFunction())) + return false; + + MRI = &MF.getRegInfo(); + TII = ST.getInstrInfo(); + + assert(MRI->isSSA() && "Must be run on SSA"); + + bool Changed = false; + for (auto &MBB : MF) { + for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) { + auto &MI = *I++; + if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) { + Changed = true; + ++NumDPPMovsCombined; + } + } + } + return Changed; +} diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt index c54a13c4b4d885f9505dae2e08c4e91ed6fb9e81..e591d756a545e25d914a76efefec29d0f4626232 100644 --- a/lib/Target/AMDGPU/LLVMBuild.txt +++ b/lib/Target/AMDGPU/LLVMBuild.txt @@ -30,5 +30,5 @@ has_disassembler = 1 type = Library name = AMDGPUCodeGen parent = AMDGPU -required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel +required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel BinaryFormat add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 225bf5b7816bf8ab0712f32cf8fcf4999dd9b295..c17fe126546ce97b3cd28cac59fd6f97114f3eca 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -17,7 +17,9 @@ #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDKernelCodeTUtils.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/BinaryFormat/MsgPackTypes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Metadata.h" @@ -35,12 +37,13 @@ namespace llvm { using namespace llvm; using namespace llvm::AMDGPU; +using namespace llvm::AMDGPU::HSAMD; //===----------------------------------------------------------------------===// // AMDGPUTargetStreamer //===----------------------------------------------------------------------===// -bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) { +bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) { HSAMD::Metadata HSAMetadata; if (HSAMD::fromString(HSAMetadataString, HSAMetadata)) return false; @@ -48,6 +51,15 @@ bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) { return EmitHSAMetadata(HSAMetadata); } +bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) { + std::shared_ptr HSAMetadataRoot; + yaml::Input YIn(HSAMetadataString); + YIn >> HSAMetadataRoot; + if (YIn.error()) + return false; + return EmitHSAMetadata(HSAMetadataRoot, false); +} + StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { AMDGPU::GPUKind AK; @@ -195,9 +207,26 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( if (HSAMD::toString(HSAMetadata, HSAMetadataString)) return false; - OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n'; + OS << '\t' << AssemblerDirectiveBegin << '\n'; OS << HSAMetadataString << '\n'; - OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n'; + OS << '\t' << AssemblerDirectiveEnd << '\n'; + return true; +} + +bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( + std::shared_ptr &HSAMetadataRoot, bool Strict) { + V3::MetadataVerifier Verifier(Strict); + if (!Verifier.verify(*HSAMetadataRoot)) + return false; + + std::string HSAMetadataString; + raw_string_ostream StrOS(HSAMetadataString); + yaml::Output YOut(StrOS); + YOut << HSAMetadataRoot; + + OS << '\t' << V3::AssemblerDirectiveBegin << '\n'; + OS << StrOS.str() << '\n'; + OS << '\t' << V3::AssemblerDirectiveEnd << '\n'; return true; } @@ -358,13 +387,13 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { return static_cast(Streamer); } -void AMDGPUTargetELFStreamer::EmitAMDGPUNote( - const MCExpr *DescSZ, unsigned NoteType, +void AMDGPUTargetELFStreamer::EmitNote( + StringRef Name, const MCExpr *DescSZ, unsigned NoteType, function_ref EmitDesc) { auto &S = getStreamer(); auto &Context = S.getContext(); - auto NameSZ = sizeof(ElfNote::NoteName); + auto NameSZ = Name.size() + 1; S.PushSection(); S.SwitchSection(Context.getELFSection( @@ -372,7 +401,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote( S.EmitIntValue(NameSZ, 4); // namesz S.EmitValue(DescSZ, 4); // descz S.EmitIntValue(NoteType, 4); // type - S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ)); // name + S.EmitBytes(Name); // name S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 EmitDesc(S); // desc S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 @@ -384,14 +413,11 @@ void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {} void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion( uint32_t Major, uint32_t Minor) { - EmitAMDGPUNote( - MCConstantExpr::create(8, getContext()), - ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, - [&](MCELFStreamer &OS){ - OS.EmitIntValue(Major, 4); - OS.EmitIntValue(Minor, 4); - } - ); + EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()), + ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) { + OS.EmitIntValue(Major, 4); + OS.EmitIntValue(Minor, 4); + }); } void @@ -407,21 +433,18 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, sizeof(Major) + sizeof(Minor) + sizeof(Stepping) + VendorNameSize + ArchNameSize; - EmitAMDGPUNote( - MCConstantExpr::create(DescSZ, getContext()), - ElfNote::NT_AMDGPU_HSA_ISA, - [&](MCELFStreamer &OS) { - OS.EmitIntValue(VendorNameSize, 2); - OS.EmitIntValue(ArchNameSize, 2); - OS.EmitIntValue(Major, 4); - OS.EmitIntValue(Minor, 4); - OS.EmitIntValue(Stepping, 4); - OS.EmitBytes(VendorName); - OS.EmitIntValue(0, 1); // NULL terminate VendorName - OS.EmitBytes(ArchName); - OS.EmitIntValue(0, 1); // NULL terminte ArchName - } - ); + EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()), + ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) { + OS.EmitIntValue(VendorNameSize, 2); + OS.EmitIntValue(ArchNameSize, 2); + OS.EmitIntValue(Major, 4); + OS.EmitIntValue(Minor, 4); + OS.EmitIntValue(Stepping, 4); + OS.EmitBytes(VendorName); + OS.EmitIntValue(0, 1); // NULL terminate VendorName + OS.EmitBytes(ArchName); + OS.EmitIntValue(0, 1); // NULL terminte ArchName + }); } void @@ -450,15 +473,41 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { MCSymbolRefExpr::create(DescEnd, Context), MCSymbolRefExpr::create(DescBegin, Context), Context); - EmitAMDGPUNote( - DescSZ, - ELF::NT_AMD_AMDGPU_ISA, - [&](MCELFStreamer &OS) { - OS.EmitLabel(DescBegin); - OS.EmitBytes(IsaVersionString); - OS.EmitLabel(DescEnd); - } - ); + EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA, + [&](MCELFStreamer &OS) { + OS.EmitLabel(DescBegin); + OS.EmitBytes(IsaVersionString); + OS.EmitLabel(DescEnd); + }); + return true; +} + +bool AMDGPUTargetELFStreamer::EmitHSAMetadata( + std::shared_ptr &HSAMetadataRoot, bool Strict) { + V3::MetadataVerifier Verifier(Strict); + if (!Verifier.verify(*HSAMetadataRoot)) + return false; + + std::string HSAMetadataString; + raw_string_ostream StrOS(HSAMetadataString); + msgpack::Writer MPWriter(StrOS); + HSAMetadataRoot->write(MPWriter); + + // Create two labels to mark the beginning and end of the desc field + // and a MCExpr to calculate the size of the desc field. + auto &Context = getContext(); + auto *DescBegin = Context.createTempSymbol(); + auto *DescEnd = Context.createTempSymbol(); + auto *DescSZ = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(DescEnd, Context), + MCSymbolRefExpr::create(DescBegin, Context), Context); + + EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA, + [&](MCELFStreamer &OS) { + OS.EmitLabel(DescBegin); + OS.EmitBytes(StrOS.str()); + OS.EmitLabel(DescEnd); + }); return true; } @@ -477,28 +526,24 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata( MCSymbolRefExpr::create(DescEnd, Context), MCSymbolRefExpr::create(DescBegin, Context), Context); - EmitAMDGPUNote( - DescSZ, - ELF::NT_AMD_AMDGPU_HSA_METADATA, - [&](MCELFStreamer &OS) { - OS.EmitLabel(DescBegin); - OS.EmitBytes(HSAMetadataString); - OS.EmitLabel(DescEnd); - } - ); + EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA, + [&](MCELFStreamer &OS) { + OS.EmitLabel(DescBegin); + OS.EmitBytes(HSAMetadataString); + OS.EmitLabel(DescEnd); + }); return true; } bool AMDGPUTargetELFStreamer::EmitPALMetadata( const PALMD::Metadata &PALMetadata) { - EmitAMDGPUNote( - MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), getContext()), - ELF::NT_AMD_AMDGPU_PAL_METADATA, - [&](MCELFStreamer &OS){ - for (auto I : PALMetadata) - OS.EmitIntValue(I, sizeof(uint32_t)); - } - ); + EmitNote(ElfNote::NoteNameV2, + MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), + getContext()), + ELF::NT_AMD_AMDGPU_PAL_METADATA, [&](MCELFStreamer &OS) { + for (auto I : PALMetadata) + OS.EmitIntValue(I, sizeof(uint32_t)); + }); return true; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 2b885592f32be5e2e1607c4696d384397b20cb02..9a807c804f9ff1cddc1a5593900146aef938b411 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -11,6 +11,7 @@ #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H #include "AMDKernelCodeT.h" +#include "llvm/BinaryFormat/MsgPackTypes.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" @@ -52,7 +53,20 @@ public: virtual bool EmitISAVersion(StringRef IsaVersionString) = 0; /// \returns True on success, false on failure. - virtual bool EmitHSAMetadata(StringRef HSAMetadataString); + virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString); + + /// \returns True on success, false on failure. + virtual bool EmitHSAMetadataV3(StringRef HSAMetadataString); + + /// Emit HSA Metadata + /// + /// When \p Strict is true, known metadata elements must already be + /// well-typed. When \p Strict is false, known types are inferred and + /// the \p HSAMetadata structure is updated with the correct types. + /// + /// \returns True on success, false on failure. + virtual bool EmitHSAMetadata(std::shared_ptr &HSAMetadata, + bool Strict) = 0; /// \returns True on success, false on failure. virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0; @@ -91,6 +105,10 @@ public: /// \returns True on success, false on failure. bool EmitISAVersion(StringRef IsaVersionString) override; + /// \returns True on success, false on failure. + bool EmitHSAMetadata(std::shared_ptr &HSAMetadata, + bool Strict) override; + /// \returns True on success, false on failure. bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; @@ -107,8 +125,8 @@ public: class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { MCStreamer &Streamer; - void EmitAMDGPUNote(const MCExpr *DescSize, unsigned NoteType, - function_ref EmitDesc); + void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType, + function_ref EmitDesc); public: AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI); @@ -131,6 +149,10 @@ public: /// \returns True on success, false on failure. bool EmitISAVersion(StringRef IsaVersionString) override; + /// \returns True on success, false on failure. + bool EmitHSAMetadata(std::shared_ptr &HSAMetadata, + bool Strict) override; + /// \returns True on success, false on failure. bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; diff --git a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt index 773ee7c0a4ba752933147dea790c3750d0558528..bc910a470d72a412ed799b1d0b7bd8a4027b3773 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = AMDGPUDesc parent = AMDGPU -required_libraries = Core MC AMDGPUAsmPrinter AMDGPUInfo AMDGPUUtils Support +required_libraries = Core MC AMDGPUAsmPrinter AMDGPUInfo AMDGPUUtils Support BinaryFormat add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index 1683fe6c9a571a5dcd286eeaa8edeb6b00f5ad0e..679cf18d2c20b970f1828686bc164919126c004c 100644 --- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -226,11 +226,11 @@ private: // occur in the same basic block as its definition, because // it is illegal for the scheduler to schedule them in // different blocks. - if (UseI->readsRegister(MOI->getReg())) + if (UseI->readsRegister(MOI->getReg(), &TRI)) LastUseCount = AluInstCount; // Exit early if the current use kills the register - if (UseI != Def && UseI->killsRegister(MOI->getReg())) + if (UseI != Def && UseI->killsRegister(MOI->getReg(), &TRI)) break; } if (LastUseCount) diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index 97228965e68c3658d948c60869b7683d5760047d..9cc3e5f3c314c929b50f9a6defef686d7af14cbf 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -229,11 +229,11 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { } bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const { - return MI.findRegisterUseOperandIdx(R600::AR_X) != -1; + return MI.findRegisterUseOperandIdx(R600::AR_X, false, &RI) != -1; } bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const { - return MI.findRegisterDefOperandIdx(R600::AR_X) != -1; + return MI.findRegisterDefOperandIdx(R600::AR_X, false, false, &RI) != -1; } bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index a6d28d6999e5f6e4c15f14ca1e12780e55065d45..7f6abc34cff3a2f7685fb10f65a8c31be47be4f5 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -88,7 +88,10 @@ enum : uint64_t { IsPacked = UINT64_C(1) << 49, // Is a D16 buffer instruction. - D16Buf = UINT64_C(1) << 50 + D16Buf = UINT64_C(1) << 50, + + // Uses floating point double precision rounding mode + FPDPRounding = UINT64_C(1) << 51 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ee39eb04d8316c6af5eb8006114cf935170b9298 --- /dev/null +++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp @@ -0,0 +1,231 @@ +//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +/// SIFixupVectorISel pass cleans up post ISEL Vector issues. +/// Currently this will convert GLOBAL_{LOAD|STORE}_* +/// and GLOBAL_Atomic_* instructions into their _SADDR variants, +/// feeding the sreg into the saddr field of the new instruction. +/// We currently handle a REG_SEQUENCE feeding the vaddr +/// and decompose it into a base and index. +/// +/// Transform: +/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21:sgpr_32, %22:vgpr_32 +/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32, +/// %24:vgpr_32, %19:sreg_64_xexec +/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1 +/// %11:vreg_64 = COPY %16:vreg_64 +/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0 +/// Into: +/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0 +/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1 +/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16... +/// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" +#define DEBUG_TYPE "si-fixup-vector-isel" + +using namespace llvm; + +static cl::opt EnableGlobalSGPRAddr( + "amdgpu-enable-global-sgpr-addr", + cl::desc("Enable use of SGPR regs for GLOBAL LOAD/STORE instructions"), + cl::init(false)); + +STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities"); +STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted"); + +namespace { + +class SIFixupVectorISel : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixupVectorISel() : MachineFunctionPass(ID) { + initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE, + "SI Fixup Vector ISel", false, false) + +char SIFixupVectorISel::ID = 0; + +char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID; + +FunctionPass *llvm::createSIFixupVectorISelPass() { + return new SIFixupVectorISel(); +} + +static bool findSRegBaseAndIndex(MachineOperand *Op, + unsigned &BaseReg, + unsigned &IndexReg, + MachineRegisterInfo &MRI, + const SIRegisterInfo *TRI) { + SmallVector Worklist; + Worklist.push_back(Op); + while (!Worklist.empty()) { + MachineOperand *WOp = Worklist.pop_back_val(); + if (!WOp->isReg() || + !TargetRegisterInfo::isVirtualRegister(WOp->getReg())) + continue; + MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg()); + switch (DefInst->getOpcode()) { + default: + continue; + case AMDGPU::COPY: + Worklist.push_back(&DefInst->getOperand(1)); + break; + case AMDGPU::REG_SEQUENCE: + if (DefInst->getNumOperands() != 5) + continue; + Worklist.push_back(&DefInst->getOperand(1)); + Worklist.push_back(&DefInst->getOperand(3)); + break; + case AMDGPU::V_ADD_I32_e64: + // The V_ADD_* and its analogous V_ADDCV_* are generated by + // a previous pass which lowered from an ADD_64_PSEUDO, + // which generates subregs to break up the 64 bit args. + if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister) + continue; + BaseReg = DefInst->getOperand(2).getReg(); + if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister) + continue; + IndexReg = DefInst->getOperand(3).getReg(); + // Chase the IndexReg. + MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg); + if (!MI || !MI->isCopy()) + continue; + // Make sure the reg class is 64 bit for Index. + // If the Index register is a subreg, we want it to reference + // a 64 bit register which we will use as the Index reg. + const TargetRegisterClass *IdxRC, *BaseRC; + IdxRC = MRI.getRegClass(MI->getOperand(1).getReg()); + if (AMDGPU::getRegBitWidth(IdxRC->getID()) != 64) + continue; + IndexReg = MI->getOperand(1).getReg(); + // Chase the BaseReg. + MI = MRI.getUniqueVRegDef(BaseReg); + if (!MI || !MI->isCopy()) + continue; + // Make sure the register class is 64 bit for Base. + BaseReg = MI->getOperand(1).getReg(); + BaseRC = MRI.getRegClass(BaseReg); + if (AMDGPU::getRegBitWidth(BaseRC->getID()) != 64) + continue; + // Make sure Base is SReg and Index is VReg. + if (!TRI->isSGPRReg(MRI, BaseReg)) + return false; + if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg))) + return false; + // clear any killed flags on Index and Base regs, used later. + MRI.clearKillFlags(IndexReg); + MRI.clearKillFlags(BaseReg); + return true; + } + } + return false; +} + +// Identify Global LOAD|STORE/ATOMIC and try to convert to _SADDR. +static bool fixupGlobalSaddr(MachineBasicBlock &MBB, + MachineFunction &MF, + MachineRegisterInfo &MRI, + const GCNSubtarget &ST, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI) { + if (!EnableGlobalSGPRAddr) + return false; + bool FuncModified = false; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode()); + if (NewOpcd < 0) + continue; + // Update our statistics on opportunities seen. + ++NumSGPRGlobalOccurs; + LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n'); + // Need a Base and Index or we cant transform to _SADDR. + unsigned BaseReg = 0; + unsigned IndexReg = 0; + MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); + if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI)) + continue; + ++NumSGPRGlobalSaddrs; + FuncModified = true; + // Create the new _SADDR Memory instruction. + bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr; + MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata); + MachineInstr *NewGlob = nullptr; + NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd)); + if (HasVdst) + NewGlob->addOperand(MF, MI.getOperand(0)); + NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false)); + if (VData) + NewGlob->addOperand(MF, *VData); + NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false)); + NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset)); + + MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc); + // Atomics dont have a GLC, so omit the field if not there. + if (Glc) + NewGlob->addOperand(MF, *Glc); + NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc)); + // _D16 have an vdst_in operand, copy it in. + MachineOperand *VDstInOp = TII->getNamedOperand(MI, + AMDGPU::OpName::vdst_in); + if (VDstInOp) + NewGlob->addOperand(MF, *VDstInOp); + NewGlob->copyImplicitOps(MF, MI); + NewGlob->cloneMemRefs(MF, MI); + // Remove the old Global Memop instruction. + MI.eraseFromParent(); + LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n'); + } + return FuncModified; +} + +bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + bool FuncModified = false; + for (MachineBasicBlock &MBB : MF) { + // Cleanup missed Saddr opportunites from ISel. + FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI); + } + return FuncModified; +} diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 254f1362f1f0f3cabed56208b33b3243ddd6e5c7..12d0bc528ca570c13034dd724b2487065bfc9c79 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -679,6 +679,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -4847,6 +4848,71 @@ SDValue SITargetLowering::lowerImage(SDValue Op, return SDValue(NewNode, 0); } +SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, + SDValue Offset, SDValue GLC, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + VT.getStoreSize(), VT.getStoreSize()); + + if (!Offset->isDivergent()) { + SDValue Ops[] = { + Rsrc, + Offset, // Offset + GLC // glc + }; + return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, + DAG.getVTList(VT), Ops, VT, MMO); + } + + // We have a divergent offset. Emit a MUBUF buffer load instead. We can + // assume that the buffer is unswizzled. + SmallVector Loads; + unsigned NumLoads = 1; + MVT LoadVT = VT.getSimpleVT(); + unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; + assert((LoadVT.getScalarType() == MVT::i32 || + LoadVT.getScalarType() == MVT::f32) && + isPowerOf2_32(NumElts)); + + if (NumElts == 8 || NumElts == 16) { + NumLoads = NumElts == 16 ? 4 : 2; + LoadVT = MVT::v4i32; + } + + SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); + unsigned CachePolicy = cast(GLC)->getZExtValue(); + SDValue Ops[] = { + DAG.getEntryNode(), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + {}, // voffset + {}, // soffset + {}, // offset + DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy + DAG.getConstant(0, DL, MVT::i1), // idxen + }; + + // Use the alignment to ensure that the required offsets will fit into the + // immediate offsets. + setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4); + + uint64_t InstOffset = cast(Ops[5])->getZExtValue(); + for (unsigned i = 0; i < NumLoads; ++i) { + Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32); + Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, + Ops, LoadVT, MMO)); + } + + if (VT == MVT::v8i32 || VT == MVT::v16i32) + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads); + + return Loads[0]; +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -5000,39 +5066,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDZ); - case AMDGPUIntrinsic::SI_load_const: { - SDValue Ops[] = { - Op.getOperand(1), // Ptr - Op.getOperand(2), // Offset - DAG.getTargetConstant(0, DL, MVT::i1) // glc - }; - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant, - VT.getStoreSize(), 4); - SDVTList VTList = DAG.getVTList(MVT::i32); - SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, - VTList, Ops, MVT::i32, MMO); - + case SIIntrinsic::SI_load_const: { + SDValue Load = + lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2), + DAG.getTargetConstant(0, DL, MVT::i1), DAG); return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load); } case Intrinsic::amdgcn_s_buffer_load: { unsigned Cache = cast(Op.getOperand(3))->getZExtValue(); - SDValue Ops[] = { - Op.getOperand(1), // Ptr - Op.getOperand(2), // Offset - DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc - }; - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant, - VT.getStoreSize(), VT.getStoreSize()); - return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, - Op->getVTList(), Ops, VT, MMO); + return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), + DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG); } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); @@ -5766,19 +5809,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, Op.getOperand(2), Op.getOperand(3)); } - case AMDGPUIntrinsic::AMDGPU_kill: { - SDValue Src = Op.getOperand(2); - if (const ConstantFPSDNode *K = dyn_cast(Src)) { - if (!K->isNegative()) - return Chain; - - SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32); - return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne); - } - - SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); - return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast); - } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { const GCNSubtarget &ST = MF.getSubtarget(); @@ -5789,55 +5819,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } return SDValue(); }; - case AMDGPUIntrinsic::SI_tbuffer_store: { - - // Extract vindex and voffset from vaddr as appropriate - const ConstantSDNode *OffEn = cast(Op.getOperand(10)); - const ConstantSDNode *IdxEn = cast(Op.getOperand(11)); - SDValue VAddr = Op.getOperand(5); - - SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); - - assert(!(OffEn->isOne() && IdxEn->isOne()) && - "Legacy intrinsic doesn't support both offset and index - use new version"); - - SDValue VIndex = IdxEn->isOne() ? VAddr : Zero; - SDValue VOffset = OffEn->isOne() ? VAddr : Zero; - - // Deal with the vec-3 case - const ConstantSDNode *NumChannels = cast(Op.getOperand(4)); - auto Opcode = NumChannels->getZExtValue() == 3 ? - AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT; - - unsigned Dfmt = cast(Op.getOperand(8))->getZExtValue(); - unsigned Nfmt = cast(Op.getOperand(9))->getZExtValue(); - unsigned Glc = cast(Op.getOperand(12))->getZExtValue(); - unsigned Slc = cast(Op.getOperand(13))->getZExtValue(); - SDValue Ops[] = { - Chain, - Op.getOperand(3), // vdata - Op.getOperand(2), // rsrc - VIndex, - VOffset, - Op.getOperand(6), // soffset - Op.getOperand(7), // inst_offset - DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn->isOne(), DL, MVT::i1), // idxen - }; - - assert((cast(Op.getOperand(14)))->getZExtValue() == 0 && - "Value of tfe other than zero is unsupported"); - - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(Opcode, DL, - Op->getVTList(), Ops, VT, MMO); - } - case Intrinsic::amdgcn_tbuffer_store: { SDValue VData = Op.getOperand(2); bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); @@ -6067,13 +6048,13 @@ std::pair SITargetLowering::splitBufferOffsets( // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, - SelectionDAG &DAG, - SDValue *Offsets) const { + SelectionDAG &DAG, SDValue *Offsets, + unsigned Align) const { SDLoc DL(CombinedOffset); if (auto C = dyn_cast(CombinedOffset)) { uint32_t Imm = C->getZExtValue(); uint32_t SOffset, ImmOffset; - if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) { + if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) { Offsets[0] = DAG.getConstant(0, DL, MVT::i32); Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32); @@ -6085,8 +6066,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, SDValue N1 = CombinedOffset.getOperand(1); uint32_t SOffset, ImmOffset; int Offset = cast(N1)->getSExtValue(); - if (Offset >= 0 - && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) { + if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, + Subtarget, Align)) { Offsets[0] = N0; Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32); @@ -8025,7 +8006,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine( switch(Opc) { default: - return SDValue(); + break; // TODO: Support other binary operations. case ISD::FADD: case ISD::FSUB: @@ -8051,12 +8032,34 @@ SDValue SITargetLowering::performExtractVectorEltCombine( } } - if (!DCI.isBeforeLegalize()) - return SDValue(); - unsigned VecSize = VecVT.getSizeInBits(); unsigned EltSize = EltVT.getSizeInBits(); + // EXTRACT_VECTOR_ELT (, var-idx) => n x select (e, const-idx) + // This elminates non-constant index and subsequent movrel or scratch access. + // Sub-dword vectors of size 2 dword or less have better implementation. + // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 + // instructions. + if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) && + !isa(N->getOperand(1))) { + SDLoc SL(N); + SDValue Idx = N->getOperand(1); + EVT IdxVT = Idx.getValueType(); + SDValue V; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { + SDValue IC = DAG.getConstant(I, SL, IdxVT); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC); + if (I == 0) + V = Elt; + else + V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ); + } + return V; + } + + if (!DCI.isBeforeLegalize()) + return SDValue(); + // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit // elements. This exposes more load reduction opportunities by replacing // multiple small extract_vector_elements with a single 32-bit extract. @@ -8092,6 +8095,42 @@ SDValue SITargetLowering::performExtractVectorEltCombine( return SDValue(); } +SDValue +SITargetLowering::performInsertVectorEltCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Vec = N->getOperand(0); + SDValue Idx = N->getOperand(2); + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + unsigned VecSize = VecVT.getSizeInBits(); + unsigned EltSize = EltVT.getSizeInBits(); + + // INSERT_VECTOR_ELT (, var-idx) + // => BUILD_VECTOR n x select (e, const-idx) + // This elminates non-constant index and subsequent movrel or scratch access. + // Sub-dword vectors of size 2 dword or less have better implementation. + // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 + // instructions. + if (isa(Idx) || + VecSize > 256 || (VecSize <= 64 && EltSize < 32)) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + SDValue Ins = N->getOperand(1); + EVT IdxVT = Idx.getValueType(); + + SmallVector Ops; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { + SDValue IC = DAG.getConstant(I, SL, IdxVT); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC); + SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ); + Ops.push_back(V); + } + + return DAG.getBuildVector(VecVT, SL, Ops); +} + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -8578,6 +8617,9 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (getTargetMachine().getOptLevel() == CodeGenOpt::None) + return SDValue(); + switch (N->getOpcode()) { default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); @@ -8603,12 +8645,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::UMAX: case ISD::UMIN: case AMDGPUISD::FMIN_LEGACY: - case AMDGPUISD::FMAX_LEGACY: { - if (//DCI.getDAGCombineLevel() >= AfterLegalizeDAG && - getTargetMachine().getOptLevel() > CodeGenOpt::None) - return performMinMaxCombine(N, DCI); - break; - } + case AMDGPUISD::FMAX_LEGACY: + return performMinMaxCombine(N, DCI); case ISD::FMA: return performFMACombine(N, DCI); case ISD::LOAD: { @@ -8700,6 +8738,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } case ISD::EXTRACT_VECTOR_ELT: return performExtractVectorEltCombine(N, DCI); + case ISD::INSERT_VECTOR_ELT: + return performInsertVectorEltCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } @@ -9243,42 +9283,49 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); } +LLVM_ATTRIBUTE_UNUSED +static bool isCopyFromRegOfInlineAsm(const SDNode *N) { + assert(N->getOpcode() == ISD::CopyFromReg); + do { + // Follow the chain until we find an INLINEASM node. + N = N->getOperand(0).getNode(); + if (N->getOpcode() == ISD::INLINEASM) + return true; + } while (N->getOpcode() == ISD::CopyFromReg); + return false; +} + bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const { switch (N->getOpcode()) { - case ISD::Register: case ISD::CopyFromReg: { - const RegisterSDNode *R = nullptr; - if (N->getOpcode() == ISD::Register) { - R = dyn_cast(N); - } - else { - R = dyn_cast(N->getOperand(1)); - } - if (R) - { - const MachineFunction * MF = FLI->MF; - const GCNSubtarget &ST = MF->getSubtarget(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); - const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); - unsigned Reg = R->getReg(); - if (TRI.isPhysicalRegister(Reg)) - return TRI.isVGPR(MRI, Reg); - - if (MRI.isLiveIn(Reg)) { - // workitem.id.x workitem.id.y workitem.id.z - // Any VGPR formal argument is also considered divergent - if (TRI.isVGPR(MRI, Reg)) - return true; - // Formal arguments of non-entry functions - // are conservatively considered divergent - else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) - return true; - } - return !KDA || KDA->isDivergent(FLI->getValueFromVirtualReg(Reg)); + const RegisterSDNode *R = cast(N->getOperand(1)); + const MachineFunction * MF = FLI->MF; + const GCNSubtarget &ST = MF->getSubtarget(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); + unsigned Reg = R->getReg(); + if (TRI.isPhysicalRegister(Reg)) + return !TRI.isSGPRReg(MRI, Reg); + + if (MRI.isLiveIn(Reg)) { + // workitem.id.x workitem.id.y workitem.id.z + // Any VGPR formal argument is also considered divergent + if (!TRI.isSGPRReg(MRI, Reg)) + return true; + // Formal arguments of non-entry functions + // are conservatively considered divergent + else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) + return true; + return false; } + const Value *V = FLI->getValueFromVirtualReg(Reg); + if (V) + return KDA->isDivergent(V); + assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); + return !TRI.isSGPRReg(MRI, Reg); } break; case ISD::LOAD: { diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 73fa05ea58f51e063b40ba419ad823ad7345d7ef..bcef519ee6635ddb6b1d2f12096cce80a7ef57ef 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -60,6 +60,8 @@ private: MVT VT, unsigned Offset) const; SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG) const; + SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset, + SDValue GLC, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; @@ -152,6 +154,7 @@ private: SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; @@ -167,7 +170,6 @@ private: SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; - bool isLegalGlobalAddressingMode(const AddrMode &AM) const; bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; unsigned isCFIntrinsic(const SDNode *Intr) const; @@ -190,7 +192,7 @@ private: // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, - SDValue *Offsets) const; + SDValue *Offsets, unsigned Align = 4) const; public: SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI); @@ -209,6 +211,7 @@ public: SmallVectorImpl &/*Ops*/, Type *&/*AccessTy*/) const override; + bool isLegalGlobalAddressingMode(const AddrMode &AM) const; bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I = nullptr) const override; diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp index dc9397cf7b85e900ad6b308ee35788d7607d2915..ba21a5ce1293a0d139ddf2089ba53e9fc1b06e66 100644 --- a/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -66,6 +66,8 @@ private: bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); + bool optimizeVccBranch(MachineInstr &MI) const; + public: static char ID; @@ -320,6 +322,96 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, return true; } +bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const { + // Match: + // sreg = -1 + // vcc = S_AND_B64 exec, sreg + // S_CBRANCH_VCC[N]Z + // => + // S_CBRANCH_EXEC[N]Z + bool Changed = false; + MachineBasicBlock &MBB = *MI.getParent(); + const unsigned CondReg = AMDGPU::VCC; + const unsigned ExecReg = AMDGPU::EXEC; + const unsigned And = AMDGPU::S_AND_B64; + + MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(), + E = MBB.rend(); + bool ReadsCond = false; + unsigned Threshold = 5; + for (++A ; A != E ; ++A) { + if (!--Threshold) + return false; + if (A->modifiesRegister(ExecReg, TRI)) + return false; + if (A->modifiesRegister(CondReg, TRI)) { + if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And) + return false; + break; + } + ReadsCond |= A->readsRegister(CondReg, TRI); + } + if (A == E) + return false; + + MachineOperand &Op1 = A->getOperand(1); + MachineOperand &Op2 = A->getOperand(2); + if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) { + TII->commuteInstruction(*A); + Changed = true; + } + if (Op1.getReg() != ExecReg) + return Changed; + if (Op2.isImm() && Op2.getImm() != -1) + return Changed; + + unsigned SReg = AMDGPU::NoRegister; + if (Op2.isReg()) { + SReg = Op2.getReg(); + auto M = std::next(A); + bool ReadsSreg = false; + for ( ; M != E ; ++M) { + if (M->definesRegister(SReg, TRI)) + break; + if (M->modifiesRegister(SReg, TRI)) + return Changed; + ReadsSreg |= M->readsRegister(SReg, TRI); + } + if (M == E || + !M->isMoveImmediate() || + !M->getOperand(1).isImm() || + M->getOperand(1).getImm() != -1) + return Changed; + // First if sreg is only used in and instruction fold the immediate + // into that and. + if (!ReadsSreg && Op2.isKill()) { + A->getOperand(2).ChangeToImmediate(-1); + M->eraseFromParent(); + } + } + + if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) && + MI.killsRegister(CondReg, TRI)) + A->eraseFromParent(); + + bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ; + if (SReg == ExecReg) { + if (IsVCCZ) { + MI.eraseFromParent(); + return true; + } + MI.setDesc(TII->get(AMDGPU::S_BRANCH)); + } else { + MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ + : AMDGPU::S_CBRANCH_EXECNZ)); + } + + MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); + MI.addImplicitDefUseOperands(*MBB.getParent()); + + return true; +} + bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -384,7 +476,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { kill(MI); if (ExecBranchStack.empty()) { - if (skipIfDead(MI, *NextBB)) { + if (NextBB != BE && skipIfDead(MI, *NextBB)) { HaveSkipBlock = true; NextBB = std::next(BI); BE = MF.end(); @@ -417,6 +509,11 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { } break; + case AMDGPU::S_CBRANCH_VCCZ: + case AMDGPU::S_CBRANCH_VCCNZ: + MadeChange |= optimizeVccBranch(MI); + break; + default: break; } diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index eb39984f795910e38acf036c3856bd5c968d7456..79674046fce1bcf9dfba54254655664670bf5e55 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -13,6 +13,14 @@ /// Memory reads and writes are issued asynchronously, so we need to insert /// S_WAITCNT instructions when we want to access any of their results or /// overwrite any register that's used asynchronously. +/// +/// TODO: This pass currently keeps one timeline per hardware counter. A more +/// finely-grained approach that keeps one timeline per event type could +/// sometimes get away with generating weaker s_waitcnt instructions. For +/// example, when both SMEM and LDS are in flight and we need to wait for +/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient, +/// but the pass will currently generate a conservative lgkmcnt(0) because +/// multiple event types are in flight. // //===----------------------------------------------------------------------===// @@ -33,7 +41,6 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -69,6 +76,25 @@ static cl::opt ForceEmitZeroFlag( namespace { +template +class enum_iterator + : public iterator_facade_base, + std::forward_iterator_tag, const EnumT> { + EnumT Value; +public: + enum_iterator() = default; + enum_iterator(EnumT Value) : Value(Value) {} + + enum_iterator &operator++() { + Value = static_cast(Value + 1); + return *this; + } + + bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; } + + EnumT operator*() const { return Value; } +}; + // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether // s_waitcnt instruction needs to be emited. @@ -77,12 +103,17 @@ namespace { enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS }; +iterator_range> inst_counter_types() { + return make_range(enum_iterator(VM_CNT), + enum_iterator(NUM_INST_CNTS)); +} + using RegInterval = std::pair; struct { - int32_t VmcntMax; - int32_t ExpcntMax; - int32_t LgkmcntMax; + uint32_t VmcntMax; + uint32_t ExpcntMax; + uint32_t LgkmcntMax; int32_t NumVGPRsMax; int32_t NumSGPRsMax; } HardwareLimits; @@ -108,6 +139,14 @@ enum WaitEventType { NUM_WAIT_EVENTS, }; +static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = { + (1 << VMEM_ACCESS), + (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | + (1 << SQ_MESSAGE), + (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | + (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS), +}; + // The mapping is: // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots @@ -122,30 +161,38 @@ enum RegisterMapping { NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. }; -#define ForAllWaitEventType(w) \ - for (enum WaitEventType w = (enum WaitEventType)0; \ - (w) < (enum WaitEventType)NUM_WAIT_EVENTS; \ - (w) = (enum WaitEventType)((w) + 1)) +void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { + switch (T) { + case VM_CNT: + Wait.VmCnt = std::min(Wait.VmCnt, Count); + break; + case EXP_CNT: + Wait.ExpCnt = std::min(Wait.ExpCnt, Count); + break; + case LGKM_CNT: + Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count); + break; + default: + llvm_unreachable("bad InstCounterType"); + } +} -// This is a per-basic-block object that maintains current score brackets -// of each wait counter, and a per-register scoreboard for each wait counter. +// This objects maintains the current score brackets of each wait counter, and +// a per-register scoreboard for each wait counter. +// // We also maintain the latest score for every event type that can change the // waitcnt in order to know if there are multiple types of events within // the brackets. When multiple types of event happen in the bracket, // wait count may get decreased out of order, therefore we need to put in // "s_waitcnt 0" before use. -class BlockWaitcntBrackets { +class WaitcntBrackets { public: - BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { + WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) { + for (auto T : inst_counter_types()) memset(VgprScores[T], 0, sizeof(VgprScores[T])); - } } - ~BlockWaitcntBrackets() = default; - - static int32_t getWaitCountMax(InstCounterType T) { + static uint32_t getWaitCountMax(InstCounterType T) { switch (T) { case VM_CNT: return HardwareLimits.VmcntMax; @@ -159,33 +206,14 @@ public: return 0; } - void setScoreLB(InstCounterType T, int32_t Val) { - assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return; - ScoreLBs[T] = Val; - } - - void setScoreUB(InstCounterType T, int32_t Val) { - assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return; - ScoreUBs[T] = Val; - if (T == EXP_CNT) { - int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT)); - if (ScoreLBs[T] < UB) - ScoreLBs[T] = UB; - } - } - - int32_t getScoreLB(InstCounterType T) { + uint32_t getScoreLB(InstCounterType T) const { assert(T < NUM_INST_CNTS); if (T >= NUM_INST_CNTS) return 0; return ScoreLBs[T]; } - int32_t getScoreUB(InstCounterType T) { + uint32_t getScoreUB(InstCounterType T) const { assert(T < NUM_INST_CNTS); if (T >= NUM_INST_CNTS) return 0; @@ -194,89 +222,56 @@ public: // Mapping from event to counter. InstCounterType eventCounter(WaitEventType E) { - switch (E) { - case VMEM_ACCESS: + if (E == VMEM_ACCESS) return VM_CNT; - case LDS_ACCESS: - case GDS_ACCESS: - case SQ_MESSAGE: - case SMEM_ACCESS: + if (WaitEventMaskForInst[LGKM_CNT] & (1 << E)) return LGKM_CNT; - case EXP_GPR_LOCK: - case GDS_GPR_LOCK: - case VMW_GPR_LOCK: - case EXP_POS_ACCESS: - case EXP_PARAM_ACCESS: - return EXP_CNT; - default: - llvm_unreachable("unhandled event type"); - } - return NUM_INST_CNTS; - } - - void setRegScore(int GprNo, InstCounterType T, int32_t Val) { - if (GprNo < NUM_ALL_VGPRS) { - if (GprNo > VgprUB) { - VgprUB = GprNo; - } - VgprScores[T][GprNo] = Val; - } else { - assert(T == LGKM_CNT); - if (GprNo - NUM_ALL_VGPRS > SgprUB) { - SgprUB = GprNo - NUM_ALL_VGPRS; - } - SgprScores[GprNo - NUM_ALL_VGPRS] = Val; - } + assert(WaitEventMaskForInst[EXP_CNT] & (1 << E)); + return EXP_CNT; } - int32_t getRegScore(int GprNo, InstCounterType T) { + uint32_t getRegScore(int GprNo, InstCounterType T) { if (GprNo < NUM_ALL_VGPRS) { return VgprScores[T][GprNo]; } + assert(T == LGKM_CNT); return SgprScores[GprNo - NUM_ALL_VGPRS]; } void clear() { memset(ScoreLBs, 0, sizeof(ScoreLBs)); memset(ScoreUBs, 0, sizeof(ScoreUBs)); - memset(EventUBs, 0, sizeof(EventUBs)); - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { + PendingEvents = 0; + memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents)); + for (auto T : inst_counter_types()) memset(VgprScores[T], 0, sizeof(VgprScores[T])); - } memset(SgprScores, 0, sizeof(SgprScores)); } + bool merge(const WaitcntBrackets &Other); + RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, const MachineRegisterInfo *MRI, const SIRegisterInfo *TRI, unsigned OpNo, bool Def) const; - void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, - const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, - unsigned OpNo, int32_t Val); - - void setWaitAtBeginning() { WaitAtBeginning = true; } - void clearWaitAtBeginning() { WaitAtBeginning = false; } - bool getWaitAtBeginning() const { return WaitAtBeginning; } - void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; } int32_t getMaxVGPR() const { return VgprUB; } int32_t getMaxSGPR() const { return SgprUB; } - int32_t getEventUB(enum WaitEventType W) const { - assert(W < NUM_WAIT_EVENTS); - return EventUBs[W]; - } - - bool counterOutOfOrder(InstCounterType T); - unsigned int updateByWait(InstCounterType T, int ScoreToWait); + bool counterOutOfOrder(InstCounterType T) const; + bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + void determineWait(InstCounterType T, uint32_t ScoreToWait, + AMDGPU::Waitcnt &Wait) const; + void applyWaitcnt(const AMDGPU::Waitcnt &Wait); + void applyWaitcnt(InstCounterType T, unsigned Count); void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &MI); - bool hasPendingSMEM() const { - return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]); + bool hasPending() const { return PendingEvents != 0; } + bool hasPendingEvent(WaitEventType E) const { + return PendingEvents & (1 << E); } bool hasPendingFlat() const { @@ -291,75 +286,71 @@ public: LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; } - int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; } - - void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; } - - bool getRevisitLoop() const { return RevisitLoop; } - void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; } + void print(raw_ostream &); + void dump() { print(dbgs()); } - void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; } - int32_t getPostOrder() const { return PostOrder; } +private: + struct MergeInfo { + uint32_t OldLB; + uint32_t OtherLB; + uint32_t MyShift; + uint32_t OtherShift; + }; + static bool mergeScore(const MergeInfo &M, uint32_t &Score, + uint32_t OtherScore); + + void setScoreLB(InstCounterType T, uint32_t Val) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return; + ScoreLBs[T] = Val; + } - void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; } - void clearWaitcnt() { Waitcnt = nullptr; } - MachineInstr *getWaitcnt() const { return Waitcnt; } + void setScoreUB(InstCounterType T, uint32_t Val) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return; + ScoreUBs[T] = Val; + if (T == EXP_CNT) { + uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT); + if (ScoreLBs[T] < UB && UB < ScoreUBs[T]) + ScoreLBs[T] = UB; + } + } - bool mixedExpTypes() const { return MixedExpTypes; } - void setMixedExpTypes(bool MixedExpTypesIn) { - MixedExpTypes = MixedExpTypesIn; + void setRegScore(int GprNo, InstCounterType T, uint32_t Val) { + if (GprNo < NUM_ALL_VGPRS) { + if (GprNo > VgprUB) { + VgprUB = GprNo; + } + VgprScores[T][GprNo] = Val; + } else { + assert(T == LGKM_CNT); + if (GprNo - NUM_ALL_VGPRS > SgprUB) { + SgprUB = GprNo - NUM_ALL_VGPRS; + } + SgprScores[GprNo - NUM_ALL_VGPRS] = Val; + } } - void print(raw_ostream &); - void dump() { print(dbgs()); } + void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, + const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, + unsigned OpNo, uint32_t Val); -private: const GCNSubtarget *ST = nullptr; - bool WaitAtBeginning = false; - bool RevisitLoop = false; - bool MixedExpTypes = false; - int32_t PostOrder = 0; - MachineInstr *Waitcnt = nullptr; - int32_t ScoreLBs[NUM_INST_CNTS] = {0}; - int32_t ScoreUBs[NUM_INST_CNTS] = {0}; - int32_t EventUBs[NUM_WAIT_EVENTS] = {0}; + uint32_t ScoreLBs[NUM_INST_CNTS] = {0}; + uint32_t ScoreUBs[NUM_INST_CNTS] = {0}; + uint32_t PendingEvents = 0; + bool MixedPendingEvents[NUM_INST_CNTS] = {false}; // Remember the last flat memory operation. - int32_t LastFlat[NUM_INST_CNTS] = {0}; + uint32_t LastFlat[NUM_INST_CNTS] = {0}; // wait_cnt scores for every vgpr. // Keep track of the VgprUB and SgprUB to make merge at join efficient. int32_t VgprUB = 0; int32_t SgprUB = 0; - int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS]; + uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS]; // Wait cnt scores for every sgpr, only lgkmcnt is relevant. - int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0}; -}; - -// This is a per-loop-region object that records waitcnt status at the end of -// loop footer from the previous iteration. We also maintain an iteration -// count to track the number of times the loop has been visited. When it -// doesn't converge naturally, we force convergence by inserting s_waitcnt 0 -// at the end of the loop footer. -class LoopWaitcntData { -public: - LoopWaitcntData() = default; - ~LoopWaitcntData() = default; - - void incIterCnt() { IterCnt++; } - void resetIterCnt() { IterCnt = 0; } - unsigned getIterCnt() { return IterCnt; } - - void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; } - MachineInstr *getWaitcnt() const { return LfWaitcnt; } - - void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); } - -private: - // s_waitcnt added at the end of loop footer to stablize wait scores - // at the end of the loop footer. - MachineInstr *LfWaitcnt = nullptr; - // Number of iterations the loop has been visited, not including the initial - // walk over. - int32_t IterCnt = 0; + uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0}; }; class SIInsertWaitcnts : public MachineFunctionPass { @@ -368,19 +359,21 @@ private: const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; - const MachineLoopInfo *MLI = nullptr; AMDGPU::IsaVersion IV; - DenseSet BlockVisitedSet; DenseSet TrackedWaitcntSet; DenseSet VCCZBugHandledSet; - DenseMap> - BlockWaitcntBracketsMap; + struct BlockInfo { + MachineBasicBlock *MBB; + std::unique_ptr Incoming; + bool Dirty = true; - std::vector BlockWaitcntProcessedSet; + explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {} + }; - DenseMap> LoopWaitcntDataMap; + std::vector BlockInfos; // by reverse post-order traversal index + DenseMap RpotIdxMap; // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 // because of amdgpu-waitcnt-forcezero flag @@ -404,13 +397,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } bool isForceEmitWaitcnt() const { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) + for (auto T : inst_counter_types()) if (ForceEmitWaitcnt[T]) return true; return false; @@ -444,27 +435,22 @@ public: } bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - void generateWaitcntInstBefore(MachineInstr &MI, - BlockWaitcntBrackets *ScoreBrackets); + bool generateWaitcntInstBefore(MachineInstr &MI, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr); void updateEventWaitcntAfter(MachineInstr &Inst, - BlockWaitcntBrackets *ScoreBrackets); - void mergeInputScoreBrackets(MachineBasicBlock &Block); - bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block); - unsigned countNumBottomBlocks(const MachineLoop *Loop); - void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block); - void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst); - bool isWaitcntStronger(unsigned LHS, unsigned RHS); - unsigned combineWaitcnt(unsigned LHS, unsigned RHS); + WaitcntBrackets *ScoreBrackets); + bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, + WaitcntBrackets &ScoreBrackets); }; } // end anonymous namespace -RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI, - const SIInstrInfo *TII, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, - unsigned OpNo, - bool Def) const { +RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, + const SIInstrInfo *TII, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, + unsigned OpNo, bool Def) const { const MachineOperand &Op = MI->getOperand(OpNo); if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) || (Def && !Op.isDef())) @@ -502,11 +488,11 @@ RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI, return Result; } -void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - unsigned OpNo, int32_t Val) { +void WaitcntBrackets::setExpScore(const MachineInstr *MI, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, unsigned OpNo, + uint32_t Val) { RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false); LLVM_DEBUG({ const MachineOperand &Opnd = MI->getOperand(OpNo); @@ -517,26 +503,26 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI, } } -void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - WaitEventType E, MachineInstr &Inst) { +void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + WaitEventType E, MachineInstr &Inst) { const MachineRegisterInfo &MRIA = *MRI; InstCounterType T = eventCounter(E); - int32_t CurrScore = getScoreUB(T) + 1; - // EventUB and ScoreUB need to be update regardless if this event changes - // the score of a register or not. + uint32_t CurrScore = getScoreUB(T) + 1; + if (CurrScore == 0) + report_fatal_error("InsertWaitcnt score wraparound"); + // PendingEvents and ScoreUB need to be update regardless if this event + // changes the score of a register or not. // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. - EventUBs[E] = CurrScore; + if (!hasPendingEvent(E)) { + if (PendingEvents & WaitEventMaskForInst[T]) + MixedPendingEvents[T] = true; + PendingEvents |= 1 << E; + } setScoreUB(T, CurrScore); if (T == EXP_CNT) { - // Check for mixed export types. If they are mixed, then a waitcnt exp(0) - // is required. - if (!MixedExpTypes) { - MixedExpTypes = counterOutOfOrder(EXP_CNT); - } - // Put score on the source vgprs. If this is a store, just use those // specific register(s). if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { @@ -661,12 +647,11 @@ void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } } -void BlockWaitcntBrackets::print(raw_ostream &OS) { +void WaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - int LB = getScoreLB(T); - int UB = getScoreUB(T); + for (auto T : inst_counter_types()) { + uint32_t LB = getScoreLB(T); + uint32_t UB = getScoreUB(T); switch (T) { case VM_CNT: @@ -686,10 +671,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) { if (LB < UB) { // Print vgpr scores. for (int J = 0; J <= getMaxVGPR(); J++) { - int RegScore = getRegScore(J, T); + uint32_t RegScore = getRegScore(J, T); if (RegScore <= LB) continue; - int RelScore = RegScore - LB - 1; + uint32_t RelScore = RegScore - LB - 1; if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { OS << RelScore << ":v" << J << " "; } else { @@ -699,10 +684,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) { // Also need to print sgpr scores for lgkm_cnt. if (T == LGKM_CNT) { for (int J = 0; J <= getMaxSGPR(); J++) { - int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); if (RegScore <= LB) continue; - int RelScore = RegScore - LB - 1; + uint32_t RelScore = RegScore - LB - 1; OS << RelScore << ":s" << J << " "; } } @@ -712,23 +697,31 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; } -unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T, - int ScoreToWait) { - unsigned int NeedWait = 0; - if (ScoreToWait == -1) { - // The score to wait is unknown. This implies that it was not encountered - // during the path of the CFG walk done during the current traversal but - // may be seen on a different path. Emit an s_wait counter with a - // conservative value of 0 for the counter. - NeedWait = CNT_MASK(T); - setScoreLB(T, getScoreUB(T)); - return NeedWait; - } +/// Simplify the waitcnt, in the sense of removing redundant counts, and return +/// whether a waitcnt instruction is needed at all. +bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { + return simplifyWaitcnt(VM_CNT, Wait.VmCnt) | + simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) | + simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt); +} + +bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T, + unsigned &Count) const { + const uint32_t LB = getScoreLB(T); + const uint32_t UB = getScoreUB(T); + if (Count < UB && UB - Count > LB) + return true; + Count = ~0u; + return false; +} + +void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait, + AMDGPU::Waitcnt &Wait) const { // If the score of src_operand falls within the bracket, we need an // s_waitcnt instruction. - const int32_t LB = getScoreLB(T); - const int32_t UB = getScoreUB(T); + const uint32_t LB = getScoreLB(T); + const uint32_t UB = getScoreUB(T); if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { if ((T == VM_CNT || T == LGKM_CNT) && hasPendingFlat() && @@ -736,90 +729,46 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T, // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need // to force a waitcnt 0. - NeedWait = CNT_MASK(T); - setScoreLB(T, getScoreUB(T)); + addWait(Wait, T, 0); } else if (counterOutOfOrder(T)) { // Counter can get decremented out-of-order when there // are multiple types event in the bracket. Also emit an s_wait counter // with a conservative value of 0 for the counter. - NeedWait = CNT_MASK(T); - setScoreLB(T, getScoreUB(T)); + addWait(Wait, T, 0); } else { - NeedWait = CNT_MASK(T); - setScoreLB(T, ScoreToWait); + addWait(Wait, T, UB - ScoreToWait); } } +} - return NeedWait; +void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { + applyWaitcnt(VM_CNT, Wait.VmCnt); + applyWaitcnt(EXP_CNT, Wait.ExpCnt); + applyWaitcnt(LGKM_CNT, Wait.LgkmCnt); } -// Where there are multiple types of event in the bracket of a counter, -// the decrement may go out of order. -bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) { - switch (T) { - case VM_CNT: - return false; - case LGKM_CNT: { - if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) { - // Scalar memory read always can go out of order. - return true; - } - int NumEventTypes = 0; - if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { - NumEventTypes++; - } - if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { - NumEventTypes++; - } - if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] && - EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) { - NumEventTypes++; - } - if (NumEventTypes <= 1) { - return false; - } - break; +void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { + const uint32_t UB = getScoreUB(T); + if (Count >= UB) + return; + if (Count != 0) { + if (counterOutOfOrder(T)) + return; + setScoreLB(T, std::max(getScoreLB(T), UB - Count)); + } else { + setScoreLB(T, UB); + MixedPendingEvents[T] = false; + PendingEvents &= ~WaitEventMaskForInst[T]; } - case EXP_CNT: { - // If there has been a mixture of export types, then a waitcnt exp(0) is - // required. - if (MixedExpTypes) - return true; - int NumEventTypes = 0; - if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] && - EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] && - EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] && - EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] && - EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - - if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] && - EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } +} - if (NumEventTypes <= 1) { - return false; - } - break; - } - default: - break; - } - return true; +// Where there are multiple types of event in the bracket of a counter, +// the decrement may go out of order. +bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { + // Scalar memory read always can go out of order. + if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS)) + return true; + return MixedPendingEvents[T]; } INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, @@ -841,29 +790,6 @@ static bool readsVCCZ(const MachineInstr &MI) { !MI.getOperand(1).isUndef(); } -/// Given wait count encodings checks if LHS is stronger than RHS. -bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) { - if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS)) - return false; - if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS)) - return false; - if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS)) - return false; - return true; -} - -/// Given wait count encodings create a new encoding which is stronger -/// or equal to both. -unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) { - unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS), - AMDGPU::decodeVmcnt(IV, RHS)); - unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS), - AMDGPU::decodeLgkmcnt(IV, RHS)); - unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS), - AMDGPU::decodeExpcnt(IV, RHS)); - return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt); -} - /// Generate s_waitcnt instruction to be placed before cur_Inst. /// Instructions of a given type are returned in order, /// but instructions of different types can complete out of order. @@ -874,41 +800,23 @@ unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) { /// and if so what the value of each counter is. /// The "score bracket" is bound by the lower bound and upper bound /// scores (*_score_LB and *_score_ub respectively). -void SIInsertWaitcnts::generateWaitcntInstBefore( - MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) { - // To emit, or not to emit - that's the question! - // Start with an assumption that there is no need to emit. - unsigned int EmitWaitcnt = 0; - - // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug - bool ForceEmitZeroWaitcnt = false; - +bool SIInsertWaitcnts::generateWaitcntInstBefore( + MachineInstr &MI, WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr) { setForceEmitWaitcnt(); bool IsForceEmitWaitcnt = isForceEmitWaitcnt(); if (MI.isDebugInstr()) - return; + return false; - // See if an s_waitcnt is forced at block entry, or is needed at - // program end. - if (ScoreBrackets->getWaitAtBeginning()) { - // Note that we have already cleared the state, so we don't need to update - // it. - ScoreBrackets->clearWaitAtBeginning(); - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - EmitWaitcnt |= CNT_MASK(T); - ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - } - } + AMDGPU::Waitcnt Wait; // See if this instruction has a forced S_WAITCNT VM. // TODO: Handle other cases of NeedsWaitcntVmBefore() - else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { - EmitWaitcnt |= - ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); + if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || + MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || + MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { + Wait.VmCnt = 0; } // All waits must be resolved at call return. @@ -916,23 +824,14 @@ void SIInsertWaitcnts::generateWaitcntInstBefore( // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { - ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - EmitWaitcnt |= CNT_MASK(T); - } - } + Wait = AMDGPU::Waitcnt::allZero(); } // Resolve vm waits before gs-done. else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) == AMDGPU::SendMsg::ID_GS_DONE)) { - if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) { - ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - EmitWaitcnt |= CNT_MASK(VM_CNT); - } + Wait.VmCnt = 0; } #if 0 // TODO: the following blocks of logic when we have fence. else if (MI.getOpcode() == SC_FENCE) { @@ -996,14 +895,12 @@ void SIInsertWaitcnts::generateWaitcntInstBefore( if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { // Export and GDS are tracked individually, either may trigger a waitcnt // for EXEC. - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK)); + if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) || + ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) || + ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) || + ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) { + Wait.ExpCnt = 0; + } } #if 0 // TODO: the following code to handle CALL. @@ -1035,23 +932,23 @@ void SIInsertWaitcnts::generateWaitcntInstBefore( continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. - EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); } for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { const MachineOperand &Op = MI.getOperand(I); const MachineRegisterInfo &MRIA = *MRI; RegInterval Interval = - ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false); + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Op.getReg())) { // VM_CNT is only relevant to vgpr or LDS. - EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); } - EmitWaitcnt |= ScoreBrackets->updateByWait( - LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); + ScoreBrackets.determineWait( + LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } } // End of for loop that looks at all source operands to decide vm_wait_cnt @@ -1069,26 +966,26 @@ void SIInsertWaitcnts::generateWaitcntInstBefore( if (AS != AMDGPUAS::LOCAL_ADDRESS) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait( + EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } } for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { MachineOperand &Def = MI.getOperand(I); const MachineRegisterInfo &MRIA = *MRI; RegInterval Interval = - ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true); + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Def.getReg())) { - EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait( + EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } - EmitWaitcnt |= ScoreBrackets->updateByWait( - LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); + ScoreBrackets.determineWait( + LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } } // End of for loop that looks at all dest operands. } @@ -1099,172 +996,79 @@ void SIInsertWaitcnts::generateWaitcntInstBefore( // requiring a WAITCNT beforehand. if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier()) { - EmitWaitcnt |= - ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); + Wait = AMDGPU::Waitcnt::allZero(); } // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) { - if (ScoreBrackets->getScoreLB(LGKM_CNT) < - ScoreBrackets->getScoreUB(LGKM_CNT) && - ScoreBrackets->hasPendingSMEM()) { - // Wait on everything, not just LGKM. vccz reads usually come from - // terminators, and we always wait on everything at the end of the - // block, so if we only wait on LGKM here, we might end up with - // another s_waitcnt inserted right after this if there are non-LGKM - // instructions still outstanding. - // FIXME: this is too conservative / the comment is wrong. - // We don't wait on everything at the end of the block and we combine - // waitcnts so we should never have back-to-back waitcnts. - ForceEmitZeroWaitcnt = true; - EmitWaitcnt = true; + if (ScoreBrackets.getScoreLB(LGKM_CNT) < + ScoreBrackets.getScoreUB(LGKM_CNT) && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { + Wait.LgkmCnt = 0; } } - // Does this operand processing indicate s_wait counter update? - if (EmitWaitcnt || IsForceEmitWaitcnt) { - int CntVal[NUM_INST_CNTS]; - - if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) { - // Force all waitcnts to 0. - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - } - CntVal[VM_CNT] = 0; - CntVal[EXP_CNT] = 0; - CntVal[LGKM_CNT] = 0; - } else { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - if (EmitWaitcnt & CNT_MASK(T)) { - int Delta = - ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T); - int MaxDelta = ScoreBrackets->getWaitCountMax(T); - if (Delta >= MaxDelta) { - Delta = -1; - if (T != EXP_CNT) { - ScoreBrackets->setScoreLB( - T, ScoreBrackets->getScoreUB(T) - MaxDelta); - } - EmitWaitcnt &= ~CNT_MASK(T); - } - CntVal[T] = Delta; - } else { - // If we are not waiting for a particular counter then encode - // it as -1 which means "don't care." - CntVal[T] = -1; - } + // Early-out if no wait is indicated. + if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) { + bool Modified = false; + if (OldWaitcntInstr) { + if (TrackedWaitcntSet.count(OldWaitcntInstr)) { + TrackedWaitcntSet.erase(OldWaitcntInstr); + OldWaitcntInstr->eraseFromParent(); + Modified = true; + } else { + int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); + ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); } + Modified = true; } + return Modified; + } - MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt(); - int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm(); - if (!OldWaitcnt || - (AMDGPU::decodeVmcnt(IV, Imm) != - (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) || - (AMDGPU::decodeExpcnt(IV, Imm) != - (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) || - (AMDGPU::decodeLgkmcnt(IV, Imm) != - (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) { - MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent()); - if (ContainingLoop) { - MachineBasicBlock *TBB = ContainingLoop->getHeader(); - BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); - if (!ScoreBracket) { - assert(!BlockVisitedSet.count(TBB)); - BlockWaitcntBracketsMap[TBB] = - llvm::make_unique(ST); - ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); - } - ScoreBracket->setRevisitLoop(true); - LLVM_DEBUG(dbgs() << "set-revisit2: Block" - << ContainingLoop->getHeader()->getNumber() << '\n';); - } - } + if (ForceEmitZeroWaitcnts) + Wait = AMDGPU::Waitcnt::allZero(); - // Update an existing waitcount, or make a new one. - unsigned Enc = AMDGPU::encodeWaitcnt(IV, - ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT], - ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT], - ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]); - // We don't remove waitcnts that existed prior to the waitcnt - // pass. Check if the waitcnt to-be-inserted can be avoided - // or if the prev waitcnt can be updated. - bool insertSWaitInst = true; - for (MachineBasicBlock::iterator I = MI.getIterator(), - B = MI.getParent()->begin(); - insertSWaitInst && I != B; --I) { - if (I == MI.getIterator()) - continue; + if (ForceEmitWaitcnt[VM_CNT]) + Wait.VmCnt = 0; + if (ForceEmitWaitcnt[EXP_CNT]) + Wait.ExpCnt = 0; + if (ForceEmitWaitcnt[LGKM_CNT]) + Wait.LgkmCnt = 0; - switch (I->getOpcode()) { - case AMDGPU::S_WAITCNT: - if (isWaitcntStronger(I->getOperand(0).getImm(), Enc)) - insertSWaitInst = false; - else if (!OldWaitcnt) { - OldWaitcnt = &*I; - Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc); - } - break; - // TODO: skip over instructions which never require wait. - } - break; - } - if (insertSWaitInst) { - if (OldWaitcnt) { - assert(OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT); - if (ForceEmitZeroWaitcnts) - LLVM_DEBUG(dbgs() - << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n"); - if (IsForceEmitWaitcnt) - LLVM_DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n"); - - OldWaitcnt->getOperand(0).setImm(Enc); - if (!OldWaitcnt->getParent()) - MI.getParent()->insert(MI, OldWaitcnt); - - LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" - << "Old Instr: " << MI << '\n' - << "New Instr: " << *OldWaitcnt << '\n'); - } else { - auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), - MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(Enc); - TrackedWaitcntSet.insert(SWaitInst); - - LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" - << "Old Instr: " << MI << '\n' - << "New Instr: " << *SWaitInst << '\n'); - } - } + ScoreBrackets.applyWaitcnt(Wait); - if (CntVal[EXP_CNT] == 0) { - ScoreBrackets->setMixedExpTypes(false); - } + AMDGPU::Waitcnt OldWait; + if (OldWaitcntInstr) { + OldWait = + AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm()); } -} + if (OldWait.dominates(Wait)) + return false; -void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB, - MachineInstr *Waitcnt) { - if (MBB.empty()) { - MBB.push_back(Waitcnt); - return; - } + if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr)) + Wait = Wait.combined(OldWait); - MachineBasicBlock::iterator It = MBB.end(); - MachineInstr *MI = &*(--It); - if (MI->isBranch()) { - MBB.insert(It, Waitcnt); + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + if (OldWaitcntInstr) { + OldWaitcntInstr->getOperand(0).setImm(Enc); + + LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *OldWaitcntInstr << '\n'); } else { - MBB.push_back(Waitcnt); + auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), + MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(Enc); + TrackedWaitcntSet.insert(SWaitInst); + + LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *SWaitInst << '\n'); } + + return true; } // This is a flat memory operation. Check to see if it has memory @@ -1282,8 +1086,8 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { return false; } -void SIInsertWaitcnts::updateEventWaitcntAfter( - MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) { +void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, + WaitcntBrackets *ScoreBrackets) { // Now look at the instruction opcode. If it is a memory access // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. @@ -1349,251 +1153,121 @@ void SIInsertWaitcnts::updateEventWaitcntAfter( } } -// Merge the score brackets of the Block's predecessors; -// this merged score bracket is used when adding waitcnts to the Block -void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { - BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); - int32_t MaxPending[NUM_INST_CNTS] = {0}; - int32_t MaxFlat[NUM_INST_CNTS] = {0}; - bool MixedExpTypes = false; - - // For single basic block loops, we need to retain the Block's - // score bracket to have accurate Pred info. So, make a copy of Block's - // score bracket, clear() it (which retains several important bits of info), - // populate, and then replace en masse. For non-single basic block loops, - // just clear Block's current score bracket and repopulate in-place. - bool IsSelfPred; - std::unique_ptr S; - - IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block)) - != Block.pred_end(); - if (IsSelfPred) { - S = llvm::make_unique(*ScoreBrackets); - ScoreBrackets = S.get(); - } - - ScoreBrackets->clear(); - - // See if there are any uninitialized predecessors. If so, emit an - // s_waitcnt 0 at the beginning of the block. - for (MachineBasicBlock *Pred : Block.predecessors()) { - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); - bool Visited = BlockVisitedSet.count(Pred); - if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { - continue; - } - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - int span = - PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T); - MaxPending[T] = std::max(MaxPending[T], span); - span = - PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T); - MaxFlat[T] = std::max(MaxFlat[T], span); - } - - MixedExpTypes |= PredScoreBrackets->mixedExpTypes(); - } - - // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK. - for (MachineBasicBlock *Pred : Block.predecessors()) { - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); - bool Visited = BlockVisitedSet.count(Pred); - if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { - continue; - } - - int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) - - PredScoreBrackets->getScoreLB(EXP_CNT); - MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan); - int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) - - PredScoreBrackets->getScoreLB(EXP_CNT); - MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan); - } - -#if 0 - // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker. - // TODO: how does LC distinguish between function entry and main entry? - // If this is the entry to a function, force a wait. - MachineBasicBlock &Entry = Block.getParent()->front(); - if (Entry.getNumber() == Block.getNumber()) { - ScoreBrackets->setWaitAtBeginning(); - return; - } -#endif - - // Now set the current Block's brackets to the largest ending bracket. - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - ScoreBrackets->setScoreUB(T, MaxPending[T]); - ScoreBrackets->setScoreLB(T, 0); - ScoreBrackets->setLastFlat(T, MaxFlat[T]); - } - - ScoreBrackets->setMixedExpTypes(MixedExpTypes); - - // Set the register scoreboard. - for (MachineBasicBlock *Pred : Block.predecessors()) { - if (!BlockVisitedSet.count(Pred)) { - continue; - } - - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); - - // Now merge the gpr_reg_score information - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - int PredLB = PredScoreBrackets->getScoreLB(T); - int PredUB = PredScoreBrackets->getScoreUB(T); - if (PredLB < PredUB) { - int PredScale = MaxPending[T] - PredUB; - // Merge vgpr scores. - for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) { - int PredRegScore = PredScoreBrackets->getRegScore(J, T); - if (PredRegScore <= PredLB) - continue; - int NewRegScore = PredScale + PredRegScore; - ScoreBrackets->setRegScore( - J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore)); - } - // Also need to merge sgpr scores for lgkm_cnt. - if (T == LGKM_CNT) { - for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) { - int PredRegScore = - PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); - if (PredRegScore <= PredLB) - continue; - int NewRegScore = PredScale + PredRegScore; - ScoreBrackets->setRegScore( - J + NUM_ALL_VGPRS, LGKM_CNT, - std::max( - ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT), - NewRegScore)); - } - } - } - } - - // Also merge the WaitEvent information. - ForAllWaitEventType(W) { - enum InstCounterType T = PredScoreBrackets->eventCounter(W); - int PredEventUB = PredScoreBrackets->getEventUB(W); - if (PredEventUB > PredScoreBrackets->getScoreLB(T)) { - int NewEventUB = - MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T); - if (NewEventUB > 0) { - ScoreBrackets->setEventUB( - W, std::max(ScoreBrackets->getEventUB(W), NewEventUB)); - } - } - } - } - - // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the - // sequencing predecessors, because changes to EXEC require waitcnts due to - // the delayed nature of these operations. - for (MachineBasicBlock *Pred : Block.predecessors()) { - if (!BlockVisitedSet.count(Pred)) { - continue; - } +bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score, + uint32_t OtherScore) { + uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; + uint32_t OtherShifted = + OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift; + Score = std::max(MyShifted, OtherShifted); + return OtherShifted > MyShifted; +} - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); - - int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK); - if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { - int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub - - PredScoreBrackets->getScoreUB(EXP_CNT); - if (new_gds_ub > 0) { - ScoreBrackets->setEventUB( - GDS_GPR_LOCK, - std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub)); - } - } - int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK); - if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { - int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub - - PredScoreBrackets->getScoreUB(EXP_CNT); - if (new_exp_ub > 0) { - ScoreBrackets->setEventUB( - EXP_GPR_LOCK, - std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub)); +/// Merge the pending events and associater score brackets of \p Other into +/// this brackets status. +/// +/// Returns whether the merge resulted in a change that requires tighter waits +/// (i.e. the merged brackets strictly dominate the original brackets). +bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { + bool StrictDom = false; + + for (auto T : inst_counter_types()) { + // Merge event flags for this counter + const bool OldOutOfOrder = counterOutOfOrder(T); + const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T]; + const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; + if (OtherEvents & ~OldEvents) + StrictDom = true; + if (Other.MixedPendingEvents[T] || + (OldEvents && OtherEvents && OldEvents != OtherEvents)) + MixedPendingEvents[T] = true; + PendingEvents |= OtherEvents; + + // Merge scores for this counter + const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T]; + const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; + MergeInfo M; + M.OldLB = ScoreLBs[T]; + M.OtherLB = Other.ScoreLBs[T]; + M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0; + M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift; + + const uint32_t NewUB = ScoreUBs[T] + M.MyShift; + if (NewUB < ScoreUBs[T]) + report_fatal_error("waitcnt score overflow"); + ScoreUBs[T] = NewUB; + ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift); + + StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]); + + bool RegStrictDom = false; + for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E; + J++) { + RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); + } + + if (T == LGKM_CNT) { + for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1; + J != E; J++) { + RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); } } - } - // if a single block loop, update the score brackets. Not needed for other - // blocks, as we did this in-place - if (IsSelfPred) { - BlockWaitcntBracketsMap[&Block] = llvm::make_unique(*ScoreBrackets); + if (RegStrictDom && !OldOutOfOrder) + StrictDom = true; } -} -/// Return true if the given basic block is a "bottom" block of a loop. -/// This works even if the loop is discontiguous. This also handles -/// multiple back-edges for the same "header" block of a loop. -bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop, - const MachineBasicBlock *Block) { - for (MachineBasicBlock *MBB : Loop->blocks()) { - if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) { - return true; - } - } - return false; -} - -/// Count the number of "bottom" basic blocks of a loop. -unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) { - unsigned Count = 0; - for (MachineBasicBlock *MBB : Loop->blocks()) { - if (MBB->isSuccessor(Loop->getHeader())) { - Count++; - } - } - return Count; + return StrictDom; } // Generate s_waitcnt instructions where needed. -void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, - MachineBasicBlock &Block) { - // Initialize the state information. - mergeInputScoreBrackets(Block); - - BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); +bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, + MachineBasicBlock &Block, + WaitcntBrackets &ScoreBrackets) { + bool Modified = false; LLVM_DEBUG({ dbgs() << "*** Block" << Block.getNumber() << " ***"; - ScoreBrackets->dump(); + ScoreBrackets.dump(); }); // Walk over the instructions. + MachineInstr *OldWaitcntInstr = nullptr; + for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end(); Iter != E;) { MachineInstr &Inst = *Iter; + // Remove any previously existing waitcnts. if (Inst.getOpcode() == AMDGPU::S_WAITCNT) { - // Leave pre-existing waitcnts, but note their existence via setWaitcnt. - // Remove the waitcnt-pass-generated waitcnts; the pass will add them back - // as needed. - if (!TrackedWaitcntSet.count(&Inst)) - ++Iter; - else { - ++Iter; - Inst.removeFromParent(); + if (OldWaitcntInstr) { + if (TrackedWaitcntSet.count(OldWaitcntInstr)) { + TrackedWaitcntSet.erase(OldWaitcntInstr); + OldWaitcntInstr->eraseFromParent(); + OldWaitcntInstr = nullptr; + } else if (!TrackedWaitcntSet.count(&Inst)) { + // Two successive s_waitcnt's, both of which are pre-existing and + // are therefore preserved. + int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); + ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); + } else { + ++Iter; + Inst.eraseFromParent(); + Modified = true; + continue; + } } - ScoreBrackets->setWaitcnt(&Inst); + + OldWaitcntInstr = &Inst; + ++Iter; continue; } bool VCCZBugWorkAround = false; if (readsVCCZ(Inst) && (!VCCZBugHandledSet.count(&Inst))) { - if (ScoreBrackets->getScoreLB(LGKM_CNT) < - ScoreBrackets->getScoreUB(LGKM_CNT) && - ScoreBrackets->hasPendingSMEM()) { + if (ScoreBrackets.getScoreLB(LGKM_CNT) < + ScoreBrackets.getScoreUB(LGKM_CNT) && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) VCCZBugWorkAround = true; } @@ -1601,9 +1275,10 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // Generate an s_waitcnt instruction to be placed before // cur_Inst, if needed. - generateWaitcntInstBefore(Inst, ScoreBrackets); + Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr); + OldWaitcntInstr = nullptr; - updateEventWaitcntAfter(Inst, ScoreBrackets); + updateEventWaitcntAfter(Inst, &ScoreBrackets); #if 0 // TODO: implement resource type check controlled by options with ub = LB. // If this instruction generates a S_SETVSKIP because it is an @@ -1616,11 +1291,9 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } #endif - ScoreBrackets->clearWaitcnt(); - LLVM_DEBUG({ Inst.print(dbgs()); - ScoreBrackets->dump(); + ScoreBrackets.dump(); }); // Check to see if this is a GWS instruction. If so, and if this is CI or @@ -1632,10 +1305,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P || Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) { // TODO: && context->target_info->GwsRequiresMemViolTest() ) { - ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - ScoreBrackets->updateByWait(LGKM_CNT, - ScoreBrackets->getScoreUB(LGKM_CNT)); + ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero()); } // TODO: Remove this work-around after fixing the scheduler and enable the @@ -1648,71 +1318,13 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, AMDGPU::VCC) .addReg(AMDGPU::VCC); VCCZBugHandledSet.insert(&Inst); + Modified = true; } ++Iter; } - // Check if we need to force convergence at loop footer. - MachineLoop *ContainingLoop = MLI->getLoopFor(&Block); - if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) { - LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); - WaitcntData->print(); - LLVM_DEBUG(dbgs() << '\n';); - - // The iterative waitcnt insertion algorithm aims for optimal waitcnt - // placement, but doesn't guarantee convergence for a loop. Each - // loop should take at most (n+1) iterations for it to converge naturally, - // where n is the number of bottom blocks. If this threshold is reached and - // the result hasn't converged, then we force convergence by inserting - // a s_waitcnt at the end of loop footer. - if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) { - // To ensure convergence, need to make wait events at loop footer be no - // more than those from the previous iteration. - // As a simplification, instead of tracking individual scores and - // generating the precise wait count, just wait on 0. - bool HasPending = false; - MachineInstr *SWaitInst = WaitcntData->getWaitcnt(); - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { - ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - HasPending = true; - break; - } - } - - if (HasPending) { - if (!SWaitInst) { - SWaitInst = BuildMI(Block, Block.getFirstNonPHI(), - DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); - TrackedWaitcntSet.insert(SWaitInst); -#if 0 // TODO: Format the debug output - OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context); - OutputTransformAdd(SWaitInst, context); -#endif - } -#if 0 // TODO: ?? - _DEV( REPORTED_STATS->force_waitcnt_converge = 1; ) -#endif - } - - if (SWaitInst) { - LLVM_DEBUG({ - SWaitInst->print(dbgs()); - dbgs() << "\nAdjusted score board:"; - ScoreBrackets->dump(); - }); - - // Add this waitcnt to the block. It is either newly created or - // created in previous iterations and added back since block traversal - // always removes waitcnts. - insertWaitcntBeforeCF(Block, SWaitInst); - WaitcntData->setWaitcnt(SWaitInst); - } - } - } + return Modified; } bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { @@ -1720,13 +1332,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); - MLI = &getAnalysis(); IV = AMDGPU::getIsaVersion(ST->getCPU()); const SIMachineFunctionInfo *MFI = MF.getInfo(); ForceEmitZeroWaitcnts = ForceEmitZeroFlag; - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) + for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); @@ -1746,93 +1356,70 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1; TrackedWaitcntSet.clear(); - BlockVisitedSet.clear(); VCCZBugHandledSet.clear(); - LoopWaitcntDataMap.clear(); - BlockWaitcntProcessedSet.clear(); + RpotIdxMap.clear(); + BlockInfos.clear(); + + // Keep iterating over the blocks in reverse post order, inserting and + // updating s_waitcnt where needed, until a fix point is reached. + for (MachineBasicBlock *MBB : + ReversePostOrderTraversal(&MF)) { + RpotIdxMap[MBB] = BlockInfos.size(); + BlockInfos.emplace_back(MBB); + } - // Walk over the blocks in reverse post order, inserting - // s_waitcnt where needed. - ReversePostOrderTraversal RPOT(&MF); + std::unique_ptr Brackets; bool Modified = false; - for (ReversePostOrderTraversal::rpo_iterator - I = RPOT.begin(), - E = RPOT.end(), J = RPOT.begin(); - I != E;) { - MachineBasicBlock &MBB = **I; - - BlockVisitedSet.insert(&MBB); - - BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); - if (!ScoreBrackets) { - BlockWaitcntBracketsMap[&MBB] = llvm::make_unique(ST); - ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); - } - ScoreBrackets->setPostOrder(MBB.getNumber()); - MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB); - if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr) - LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique(); - - // If we are walking into the block from before the loop, then guarantee - // at least 1 re-walk over the loop to propagate the information, even if - // no S_WAITCNT instructions were generated. - if (ContainingLoop && ContainingLoop->getHeader() == &MBB) { - unsigned Count = countNumBottomBlocks(ContainingLoop); - - // If the loop has multiple back-edges, and so more than one "bottom" - // basic block, we have to guarantee a re-walk over every blocks. - if ((std::count(BlockWaitcntProcessedSet.begin(), - BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) { - BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); - LLVM_DEBUG(dbgs() << "set-revisit1: Block" - << ContainingLoop->getHeader()->getNumber() << '\n';); + bool Repeat; + do { + Repeat = false; + + for (BlockInfo &BI : BlockInfos) { + if (!BI.Dirty) + continue; + + unsigned Idx = std::distance(&*BlockInfos.begin(), &BI); + + if (BI.Incoming) { + if (!Brackets) + Brackets = llvm::make_unique(*BI.Incoming); + else + *Brackets = *BI.Incoming; + } else { + if (!Brackets) + Brackets = llvm::make_unique(ST); + else + Brackets->clear(); } - } - // Walk over the instructions. - insertWaitcntInBlock(MF, MBB); - - // Record that waitcnts have been processed at least once for this block. - BlockWaitcntProcessedSet.push_back(&MBB); - - // See if we want to revisit the loop. If a loop has multiple back-edges, - // we shouldn't revisit the same "bottom" basic block. - if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) && - std::count(BlockWaitcntProcessedSet.begin(), - BlockWaitcntProcessedSet.end(), &MBB) == 1) { - MachineBasicBlock *EntryBB = ContainingLoop->getHeader(); - BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get(); - if (EntrySB && EntrySB->getRevisitLoop()) { - EntrySB->setRevisitLoop(false); - J = I; - int32_t PostOrder = EntrySB->getPostOrder(); - // TODO: Avoid this loop. Find another way to set I. - for (ReversePostOrderTraversal::rpo_iterator - X = RPOT.begin(), - Y = RPOT.end(); - X != Y; ++X) { - MachineBasicBlock &MBBX = **X; - if (MBBX.getNumber() == PostOrder) { - I = X; - break; + Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets); + BI.Dirty = false; + + if (Brackets->hasPending()) { + BlockInfo *MoveBracketsToSucc = nullptr; + for (MachineBasicBlock *Succ : BI.MBB->successors()) { + unsigned SuccIdx = RpotIdxMap[Succ]; + BlockInfo &SuccBI = BlockInfos[SuccIdx]; + if (!SuccBI.Incoming) { + SuccBI.Dirty = true; + if (SuccIdx <= Idx) + Repeat = true; + if (!MoveBracketsToSucc) { + MoveBracketsToSucc = &SuccBI; + } else { + SuccBI.Incoming = llvm::make_unique(*Brackets); + } + } else if (SuccBI.Incoming->merge(*Brackets)) { + SuccBI.Dirty = true; + if (SuccIdx <= Idx) + Repeat = true; } } - LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); - WaitcntData->incIterCnt(); - LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';); - continue; - } else { - LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); - // Loop converged, reset iteration count. If this loop gets revisited, - // it must be from an outer loop, the counter will restart, this will - // ensure we don't force convergence on such revisits. - WaitcntData->resetIterCnt(); + if (MoveBracketsToSucc) + MoveBracketsToSucc->Incoming = std::move(Brackets); } } - - J = I; - ++I; - } + } while (Repeat); SmallVector EndPgmBlocks; diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index b73d30940fc38edb933098b7d2b44b6f1268e40f..65ffc27b8b6084068c8b5f0da64ed079f98a5b91 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -121,6 +121,10 @@ class InstSI DisableSIDecoder = 0; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 562428ef37c011a2badf1df50d485186b0ed46e8..b7c4eed621124654b9a6de824f9a9e6eb0fe44f0 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -265,9 +265,10 @@ static bool isStride64(unsigned Opc) { } } -bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, - int64_t &Offset, - const TargetRegisterInfo *TRI) const { +bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, + MachineOperand *&BaseOp, + int64_t &Offset, + const TargetRegisterInfo *TRI) const { unsigned Opc = LdSt.getOpcode(); if (isDS(LdSt)) { @@ -275,11 +276,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, getNamedOperand(LdSt, AMDGPU::OpName::offset); if (OffsetImm) { // Normal, single offset LDS instruction. - const MachineOperand *AddrReg = - getNamedOperand(LdSt, AMDGPU::OpName::addr); - - BaseReg = AddrReg->getReg(); + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); Offset = OffsetImm->getImm(); + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } @@ -310,10 +310,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, if (isStride64(Opc)) EltSize *= 64; - const MachineOperand *AddrReg = - getNamedOperand(LdSt, AMDGPU::OpName::addr); - BaseReg = AddrReg->getReg(); + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); Offset = EltSize * Offset0; + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } @@ -325,19 +325,20 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, if (SOffset && SOffset->isReg()) return false; - const MachineOperand *AddrReg = - getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (!AddrReg) return false; const MachineOperand *OffsetImm = getNamedOperand(LdSt, AMDGPU::OpName::offset); - BaseReg = AddrReg->getReg(); + BaseOp = AddrReg; Offset = OffsetImm->getImm(); if (SOffset) // soffset can be an inline immediate. Offset += SOffset->getImm(); + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } @@ -347,36 +348,46 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, if (!OffsetImm) return false; - const MachineOperand *SBaseReg = - getNamedOperand(LdSt, AMDGPU::OpName::sbase); - BaseReg = SBaseReg->getReg(); + MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase); + BaseOp = SBaseReg; Offset = OffsetImm->getImm(); + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } if (isFLAT(LdSt)) { - const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (VAddr) { // Can't analyze 2 offsets. if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) return false; - BaseReg = VAddr->getReg(); + BaseOp = VAddr; } else { // scratch instructions have either vaddr or saddr. - BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg(); + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); } Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } return false; } -static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1, - const MachineInstr &MI2, unsigned BaseReg2) { - if (BaseReg1 == BaseReg2) +static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, + const MachineOperand &BaseOp1, + const MachineInstr &MI2, + const MachineOperand &BaseOp2) { + // Support only base operands with base registers. + // Note: this could be extended to support FI operands. + if (!BaseOp1.isReg() || !BaseOp2.isReg()) + return false; + + if (BaseOp1.isIdenticalTo(BaseOp2)) return true; if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) @@ -402,12 +413,13 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1, return Base1 == Base2; } -bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, - unsigned BaseReg1, - MachineInstr &SecondLdSt, - unsigned BaseReg2, +bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1, + MachineOperand &BaseOp2, unsigned NumLoads) const { - if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2)) + MachineInstr &FirstLdSt = *BaseOp1.getParent(); + MachineInstr &SecondLdSt = *BaseOp2.getParent(); + + if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2)) return false; const MachineOperand *FirstDst = nullptr; @@ -864,7 +876,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); + const DebugLoc &DL = MBB.findDebugLoc(MI); unsigned Size = FrameInfo.getObjectSize(FrameIndex); unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); @@ -963,9 +975,9 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); - const SIMachineFunctionInfo *MFI = MF->getInfo(); + SIMachineFunctionInfo *MFI = MF->getInfo(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); + const DebugLoc &DL = MBB.findDebugLoc(MI); unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); unsigned Size = FrameInfo.getObjectSize(FrameIndex); unsigned SpillSize = TRI->getSpillSize(*RC); @@ -977,6 +989,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, PtrInfo, MachineMemOperand::MOLoad, Size, Align); if (RI.isSGPRClass(RC)) { + MFI->setHasSpilledSGPRs(); + // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); @@ -1018,7 +1032,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); const GCNSubtarget &ST = MF->getSubtarget(); - DebugLoc DL = MBB.findDebugLoc(MI); + const DebugLoc &DL = MBB.findDebugLoc(MI); unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); @@ -1026,7 +1040,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( if (!MFI->hasCalculatedTID()) { MachineBasicBlock &Entry = MBB.getParent()->front(); MachineBasicBlock::iterator Insert = Entry.front(); - DebugLoc DL = Insert->getDebugLoc(); + const DebugLoc &DL = Insert->getDebugLoc(); TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, *MF); @@ -1632,7 +1646,34 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, SmallVectorImpl &Cond, bool AllowModify) const { MachineBasicBlock::iterator I = MBB.getFirstTerminator(); - if (I == MBB.end()) + auto E = MBB.end(); + if (I == E) + return false; + + // Skip over the instructions that are artificially terminators for special + // exec management. + while (I != E && !I->isBranch() && !I->isReturn() && + I->getOpcode() != AMDGPU::SI_MASK_BRANCH) { + switch (I->getOpcode()) { + case AMDGPU::SI_MASK_BRANCH: + case AMDGPU::S_MOV_B64_term: + case AMDGPU::S_XOR_B64_term: + case AMDGPU::S_ANDN2_B64_term: + break; + case AMDGPU::SI_IF: + case AMDGPU::SI_ELSE: + case AMDGPU::SI_KILL_I1_TERMINATOR: + case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + // FIXME: It's messy that these need to be considered here at all. + return true; + default: + llvm_unreachable("unexpected non-branch terminator inst"); + } + + ++I; + } + + if (I == E) return false; if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) @@ -2133,11 +2174,13 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA, bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const { - unsigned BaseReg0, BaseReg1; + MachineOperand *BaseOp0, *BaseOp1; int64_t Offset0, Offset1; - if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && - getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { + if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) && + getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) { + if (!BaseOp0->isIdenticalTo(*BaseOp1)) + return false; if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { // FIXME: Handle ds_read2 / ds_write2. @@ -2145,8 +2188,7 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, } unsigned Width0 = (*MIa.memoperands_begin())->getSize(); unsigned Width1 = (*MIb.memoperands_begin())->getSize(); - if (BaseReg0 == BaseReg1 && - offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { + if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { return true; } } @@ -2584,6 +2626,10 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) return false; + // Can it be shrunk to a valid 32 bit opcode? + if (!hasVALU32BitEncoding(MI.getOpcode())) + return false; + // Check output modifiers return !hasModifiersSet(MI, AMDGPU::OpName::omod) && !hasModifiersSet(MI, AMDGPU::OpName::clamp); @@ -3121,6 +3167,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; + case AMDGPU::S_XNOR_B32: + return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; @@ -3558,8 +3606,13 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, // pointer value is uniform. MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { - unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); - SBase->setReg(SGPR); + unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); + SBase->setReg(SGPR); + } + MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); + if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { + unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); + SOff->setReg(SGPR); } } @@ -4088,22 +4141,50 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, // Default handling break; case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64, MDT); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); Inst.eraseFromParent(); continue; case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64, MDT); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); Inst.eraseFromParent(); continue; case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64, MDT); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_NAND_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_NOR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_XNOR_B64: + if (ST.hasDLInsts()) + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); + else + splitScalar64BitXnor(Worklist, Inst, MDT); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_ANDN2_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_ORN2_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); Inst.eraseFromParent(); continue; case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); Inst.eraseFromParent(); continue; @@ -4184,120 +4265,26 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, Inst.eraseFromParent(); continue; - case AMDGPU::S_XNOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); + case AMDGPU::S_NAND_B32: + splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); Inst.eraseFromParent(); continue; - case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: { - unsigned VDst; - unsigned NewOpcode; - - switch(Opcode) { - case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: - NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN; - VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - break; - case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: - NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; - VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - break; - case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: - NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; - VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass); - break; - case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: - splitScalarBuffer(Worklist, Inst); - Inst.eraseFromParent(); - continue; - } - - const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff); - auto Add = MRI.getUniqueVRegDef(VAddr->getReg()); - unsigned Offset = 0; - - // FIXME: This isn't safe because the addressing mode doesn't work - // correctly if vaddr is negative. - // - // FIXME: Should probably be done somewhere else, maybe SIFoldOperands. - // - // See if we can extract an immediate offset by recognizing one of these: - // V_ADD_I32_e32 dst, imm, src1 - // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1 - // V_ADD will be removed by "Remove dead machine instructions". - if (Add && - (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 || - Add->getOpcode() == AMDGPU::V_ADD_U32_e32 || - Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) { - static const unsigned SrcNames[2] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - }; - - // Find a literal offset in one of source operands. - for (int i = 0; i < 2; i++) { - const MachineOperand *Src = - getNamedOperand(*Add, SrcNames[i]); - - if (Src->isReg()) { - MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg()); - if (Def) { - if (Def->isMoveImmediate()) - Src = &Def->getOperand(1); - else if (Def->isCopy()) { - auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); - if (Mov && Mov->isMoveImmediate()) { - Src = &Mov->getOperand(1); - } - } - } - } - - if (Src) { - if (Src->isImm()) - Offset = Src->getImm(); - else if (Src->isCImm()) - Offset = Src->getCImm()->getZExtValue(); - } - - if (Offset && isLegalMUBUFImmOffset(Offset)) { - VAddr = getNamedOperand(*Add, SrcNames[!i]); - break; - } - - Offset = 0; - } - } + case AMDGPU::S_NOR_B32: + splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); + Inst.eraseFromParent(); + continue; - MachineInstr *NewInstr = - BuildMI(*MBB, Inst, Inst.getDebugLoc(), - get(NewOpcode), VDst) - .add(*VAddr) // vaddr - .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc - .addImm(0) // soffset - .addImm(Offset) // offset - .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm()) - .addImm(0) // slc - .addImm(0) // tfe - .cloneMemRefs(Inst) - .getInstr(); - - MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(), - VDst); - addUsersToMoveToVALUWorklist(VDst, MRI, Worklist); + case AMDGPU::S_ANDN2_B32: + splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); Inst.eraseFromParent(); + continue; - // Legalize all operands other than the offset. Notably, convert the srsrc - // into SGPRs using v_readfirstlane if needed. - legalizeOperands(*NewInstr, MDT); + case AMDGPU::S_ORN2_B32: + splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); + Inst.eraseFromParent(); continue; } - } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to @@ -4471,23 +4458,116 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); - legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); - legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); - - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); if (ST.hasDLInsts()) { + unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); + legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); + BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) .add(Src0) .add(Src1); + + MRI.replaceRegWith(Dest.getReg(), NewDest); + addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } else { - unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor) - .add(Src0) + // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can + // invert either source and then perform the XOR. If either source is a + // scalar register, then we can leave the inversion on the scalar unit to + // acheive a better distrubution of scalar and vector instructions. + bool Src0IsSGPR = Src0.isReg() && + RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); + bool Src1IsSGPR = Src1.isReg() && + RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); + MachineInstr *Not = nullptr; + MachineInstr *Xor = nullptr; + unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + // Build a pair of scalar instructions and add them to the work list. + // The next iteration over the work list will lower these to the vector + // unit as necessary. + if (Src0IsSGPR) { + Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp) + .add(Src0); + Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) + .addReg(Temp) .add(Src1); + } else if (Src1IsSGPR) { + Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp) + .add(Src1); + Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) + .add(Src0) + .addReg(Temp); + } else { + Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) + .add(Src0) + .add(Src1); + Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) + .addReg(Temp); + Worklist.insert(Not); + } + + MRI.replaceRegWith(Dest.getReg(), NewDest); + + Worklist.insert(Xor); - BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest) - .addReg(Xor); + addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } +} + +void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, + MachineInstr &Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + const DebugLoc &DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + + unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) + .add(Src0) + .add(Src1); + + MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) + .addReg(Interm); + + Worklist.insert(&Op); + Worklist.insert(&Not); + + MRI.replaceRegWith(Dest.getReg(), NewDest); + addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); +} + +void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, + MachineInstr &Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + const DebugLoc &DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + + unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) + .add(Src1); + + MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) + .add(Src0) + .addReg(Interm); + + Worklist.insert(&Not); + Worklist.insert(&Op); MRI.replaceRegWith(Dest.getReg(), NewDest); addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); @@ -4520,13 +4600,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp( const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); - BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); + MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); - BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); + MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) @@ -4537,6 +4617,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp( MRI.replaceRegWith(Dest.getReg(), FullDestReg); + Worklist.insert(&LoHalf); + Worklist.insert(&HiHalf); + // We don't need to legalizeOperands here because for a single operand, src0 // will support any kind of input. @@ -4642,6 +4725,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, AMDGPU::sub0, Src0SubRC); MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub1, Src1SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); @@ -4652,11 +4739,6 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, .add(SrcReg0Sub0) .add(SrcReg1Sub0); - MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub1, Src0SubRC); - MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, - AMDGPU::sub1, Src1SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) .add(SrcReg0Sub1) @@ -4671,22 +4753,62 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, MRI.replaceRegWith(Dest.getReg(), FullDestReg); - // Try to legalize the operands in case we need to swap the order to keep it - // valid. - legalizeOperands(LoHalf, MDT); - legalizeOperands(HiHalf, MDT); + Worklist.insert(&LoHalf); + Worklist.insert(&HiHalf); // Move all users of this moved vlaue. addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } +void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, + MachineInstr &Inst, + MachineDominatorTree *MDT) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + const DebugLoc &DL = Inst.getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + + unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + MachineOperand* Op0; + MachineOperand* Op1; + + if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { + Op0 = &Src0; + Op1 = &Src1; + } else { + Op0 = &Src1; + Op1 = &Src0; + } + + BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) + .add(*Op0); + + unsigned NewDest = MRI.createVirtualRegister(DestRC); + + MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) + .addReg(Interm) + .add(*Op1); + + MRI.replaceRegWith(Dest.getReg(), NewDest); + + Worklist.insert(&Xor); +} + void SIInstrInfo::splitScalar64BitBCNT( SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src = Inst.getOperand(1); @@ -4722,7 +4844,7 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); uint32_t Imm = Inst.getOperand(2).getImm(); @@ -4778,73 +4900,6 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist, - MachineInstr &Inst) const { - MachineBasicBlock &MBB = *Inst.getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - MachineBasicBlock::iterator MII = Inst; - auto &DL = Inst.getDebugLoc(); - - MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);; - MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase); - MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff); - MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc); - - unsigned Opcode = Inst.getOpcode(); - unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; - unsigned Count = 0; - const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); - - switch(Opcode) { - default: - return; - case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: - Count = 2; - break; - case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: - Count = 4; - break; - } - - // FIXME: Should also attempt to build VAddr and Offset like the non-split - // case (see call site for this function) - - // Create a vector of result registers - SmallVector ResultRegs; - for (unsigned i = 0; i < Count ; ++i) { - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass); - MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg) - .addReg(Offset.getReg()) // offset - .addReg(Rsrc.getReg()) // rsrc - .addImm(0) // soffset - .addImm(i << 4) // inst_offset - .addImm(Glc.getImm()) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addMemOperand(*Inst.memoperands_begin()); - // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE - auto &NewDestOp = NewMI.getOperand(0); - for (unsigned i = 0 ; i < 4 ; i++) - ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass, - RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass)); - } - // Create a new combined result to replace original with - unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); - MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL, - get(TargetOpcode::REG_SEQUENCE), FullDestReg); - - for (unsigned i = 0 ; i < Count * 4 ; ++i) { - CombinedResBuilder - .addReg(ResultRegs[i]) - .addImm(RI.getSubRegFromChannel(i)); - } - - MRI.replaceRegWith(Dest.getReg(), FullDestReg); - addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); -} - void SIInstrInfo::addUsersToMoveToVALUWorklist( unsigned DstReg, MachineRegisterInfo &MRI, @@ -4934,10 +4989,10 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist( make_range(MachineBasicBlock::iterator(SCCDefInst), SCCDefInst.getParent()->end())) { // Exit if we find another SCC def. - if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) + if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) return; - if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) + if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) Worklist.insert(&MI); } } @@ -5455,3 +5510,84 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { return MCOp; } + +static +TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { + assert(RegOpnd.isReg()); + return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() : + getRegSubRegPair(RegOpnd); +} + +TargetInstrInfo::RegSubRegPair +llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { + assert(MI.isRegSequence()); + for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) + if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { + auto &RegOp = MI.getOperand(1 + 2 * I); + return getRegOrUndef(RegOp); + } + return TargetInstrInfo::RegSubRegPair(); +} + +// Try to find the definition of reg:subreg in subreg-manipulation pseudos +// Following a subreg of reg:subreg isn't supported +static bool followSubRegDef(MachineInstr &MI, + TargetInstrInfo::RegSubRegPair &RSR) { + if (!RSR.SubReg) + return false; + switch (MI.getOpcode()) { + default: break; + case AMDGPU::REG_SEQUENCE: + RSR = getRegSequenceSubReg(MI, RSR.SubReg); + return true; + // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg + case AMDGPU::INSERT_SUBREG: + if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) + // inserted the subreg we're looking for + RSR = getRegOrUndef(MI.getOperand(2)); + else { // the subreg in the rest of the reg + auto R1 = getRegOrUndef(MI.getOperand(1)); + if (R1.SubReg) // subreg of subreg isn't supported + return false; + RSR.Reg = R1.Reg; + } + return true; + } + return false; +} + +MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, + MachineRegisterInfo &MRI) { + assert(MRI.isSSA()); + if (!TargetRegisterInfo::isVirtualRegister(P.Reg)) + return nullptr; + + auto RSR = P; + auto *DefInst = MRI.getVRegDef(RSR.Reg); + while (auto *MI = DefInst) { + DefInst = nullptr; + switch (MI->getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::V_MOV_B32_e32: { + auto &Op1 = MI->getOperand(1); + if (Op1.isReg() && + TargetRegisterInfo::isVirtualRegister(Op1.getReg())) { + if (Op1.isUndef()) + return nullptr; + RSR = getRegSubRegPair(Op1); + DefInst = MRI.getVRegDef(RSR.Reg); + } + break; + } + default: + if (followSubRegDef(*MI, RSR)) { + if (!RSR.Reg) + return nullptr; + DefInst = MRI.getVRegDef(RSR.Reg); + } + } + if (!DefInst) + return MI; + } + return nullptr; +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 2f51b19995035e4f063fc1325396949ff6d370eb..5b1a05f3785ec722553790ae753668bee63b8f31 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -89,6 +89,14 @@ private: void lowerScalarXnor(SetVectorType &Worklist, MachineInstr &Inst) const; + void splitScalarNotBinop(SetVectorType &Worklist, + MachineInstr &Inst, + unsigned Opcode) const; + + void splitScalarBinOpN2(SetVectorType &Worklist, + MachineInstr &Inst, + unsigned Opcode) const; + void splitScalar64BitUnaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; @@ -99,12 +107,13 @@ private: unsigned Opcode, MachineDominatorTree *MDT = nullptr) const; + void splitScalar64BitXnor(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT = nullptr) const; + void splitScalar64BitBCNT(SetVectorType &Worklist, MachineInstr &Inst) const; void splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const; - void splitScalarBuffer(SetVectorType &Worklist, - MachineInstr &Inst) const; void movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const; @@ -164,12 +173,11 @@ public: int64_t &Offset1, int64_t &Offset2) const override; - bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, - int64_t &Offset, - const TargetRegisterInfo *TRI) const final; + bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp, + int64_t &Offset, + const TargetRegisterInfo *TRI) const final; - bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1, - MachineInstr &SecondLdSt, unsigned BaseReg2, + bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2, unsigned NumLoads) const override; bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, @@ -596,6 +604,14 @@ public: return MI.getDesc().TSFlags & ClampFlags; } + static bool usesFPDPRounding(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding; + } + + bool usesFPDPRounding(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding; + } + bool isVGPRCopy(const MachineInstr &MI) const { assert(MI.isCopy()); unsigned Dest = MI.getOperand(0).getReg(); @@ -910,9 +926,36 @@ public: /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. int pseudoToMCOpcode(int Opcode) const; - }; +/// \brief Returns true if a reg:subreg pair P has a TRC class +inline bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P, + const TargetRegisterClass &TRC, + MachineRegisterInfo &MRI) { + auto *RC = MRI.getRegClass(P.Reg); + if (!P.SubReg) + return RC == &TRC; + auto *TRI = MRI.getTargetRegisterInfo(); + return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg); +} + +/// \brief Create RegSubRegPair from a register MachineOperand +inline +TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O) { + assert(O.isReg()); + return TargetInstrInfo::RegSubRegPair(O.getReg(), O.getSubReg()); +} + +/// \brief Return the SubReg component from REG_SEQUENCE +TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, + unsigned SubReg); + +/// \brief Return the defining instruction for a given reg:subreg pair +/// skipping copy like instructions and subreg-manipulation pseudos. +/// Following another subreg of a reg:subreg isn't supported. +MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, + MachineRegisterInfo &MRI); + namespace AMDGPU { LLVM_READONLY @@ -924,6 +967,9 @@ namespace AMDGPU { LLVM_READONLY int getSDWAOp(uint16_t Opcode); + LLVM_READONLY + int getDPPOp32(uint16_t Opcode); + LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode); @@ -954,6 +1000,9 @@ namespace AMDGPU { LLVM_READONLY int getSOPKOp(uint16_t Opcode); + LLVM_READONLY + int getGlobalSaddrOp(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 0859989b03963018ff2f20c4a4beda01cc94d910..13afa4d4974bf18f1b592f51e0e61a452c63ce1a 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -1622,7 +1622,7 @@ class getHasExt { + bit ret = !if(!eq(NumSrcArgs, 0), 0, + getHasExt.ret); +} + class BitOr { bit ret = !if(a, 1, !if(b, 1, 0)); } @@ -1710,7 +1716,7 @@ class VOPProfile _ArgVT> { field bit HasSDWAOMod = isFloatType.ret; field bit HasExt = getHasExt.ret; - field bit HasExtDPP = HasExt; + field bit HasExtDPP = getHasDPP.ret; field bit HasExtSDWA = HasExt; field bit HasExtSDWA9 = HasExt; field int NeedPatGen = PatGenMode.NoPattern; @@ -1741,8 +1747,10 @@ class VOPProfile _ArgVT> { getOpSelMod.ret, getOpSelMod.ret, getOpSelMod.ret>.ret; - field dag InsDPP = getInsDPP.ret; + field dag InsDPP = !if(HasExtDPP, + getInsDPP.ret, + (ins)); field dag InsSDWA = getInsSDWA.ret; @@ -1756,7 +1764,8 @@ class VOPProfile _ArgVT> { HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret; - field string AsmDPP = getAsmDPP.ret; + field string AsmDPP = !if(HasExtDPP, + getAsmDPP.ret, ""); field string AsmSDWA = getAsmSDWA.ret; field string AsmSDWA9 = getAsmSDWA9.ret; } @@ -1931,6 +1940,15 @@ def getBasicFromSDWAOp : InstrMapping { let ValueCols = [["Default"]]; } +// Maps ordinary instructions to their DPP counterparts +def getDPPOp32 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["Default"]; + let ValueCols = [["DPP"]]; +} + // Maps an commuted opcode to its original version def getCommuteOrig : InstrMapping { let FilterClass = "Commutable_REV"; @@ -2017,6 +2035,15 @@ def getAtomicNoRetOp : InstrMapping { let ValueCols = [["0"]]; } +// Maps a GLOBAL to its SADDR form. +def getGlobalSaddrOp : InstrMapping { + let FilterClass = "GlobalSaddrTable"; + let RowFields = ["SaddrOp"]; + let ColFields = ["IsSaddr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 9714203d3d7053d7d0b3aa838d02e1e1cef47c13..5f78a10727503525fd54b3f06fb0c6a1486da9b1 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -247,7 +247,7 @@ def SI_LOOP : CFPseudoInstSI < (outs), (ins SReg_64:$saved, brtarget:$target), [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> { let Size = 8; - let isBranch = 0; + let isBranch = 1; let hasSideEffects = 1; } @@ -307,6 +307,7 @@ def SI_ILLEGAL_COPY : SPseudoInstSI < def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> { let isTerminator = 1; let usesCustomInserter = 1; + let isBranch = 1; } def SI_PS_LIVE : PseudoInstSI < @@ -579,7 +580,8 @@ def : Pat < (int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm:$imm, cond:$cond))), (SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) >; -// TODO: we could add more variants for other types of conditionals + + // TODO: we could add more variants for other types of conditionals //===----------------------------------------------------------------------===// // VOP1 Patterns @@ -1535,15 +1537,15 @@ def : GCNPat < } // End SubtargetPredicate = HasVOP3PInsts -// def : GCNPat < -// (v2f16 (scalar_to_vector f16:$src0)), -// (COPY $src0) -// >; +def : GCNPat < + (v2f16 (scalar_to_vector f16:$src0)), + (COPY $src0) +>; -// def : GCNPat < -// (v2i16 (scalar_to_vector i16:$src0)), -// (COPY $src0) -// >; +def : GCNPat < + (v2i16 (scalar_to_vector i16:$src0)), + (COPY $src0) +>; def : GCNPat < (v4i16 (scalar_to_vector i16:$src0)), @@ -1621,8 +1623,8 @@ defm : BFMPatterns ; defm : BFEPattern ; defm : SHA256MaPattern ; -def : IntMed3Pat; -def : IntMed3Pat; +defm : IntMed3Pat; +defm : IntMed3Pat; } @@ -1649,20 +1651,33 @@ class FP16Med3Pat; -class Int16Med3Pat : GCNPat< + ValueType vt = i16> { + // This matches 16 permutations of + // max(min(x, y), min(max(x, y), z)) + def : GCNPat < (max (min_oneuse vt:$src0, vt:$src1), (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) >; + // This matches 16 permutations of + // min(max(a, b), max(min(a, b), c)) + def : GCNPat < + (min (max_oneuse vt:$src0, vt:$src1), + (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)), + (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) +>; +} + def : FPMed3Pat; let OtherPredicates = [isGFX9] in { def : FP16Med3Pat; -def : Int16Med3Pat; -def : Int16Med3Pat; +defm : Int16Med3Pat; +defm : Int16Med3Pat; } // End Predicates = [isGFX9] diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td index 7b7cf1635050bdf18f7129593d8c52b684b93132..e51ff4b4bc50efd42aa9eb5a9e99a4b69a7d39ee 100644 --- a/lib/Target/AMDGPU/SIIntrinsics.td +++ b/lib/Target/AMDGPU/SIIntrinsics.td @@ -16,36 +16,4 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed - def int_SI_tbuffer_store : Intrinsic < - [], - [llvm_anyint_ty, // rsrc(SGPR) - llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32 - llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW - llvm_i32_ty, // vaddr(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // inst_offset(imm) - llvm_i32_ty, // dfmt(imm) - llvm_i32_ty, // nfmt(imm) - llvm_i32_ty, // offen(imm) - llvm_i32_ty, // idxen(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty], // tfe(imm) - []>; - - // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed - def int_SI_buffer_load_dword : Intrinsic < - [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32 - [llvm_anyint_ty, // rsrc(SGPR) - llvm_anyint_ty, // vaddr(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // inst_offset(imm) - llvm_i32_ty, // offen(imm) - llvm_i32_ty, // idxen(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty], // tfe(imm) - [IntrReadMem, IntrArgMemOnly]>; - } // End TargetPrefix = "SI", isTarget = 1 diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index e379e98623a2000b36c2b1ed88280942cd253a2c..52bbe5c03450b0137d8d8fab666f0abc3b13bcaf 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -20,6 +20,26 @@ // ==> // s_buffer_load_dwordx2 s[4:5], s[0:3], 4 // +// This pass also tries to promote constant offset to the immediate by +// adjusting the base. It tries to use a base from the nearby instructions that +// allows it to have a 13bit constant offset and then promotes the 13bit offset +// to the immediate. +// E.g. +// s_movk_i32 s0, 0x1800 +// v_add_co_u32_e32 v0, vcc, s0, v2 +// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +// +// s_movk_i32 s0, 0x1000 +// v_add_co_u32_e32 v5, vcc, s0, v2 +// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +// global_load_dwordx2 v[5:6], v[5:6], off +// global_load_dwordx2 v[0:1], v[0:1], off +// => +// s_movk_i32 s0, 0x1000 +// v_add_co_u32_e32 v5, vcc, s0, v2 +// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +// global_load_dwordx2 v[5:6], v[5:6], off +// global_load_dwordx2 v[0:1], v[5:6], off offset:2048 // // Future improvements: // @@ -43,9 +63,9 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" @@ -74,23 +94,38 @@ using namespace llvm; #define DEBUG_TYPE "si-load-store-opt" namespace { +enum InstClassEnum { + UNKNOWN, + DS_READ, + DS_WRITE, + S_BUFFER_LOAD_IMM, + BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN, + BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN, + BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET, + BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact, + BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact, + BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact, + BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact, +}; -class SILoadStoreOptimizer : public MachineFunctionPass { - enum InstClassEnum { - DS_READ_WRITE, - S_BUFFER_LOAD_IMM, - BUFFER_LOAD_OFFEN, - BUFFER_LOAD_OFFSET, - BUFFER_STORE_OFFEN, - BUFFER_STORE_OFFSET, - }; +enum RegisterEnum { + SBASE = 0x1, + SRSRC = 0x2, + SOFFSET = 0x4, + VADDR = 0x8, + ADDR = 0x10, +}; +class SILoadStoreOptimizer : public MachineFunctionPass { struct CombineInfo { MachineBasicBlock::iterator I; MachineBasicBlock::iterator Paired; unsigned EltSize; unsigned Offset0; unsigned Offset1; + unsigned Width0; + unsigned Width1; unsigned BaseOff; InstClassEnum InstClass; bool GLC0; @@ -98,9 +133,23 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool SLC0; bool SLC1; bool UseST64; - bool IsX2; - SmallVector InstsToMove; - }; + SmallVector InstsToMove; + }; + + struct BaseRegisters { + unsigned LoReg = 0; + unsigned HiReg = 0; + + unsigned LoSubReg = 0; + unsigned HiSubReg = 0; + }; + + struct MemAddress { + BaseRegisters Base; + int64_t Offset = 0; + }; + + using MemInfoMap = DenseMap; private: const GCNSubtarget *STM = nullptr; @@ -108,9 +157,16 @@ private: const SIRegisterInfo *TRI = nullptr; MachineRegisterInfo *MRI = nullptr; AliasAnalysis *AA = nullptr; - unsigned CreatedX2; + bool OptimizeAgain; static bool offsetsCanBeCombined(CombineInfo &CI); + static bool widthsFit(const CombineInfo &CI); + static unsigned getNewOpcode(const CombineInfo &CI); + static std::pair getSubRegIdxs(const CombineInfo &CI); + const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI); + unsigned getOpcodeWidth(const MachineInstr &MI); + InstClassEnum getInstClass(unsigned Opc); + unsigned getRegs(unsigned Opc); bool findMatchingInst(CombineInfo &CI); @@ -123,10 +179,21 @@ private: MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); - unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2, - bool &IsOffen) const; MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); + void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, + int32_t NewOffset); + unsigned computeBase(MachineInstr &MI, const MemAddress &Addr); + MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI); + Optional extractConstOffset(const MachineOperand &Op); + void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr); + /// Promotes constant offset to the immediate by adjusting the base. It + /// tries to use a base from the nearby instructions that allows it to have + /// a 13bit constant offset which gets promoted to the immediate. + bool promoteConstantOffsetToImm(MachineInstr &CI, + MemInfoMap &Visited, + SmallPtrSet &Promoted); + public: static char ID; @@ -153,8 +220,8 @@ public: INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load Store Optimizer", false, false) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", + false, false) char SILoadStoreOptimizer::ID = 0; @@ -165,7 +232,7 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() { } static void moveInstsAfter(MachineBasicBlock::iterator I, - ArrayRef InstsToMove) { + ArrayRef InstsToMove) { MachineBasicBlock *MBB = I->getParent(); ++I; for (MachineInstr *MI : InstsToMove) { @@ -191,21 +258,19 @@ static void addDefsUsesToList(const MachineInstr &MI, static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, MachineBasicBlock::iterator B, const SIInstrInfo *TII, - AliasAnalysis * AA) { + AliasAnalysis *AA) { // RAW or WAR - cannot reorder // WAW - cannot reorder // RAR - safe to reorder return !(A->mayStore() || B->mayStore()) || - TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); + TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); } // Add MI and its defs to the lists if MI reads one of the defs that are // already in the list. Returns true in that case. -static bool -addToListsIfDependent(MachineInstr &MI, - DenseSet &RegDefs, - DenseSet &PhysRegUses, - SmallVectorImpl &Insts) { +static bool addToListsIfDependent(MachineInstr &MI, DenseSet &RegDefs, + DenseSet &PhysRegUses, + SmallVectorImpl &Insts) { for (MachineOperand &Use : MI.operands()) { // If one of the defs is read, then there is a use of Def between I and the // instruction that I will potentially be merged with. We will need to move @@ -228,18 +293,16 @@ addToListsIfDependent(MachineInstr &MI, return false; } -static bool -canMoveInstsAcrossMemOp(MachineInstr &MemOp, - ArrayRef InstsToMove, - const SIInstrInfo *TII, - AliasAnalysis *AA) { +static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, + ArrayRef InstsToMove, + const SIInstrInfo *TII, AliasAnalysis *AA) { assert(MemOp.mayLoadOrStore()); for (MachineInstr *InstToMove : InstsToMove) { if (!InstToMove->mayLoadOrStore()) continue; if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA)) - return false; + return false; } return true; } @@ -260,10 +323,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { CI.BaseOff = 0; // Handle SMEM and VMEM instructions. - if (CI.InstClass != DS_READ_WRITE) { - unsigned Diff = CI.IsX2 ? 2 : 1; - return (EltOffset0 + Diff == EltOffset1 || - EltOffset1 + Diff == EltOffset0) && + if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { + return (EltOffset0 + CI.Width0 == EltOffset1 || + EltOffset1 + CI.Width1 == EltOffset0) && CI.GLC0 == CI.GLC1 && (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1); } @@ -305,42 +367,175 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { return false; } +bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) { + const unsigned Width = (CI.Width0 + CI.Width1); + switch (CI.InstClass) { + default: + return Width <= 4; + case S_BUFFER_LOAD_IMM: + switch (Width) { + default: + return false; + case 2: + case 4: + return true; + } + } +} + +unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) { + const unsigned Opc = MI.getOpcode(); + + if (TII->isMUBUF(MI)) { + return AMDGPU::getMUBUFDwords(Opc); + } + + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + return 1; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + return 2; + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return 4; + } +} + +InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) { + if (TII->isMUBUF(Opc)) { + const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc); + + // If we couldn't identify the opcode, bail out. + if (baseOpcode == -1) { + return UNKNOWN; + } + + switch (baseOpcode) { + default: + return UNKNOWN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: + return BUFFER_LOAD_OFFEN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + return BUFFER_LOAD_OFFSET; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + return BUFFER_STORE_OFFEN; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + return BUFFER_STORE_OFFSET; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: + return BUFFER_LOAD_OFFEN_exact; + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: + return BUFFER_LOAD_OFFSET_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + return BUFFER_STORE_OFFEN_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + return BUFFER_STORE_OFFSET_exact; + } + } + + switch (Opc) { + default: + return UNKNOWN; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return S_BUFFER_LOAD_IMM; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + return DS_READ; + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return DS_WRITE; + } +} + +unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) { + if (TII->isMUBUF(Opc)) { + unsigned result = 0; + + if (AMDGPU::getMUBUFHasVAddr(Opc)) { + result |= VADDR; + } + + if (AMDGPU::getMUBUFHasSrsrc(Opc)) { + result |= SRSRC; + } + + if (AMDGPU::getMUBUFHasSoffset(Opc)) { + result |= SOFFSET; + } + + return result; + } + + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return SBASE; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return ADDR; + } +} + bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = CI.I; - unsigned AddrOpName[3] = {0}; - int AddrIdx[3]; - const MachineOperand *AddrReg[3]; + const unsigned Opc = CI.I->getOpcode(); + const InstClassEnum InstClass = getInstClass(Opc); + + if (InstClass == UNKNOWN) { + return false; + } + + const unsigned Regs = getRegs(Opc); + + unsigned AddrOpName[5] = {0}; + int AddrIdx[5]; + const MachineOperand *AddrReg[5]; unsigned NumAddresses = 0; - switch (CI.InstClass) { - case DS_READ_WRITE: + if (Regs & ADDR) { AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; - break; - case S_BUFFER_LOAD_IMM: + } + + if (Regs & SBASE) { AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; - break; - case BUFFER_LOAD_OFFEN: - case BUFFER_STORE_OFFEN: - AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; - AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; - AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - break; - case BUFFER_LOAD_OFFSET: - case BUFFER_STORE_OFFSET: + } + + if (Regs & SRSRC) { AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; + } + + if (Regs & SOFFSET) { AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - break; + } + + if (Regs & VADDR) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; } for (unsigned i = 0; i < NumAddresses; i++) { AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]); AddrReg[i] = &CI.I->getOperand(AddrIdx[i]); - // We only ever merge operations with the same base address register, so don't - // bother scanning forward if there are no other uses. + // We only ever merge operations with the same base address register, so + // don't bother scanning forward if there are no other uses. if (AddrReg[i]->isReg() && (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) || MRI->hasOneNonDBGUse(AddrReg[i]->getReg()))) @@ -353,8 +548,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { DenseSet PhysRegUsesToMove; addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - for ( ; MBBI != E; ++MBBI) { - if (MBBI->getOpcode() != CI.I->getOpcode()) { + for (; MBBI != E; ++MBBI) { + const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE); + + if ((getInstClass(MBBI->getOpcode()) != InstClass) || + (IsDS && (MBBI->getOpcode() != Opc))) { // This is not a matching DS instruction, but we can keep looking as // long as one of these conditions are met: // 1. It is safe to move I down past MBBI. @@ -368,8 +566,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { } if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { + (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { // We fail condition #1, but we may still be able to satisfy condition // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. @@ -413,8 +611,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { continue; } - // Check same base pointer. Be careful of subregisters, which can occur with - // vectors of pointers. + // Check same base pointer. Be careful of subregisters, which can occur + // with vectors of pointers. if (AddrReg[i]->getReg() != AddrRegNext.getReg() || AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { Match = false; @@ -423,13 +621,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { } if (Match) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), - AMDGPU::OpName::offset); + int OffsetIdx = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset); CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm(); + CI.Width0 = getOpcodeWidth(*CI.I); CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm(); + CI.Width1 = getOpcodeWidth(*MBBI); CI.Paired = MBBI; - if (CI.InstClass == DS_READ_WRITE) { + if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) { CI.Offset0 &= 0xffff; CI.Offset1 &= 0xffff; } else { @@ -445,7 +645,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // We also need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (offsetsCanBeCombined(CI)) + if (widthsFit(CI) && offsetsCanBeCombined(CI)) if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) return true; } @@ -472,12 +672,12 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; - return (EltSize == 4) ? - AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9; + return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 + : AMDGPU::DS_READ2ST64_B64_gfx9; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -489,8 +689,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = CI.UseST64 ? - read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); + unsigned Opc = + CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; @@ -502,13 +702,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( } assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); + (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Read2Desc = TII->get(Opc); - const TargetRegisterClass *SuperRC - = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + const TargetRegisterClass *SuperRC = + (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); DebugLoc DL = CI.I->getDebugLoc(); @@ -519,23 +718,24 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) - .addImm(CI.BaseOff); + .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) - .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg); + .addReg(ImmReg) + .addReg(AddrReg->getReg(), 0, BaseSubReg); BaseSubReg = 0; } - MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) - .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + MachineInstrBuilder Read2 = + BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); (void)Read2; @@ -562,32 +762,36 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; - return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9; + return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 + : AMDGPU::DS_WRITE2_B64_gfx9; } unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) - return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 + : AMDGPU::DS_WRITE2ST64_B64; - return (EltSize == 4) ? - AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9; + return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 + : AMDGPU::DS_WRITE2ST64_B64_gfx9; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. - const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); - const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); - const MachineOperand *Data1 - = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); + const MachineOperand *AddrReg = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); + const MachineOperand *Data1 = + TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = CI.UseST64 ? - write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); + unsigned Opc = + CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. @@ -596,8 +800,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( } assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); + (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = CI.I->getDebugLoc(); @@ -608,25 +811,26 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) - .addImm(CI.BaseOff); + .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) - .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg); + .addReg(ImmReg) + .addReg(AddrReg->getReg(), 0, BaseSubReg); BaseSubReg = 0; } - MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc) - .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr - .add(*Data0) // data0 - .add(*Data1) // data1 - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + MachineInstrBuilder Write2 = + BuildMI(*MBB, CI.Paired, DL, Write2Desc) + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .add(*Data0) // data0 + .add(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); moveInstsAfter(Write2, CI.InstsToMove); @@ -638,15 +842,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( return Next; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM : - AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + const unsigned Opcode = getNewOpcode(CI); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); @@ -656,12 +859,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( .addImm(CI.GLC0) // glc .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; - - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the old destination registers. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -683,29 +883,25 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( return Next; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - unsigned Opcode; - if (CI.InstClass == BUFFER_LOAD_OFFEN) { - Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN : - AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; - } else { - Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET : - AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; - } + const unsigned Opcode = getNewOpcode(CI); - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); + + // Copy to the new source register. unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); - if (CI.InstClass == BUFFER_LOAD_OFFEN) - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + const unsigned Regs = getRegs(Opcode); + + if (Regs & VADDR) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -715,12 +911,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(0) // tfe .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; - - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the old destination registers. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -742,57 +935,137 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( return Next; } -unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode( - const MachineInstr &I, bool &IsX2, bool &IsOffen) const { - IsX2 = false; - IsOffen = false; - - switch (I.getOpcode()) { - case AMDGPU::BUFFER_STORE_DWORD_OFFEN: - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; - case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: - IsX2 = true; - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: - IsX2 = true; - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET: - return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: - return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: - IsX2 = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: - IsX2 = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact; - } - return 0; -} - -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( - CombineInfo &CI) { +unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { + const unsigned Width = CI.Width0 + CI.Width1; + + switch (CI.InstClass) { + default: + return AMDGPU::getMUBUFOpcode(CI.InstClass, Width); + case UNKNOWN: + llvm_unreachable("Unknown instruction class"); + case S_BUFFER_LOAD_IMM: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + case 4: + return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; + } + } +} + +std::pair +SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { + if (CI.Offset0 > CI.Offset1) { + switch (CI.Width0) { + default: + return std::make_pair(0, 0); + case 1: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1, AMDGPU::sub0); + case 2: + return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1); + case 3: + return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2); + } + case 2: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0); + case 2: + return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1); + } + case 3: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0); + } + } + } else { + switch (CI.Width0) { + default: + return std::make_pair(0, 0); + case 1: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1); + case 2: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2); + case 3: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3); + } + case 2: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2); + case 2: + return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3); + } + case 3: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3); + } + } + } +} + +const TargetRegisterClass * +SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) { + if (CI.InstClass == S_BUFFER_LOAD_IMM) { + switch (CI.Width0 + CI.Width1) { + default: + return nullptr; + case 2: + return &AMDGPU::SReg_64_XEXECRegClass; + case 4: + return &AMDGPU::SReg_128RegClass; + case 8: + return &AMDGPU::SReg_256RegClass; + case 16: + return &AMDGPU::SReg_512RegClass; + } + } else { + switch (CI.Width0 + CI.Width1) { + default: + return nullptr; + case 2: + return &AMDGPU::VReg_64RegClass; + case 3: + return &AMDGPU::VReg_96RegClass; + case 4: + return &AMDGPU::VReg_128RegClass; + } + } +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - bool Unused1, Unused2; - unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + const unsigned Opcode = getNewOpcode(CI); - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the new source register. - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); unsigned SrcReg = MRI->createVirtualRegister(SuperRC); const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); @@ -805,10 +1078,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( .addImm(SubRegIdx1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) - .addReg(SrcReg, RegState::Kill); + .addReg(SrcReg, RegState::Kill); + + const unsigned Regs = getRegs(Opcode); - if (CI.InstClass == BUFFER_STORE_OFFEN) - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + if (Regs & VADDR) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -826,105 +1101,399 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( return Next; } +MachineOperand +SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) { + APInt V(32, Val, true); + if (TII->isInlineConstant(V)) + return MachineOperand::CreateImm(Val); + + unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + MachineInstr *Mov = + BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::S_MOV_B32), Reg) + .addImm(Val); + (void)Mov; + LLVM_DEBUG(dbgs() << " "; Mov->dump()); + return MachineOperand::CreateReg(Reg, false); +} + +// Compute base address using Addr and return the final register. +unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, + const MemAddress &Addr) { + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + DebugLoc DL = MI.getDebugLoc(); + + assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || + Addr.Base.LoSubReg) && + "Expected 32-bit Base-Register-Low!!"); + + assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || + Addr.Base.HiSubReg) && + "Expected 32-bit Base-Register-Hi!!"); + + LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); + MachineOperand OffsetLo = createRegOrImm(static_cast(Addr.Offset), MI); + MachineOperand OffsetHi = + createRegOrImm(static_cast(Addr.Offset >> 32), MI); + unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned DeadCarryReg = + MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + + unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MachineInstr *LoHalf = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0) + .addReg(CarryReg, RegState::Define) + .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) + .add(OffsetLo); + (void)LoHalf; + LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); + + MachineInstr *HiHalf = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) + .addReg(DeadCarryReg, RegState::Define | RegState::Dead) + .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) + .add(OffsetHi) + .addReg(CarryReg, RegState::Kill); + (void)HiHalf; + LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); + + unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + MachineInstr *FullBase = + BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + (void)FullBase; + LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); + + return FullDestReg; +} + +// Update base and offset with the NewBase and NewOffset in MI. +void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, + unsigned NewBase, + int32_t NewOffset) { + TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase); + TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); +} + +Optional +SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) { + if (Op.isImm()) + return Op.getImm(); + + if (!Op.isReg()) + return None; + + MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); + if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || + !Def->getOperand(1).isImm()) + return None; + + return Def->getOperand(1).getImm(); +} + +// Analyze Base and extracts: +// - 32bit base registers, subregisters +// - 64bit constant offset +// Expecting base computation as: +// %OFFSET0:sgpr_32 = S_MOV_B32 8000 +// %LO:vgpr_32, %c:sreg_64_xexec = +// V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, +// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec +// %Base:vreg_64 = +// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 +void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, + MemAddress &Addr) { + if (!Base.isReg()) + return; + + MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); + if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE + || Def->getNumOperands() != 5) + return; + + MachineOperand BaseLo = Def->getOperand(1); + MachineOperand BaseHi = Def->getOperand(3); + if (!BaseLo.isReg() || !BaseHi.isReg()) + return; + + MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); + MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); + + if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 || + !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) + return; + + const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); + const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); + + auto Offset0P = extractConstOffset(*Src0); + if (Offset0P) + BaseLo = *Src1; + else { + if (!(Offset0P = extractConstOffset(*Src1))) + return; + BaseLo = *Src0; + } + + Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); + Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); + + if (Src0->isImm()) + std::swap(Src0, Src1); + + if (!Src1->isImm()) + return; + + uint64_t Offset1 = Src1->getImm(); + BaseHi = *Src0; + + Addr.Base.LoReg = BaseLo.getReg(); + Addr.Base.HiReg = BaseHi.getReg(); + Addr.Base.LoSubReg = BaseLo.getSubReg(); + Addr.Base.HiSubReg = BaseHi.getSubReg(); + Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); +} + +bool SILoadStoreOptimizer::promoteConstantOffsetToImm( + MachineInstr &MI, + MemInfoMap &Visited, + SmallPtrSet &AnchorList) { + + // TODO: Support flat and scratch. + if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 || + TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) + return false; + + // TODO: Support Store. + if (!MI.mayLoad()) + return false; + + if (AnchorList.count(&MI)) + return false; + + LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump()); + + if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) { + LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";); + return false; + } + + // Step1: Find the base-registers and a 64bit constant offset. + MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); + MemAddress MAddr; + if (Visited.find(&MI) == Visited.end()) { + processBaseWithConstOffset(Base, MAddr); + Visited[&MI] = MAddr; + } else + MAddr = Visited[&MI]; + + if (MAddr.Offset == 0) { + LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no" + " constant offsets that can be promoted.\n";); + return false; + } + + LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", " + << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";); + + // Step2: Traverse through MI's basic block and find an anchor(that has the + // same base-registers) with the highest 13bit distance from MI's offset. + // E.g. (64bit loads) + // bb: + // addr1 = &a + 4096; load1 = load(addr1, 0) + // addr2 = &a + 6144; load2 = load(addr2, 0) + // addr3 = &a + 8192; load3 = load(addr3, 0) + // addr4 = &a + 10240; load4 = load(addr4, 0) + // addr5 = &a + 12288; load5 = load(addr5, 0) + // + // Starting from the first load, the optimization will try to find a new base + // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192 + // has 13bit distance from &a + 4096. The heuristic considers &a + 8192 + // as the new-base(anchor) because of the maximum distance which can + // accomodate more intermediate bases presumeably. + // + // Step3: move (&a + 8192) above load1. Compute and promote offsets from + // (&a + 8192) for load1, load2, load4. + // addr = &a + 8192 + // load1 = load(addr, -4096) + // load2 = load(addr, -2048) + // load3 = load(addr, 0) + // load4 = load(addr, 2048) + // addr5 = &a + 12288; load5 = load(addr5, 0) + // + MachineInstr *AnchorInst = nullptr; + MemAddress AnchorAddr; + uint32_t MaxDist = std::numeric_limits::min(); + SmallVector, 4> InstsWCommonBase; + + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::iterator E = MBB->end(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + ++MBBI; + const SITargetLowering *TLI = + static_cast(STM->getTargetLowering()); + + for ( ; MBBI != E; ++MBBI) { + MachineInstr &MINext = *MBBI; + // TODO: Support finding an anchor(with same base) from store addresses or + // any other load addresses where the opcodes are different. + if (MINext.getOpcode() != MI.getOpcode() || + TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) + continue; + + const MachineOperand &BaseNext = + *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); + MemAddress MAddrNext; + if (Visited.find(&MINext) == Visited.end()) { + processBaseWithConstOffset(BaseNext, MAddrNext); + Visited[&MINext] = MAddrNext; + } else + MAddrNext = Visited[&MINext]; + + if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || + MAddrNext.Base.HiReg != MAddr.Base.HiReg || + MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || + MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) + continue; + + InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset)); + + int64_t Dist = MAddr.Offset - MAddrNext.Offset; + TargetLoweringBase::AddrMode AM; + AM.HasBaseReg = true; + AM.BaseOffs = Dist; + if (TLI->isLegalGlobalAddressingMode(AM) && + (uint32_t)std::abs(Dist) > MaxDist) { + MaxDist = std::abs(Dist); + + AnchorAddr = MAddrNext; + AnchorInst = &MINext; + } + } + + if (AnchorInst) { + LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; + AnchorInst->dump()); + LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " + << AnchorAddr.Offset << "\n\n"); + + // Instead of moving up, just re-compute anchor-instruction's base address. + unsigned Base = computeBase(MI, AnchorAddr); + + updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); + LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); + + for (auto P : InstsWCommonBase) { + TargetLoweringBase::AddrMode AM; + AM.HasBaseReg = true; + AM.BaseOffs = P.second - AnchorAddr.Offset; + + if (TLI->isLegalGlobalAddressingMode(AM)) { + LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second; + dbgs() << ")"; P.first->dump()); + updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset); + LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump()); + } + } + AnchorList.insert(AnchorInst); + return true; + } + + return false; +} + // Scan through looking for adjacent LDS operations with constant offsets from // the same base register. We rely on the scheduler to do the hard work of // clustering nearby loads, and assume these are all adjacent. bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { bool Modified = false; + // Contain the list + MemInfoMap Visited; + // Contains the list of instructions for which constant offsets are being + // promoted to the IMM. + SmallPtrSet AnchorList; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { MachineInstr &MI = *I; + if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) + Modified = true; + // Don't combine if volatile. if (MI.hasOrderedMemoryRef()) { ++I; continue; } + const unsigned Opc = MI.getOpcode(); + CombineInfo CI; CI.I = I; - unsigned Opc = MI.getOpcode(); - if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 || - Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) { + CI.InstClass = getInstClass(Opc); - CI.InstClass = DS_READ_WRITE; + switch (CI.InstClass) { + default: + break; + case DS_READ: CI.EltSize = - (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4; - + (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 + : 4; if (findMatchingInst(CI)) { Modified = true; I = mergeRead2Pair(CI); } else { ++I; } - continue; - } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 || - Opc == AMDGPU::DS_WRITE_B32_gfx9 || - Opc == AMDGPU::DS_WRITE_B64_gfx9) { - CI.InstClass = DS_READ_WRITE; - CI.EltSize - = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4; - + case DS_WRITE: + CI.EltSize = + (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 + : 4; if (findMatchingInst(CI)) { Modified = true; I = mergeWrite2Pair(CI); } else { ++I; } - continue; - } - if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || - Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) { - // EltSize is in units of the offset encoding. - CI.InstClass = S_BUFFER_LOAD_IMM; + case S_BUFFER_LOAD_IMM: CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); - CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; if (findMatchingInst(CI)) { Modified = true; I = mergeSBufferLoadImmPair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 16; } else { ++I; } continue; - } - if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) { - if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) - CI.InstClass = BUFFER_LOAD_OFFEN; - else - CI.InstClass = BUFFER_LOAD_OFFSET; - + case BUFFER_LOAD_OFFEN: + case BUFFER_LOAD_OFFSET: + case BUFFER_LOAD_OFFEN_exact: + case BUFFER_LOAD_OFFSET_exact: CI.EltSize = 4; - CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; if (findMatchingInst(CI)) { Modified = true; I = mergeBufferLoadPair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; } else { ++I; } continue; - } - - bool StoreIsX2, IsOffen; - if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) { - CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET; + case BUFFER_STORE_OFFEN: + case BUFFER_STORE_OFFSET: + case BUFFER_STORE_OFFEN_exact: + case BUFFER_STORE_OFFSET_exact: CI.EltSize = 4; - CI.IsX2 = StoreIsX2; if (findMatchingInst(CI)) { Modified = true; I = mergeBufferStorePair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; } else { ++I; } @@ -958,12 +1527,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Modified = false; for (MachineBasicBlock &MBB : MF) { - CreatedX2 = 0; - Modified |= optimizeBlock(MBB); - - // Run again to convert x2 to x4. - if (CreatedX2 >= 1) + do { + OptimizeAgain = false; Modified |= optimizeBlock(MBB); + } while (OptimizeAgain); } return Modified; diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index 6670def7d09457d319133a6fdfc04c52314e96c8..fb7e670068fe61201caf3dae4acb8258ac5646bc 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -1955,12 +1955,12 @@ void SIScheduleDAGMI::schedule() for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { SUnit *SU = &SUnits[i]; - unsigned BaseLatReg; + MachineOperand *BaseLatOp; int64_t OffLatReg; if (SITII->isLowLatencyInstruction(*SU->getInstr())) { IsLowLatencySU[i] = 1; - if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg, - TRI)) + if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg, + TRI)) LowLatencyOffset[i] = OffLatReg; } else if (SITII->isHighLatencyInstruction(*SU->getInstr())) IsHighLatencySU[i] = 1; diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 6fa52f57601be4844849fa52ff54c738dc4cc5e1..b4a4e9e33133dfb5373300e5b6c2395acf6e40f1 100644 --- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -812,6 +812,12 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); + const GCNSubtarget &STM = MBB.getParent()->getSubtarget(); + + const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS() + ? AMDGPU::BUFFER_WBINVL1 + : AMDGPU::BUFFER_WBINVL1_VOL; + if (Pos == Position::AFTER) ++MI; @@ -819,7 +825,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL)); + BuildMI(MBB, MI, DL, TII->get(Flush)); Changed = true; break; case SIAtomicScope::WORKGROUP: diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp new file mode 100644 index 0000000000000000000000000000000000000000..883fd308f2f4bd64c622fb69729117e336bb5dde --- /dev/null +++ b/lib/Target/AMDGPU/SIModeRegister.cpp @@ -0,0 +1,406 @@ +//===-- SIModeRegister.cpp - Mode Register --------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This pass inserts changes to the Mode register settings as required. +/// Note that currently it only deals with the Double Precision Floating Point +/// rounding mode setting, but is intended to be generic enough to be easily +/// expanded. +/// +//===----------------------------------------------------------------------===// +// +#include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include + +#define DEBUG_TYPE "si-mode-register" + +STATISTIC(NumSetregInserted, "Number of setreg of mode register inserted."); + +using namespace llvm; + +struct Status { + // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a + // known value + unsigned Mask; + unsigned Mode; + + Status() : Mask(0), Mode(0){}; + + Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) { + Mode &= Mask; + }; + + // merge two status values such that only values that don't conflict are + // preserved + Status merge(const Status &S) const { + return Status((Mask | S.Mask), ((Mode & ~S.Mask) | (S.Mode & S.Mask))); + } + + // merge an unknown value by using the unknown value's mask to remove bits + // from the result + Status mergeUnknown(unsigned newMask) { + return Status(Mask & ~newMask, Mode & ~newMask); + } + + // intersect two Status values to produce a mode and mask that is a subset + // of both values + Status intersect(const Status &S) const { + unsigned NewMask = (Mask & S.Mask) & (Mode ^ ~S.Mode); + unsigned NewMode = (Mode & NewMask); + return Status(NewMask, NewMode); + } + + // produce the delta required to change the Mode to the required Mode + Status delta(const Status &S) const { + return Status((S.Mask & (Mode ^ S.Mode)) | (~Mask & S.Mask), S.Mode); + } + + bool operator==(const Status &S) const { + return (Mask == S.Mask) && (Mode == S.Mode); + } + + bool operator!=(const Status &S) const { return !(*this == S); } + + bool isCompatible(Status &S) { + return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode); + } + + bool isCombinable(Status &S) { + return !(Mask & S.Mask) || isCompatible(S); + } +}; + +class BlockData { +public: + // The Status that represents the mode register settings required by the + // FirstInsertionPoint (if any) in this block. Calculated in Phase 1. + Status Require; + + // The Status that represents the net changes to the Mode register made by + // this block, Calculated in Phase 1. + Status Change; + + // The Status that represents the mode register settings on exit from this + // block. Calculated in Phase 2. + Status Exit; + + // The Status that represents the intersection of exit Mode register settings + // from all predecessor blocks. Calculated in Phase 2, and used by Phase 3. + Status Pred; + + // In Phase 1 we record the first instruction that has a mode requirement, + // which is used in Phase 3 if we need to insert a mode change. + MachineInstr *FirstInsertionPoint; + + BlockData() : FirstInsertionPoint(nullptr) {}; +}; + +namespace { + +class SIModeRegister : public MachineFunctionPass { +public: + static char ID; + + std::vector> BlockInfo; + std::queue Phase2List; + + // The default mode register setting currently only caters for the floating + // point double precision rounding mode. + // We currently assume the default rounding mode is Round to Nearest + // NOTE: this should come from a per function rounding mode setting once such + // a setting exists. + unsigned DefaultMode = FP_ROUND_ROUND_TO_NEAREST; + Status DefaultStatus = + Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode)); + +public: + SIModeRegister() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII); + + void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII); + + void processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII); + + Status getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII); + + void insertSetreg(MachineBasicBlock &MBB, MachineInstr *I, + const SIInstrInfo *TII, Status InstrMode); +}; +} // End anonymous namespace. + +INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE, + "Insert required mode register values", false, false) + +char SIModeRegister::ID = 0; + +char &llvm::SIModeRegisterID = SIModeRegister::ID; + +FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); } + +// Determine the Mode register setting required for this instruction. +// Instructions which don't use the Mode register return a null Status. +// Note this currently only deals with instructions that use the floating point +// double precision setting. +Status SIModeRegister::getInstructionMode(MachineInstr &MI, + const SIInstrInfo *TII) { + if (TII->usesFPDPRounding(MI)) { + switch (MI.getOpcode()) { + case AMDGPU::V_INTERP_P1LL_F16: + case AMDGPU::V_INTERP_P1LV_F16: + case AMDGPU::V_INTERP_P2_F16: + // f16 interpolation instructions need double precision round to zero + return Status(FP_ROUND_MODE_DP(3), + FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO)); + default: + return DefaultStatus; + } + } + return Status(); +} + +// Insert a setreg instruction to update the Mode register. +// It is possible (though unlikely) for an instruction to require a change to +// the value of disjoint parts of the Mode register when we don't know the +// value of the intervening bits. In that case we need to use more than one +// setreg instruction. +void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, + const SIInstrInfo *TII, Status InstrMode) { + while (InstrMode.Mask) { + unsigned Offset = countTrailingZeros(InstrMode.Mask); + unsigned Width = countTrailingOnes(InstrMode.Mask >> Offset); + unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1); + BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32)) + .addImm(Value) + .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) | + (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) | + (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_)); + ++NumSetregInserted; + InstrMode.Mask &= ~(((1 << Width) - 1) << Offset); + } +} + +// In Phase 1 we iterate through the instructions of the block and for each +// instruction we get its mode usage. If the instruction uses the Mode register +// we: +// - update the Change status, which tracks the changes to the Mode register +// made by this block +// - if this instruction's requirements are compatible with the current setting +// of the Mode register we merge the modes +// - if it isn't compatible and an InsertionPoint isn't set, then we set the +// InsertionPoint to the current instruction, and we remember the current +// mode +// - if it isn't compatible and InsertionPoint is set we insert a seteg before +// that instruction (unless this instruction forms part of the block's +// entry requirements in which case the insertion is deferred until Phase 3 +// when predecessor exit values are known), and move the insertion point to +// this instruction +// - if this is a setreg instruction we treat it as an incompatible instruction. +// This is sub-optimal but avoids some nasty corner cases, and is expected to +// occur very rarely. +// - on exit we have set the Require, Change, and initial Exit modes. +void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, + const SIInstrInfo *TII) { + auto NewInfo = llvm::make_unique(); + MachineInstr *InsertionPoint = nullptr; + // RequirePending is used to indicate whether we are collecting the initial + // requirements for the block, and need to defer the first InsertionPoint to + // Phase 3. It is set to false once we have set FirstInsertionPoint, or when + // we discover an explict setreg that means this block doesn't have any + // initial requirements. + bool RequirePending = true; + Status IPChange; + for (MachineInstr &MI : MBB) { + Status InstrMode = getInstructionMode(MI, TII); + if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) || + (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) { + // We preserve any explicit mode register setreg instruction we encounter, + // as we assume it has been inserted by a higher authority (this is + // likely to be a very rare occurrence). + unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); + if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) != + AMDGPU::Hwreg::ID_MODE) + continue; + + unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >> + AMDGPU::Hwreg::WIDTH_M1_SHIFT_) + + 1; + unsigned Offset = + (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_; + unsigned Mask = ((1 << Width) - 1) << Offset; + + // If an InsertionPoint is set we will insert a setreg there. + if (InsertionPoint) { + insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change)); + InsertionPoint = nullptr; + } + // If this is an immediate then we know the value being set, but if it is + // not an immediate then we treat the modified bits of the mode register + // as unknown. + if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) { + unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm(); + unsigned Mode = (Val << Offset) & Mask; + Status Setreg = Status(Mask, Mode); + // If we haven't already set the initial requirements for the block we + // don't need to as the requirements start from this explicit setreg. + RequirePending = false; + NewInfo->Change = NewInfo->Change.merge(Setreg); + } else { + NewInfo->Change = NewInfo->Change.mergeUnknown(Mask); + } + } else if (!NewInfo->Change.isCompatible(InstrMode)) { + // This instruction uses the Mode register and its requirements aren't + // compatible with the current mode. + if (InsertionPoint) { + // If the required mode change cannot be included in the current + // InsertionPoint changes, we need a setreg and start a new + // InsertionPoint. + if (!IPChange.delta(NewInfo->Change).isCombinable(InstrMode)) { + if (RequirePending) { + // This is the first insertionPoint in the block so we will defer + // the insertion of the setreg to Phase 3 where we know whether or + // not it is actually needed. + NewInfo->FirstInsertionPoint = InsertionPoint; + NewInfo->Require = NewInfo->Change; + RequirePending = false; + } else { + insertSetreg(MBB, InsertionPoint, TII, + IPChange.delta(NewInfo->Change)); + IPChange = NewInfo->Change; + } + // Set the new InsertionPoint + InsertionPoint = &MI; + } + NewInfo->Change = NewInfo->Change.merge(InstrMode); + } else { + // No InsertionPoint is currently set - this is either the first in + // the block or we have previously seen an explicit setreg. + InsertionPoint = &MI; + IPChange = NewInfo->Change; + NewInfo->Change = NewInfo->Change.merge(InstrMode); + } + } + } + if (RequirePending) { + // If we haven't yet set the initial requirements for the block we set them + // now. + NewInfo->FirstInsertionPoint = InsertionPoint; + NewInfo->Require = NewInfo->Change; + } else if (InsertionPoint) { + // We need to insert a setreg at the InsertionPoint + insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change)); + } + NewInfo->Exit = NewInfo->Change; + BlockInfo[MBB.getNumber()] = std::move(NewInfo); +} + +// In Phase 2 we revisit each block and calculate the common Mode register +// value provided by all predecessor blocks. If the Exit value for the block +// is changed, then we add the successor blocks to the worklist so that the +// exit value is propagated. +void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB, + const SIInstrInfo *TII) { +// BlockData *BI = BlockInfo[MBB.getNumber()]; + unsigned ThisBlock = MBB.getNumber(); + if (MBB.pred_empty()) { + // There are no predecessors, so use the default starting status. + BlockInfo[ThisBlock]->Pred = DefaultStatus; + } else { + // Build a status that is common to all the predecessors by intersecting + // all the predecessor exit status values. + MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end(); + MachineBasicBlock &PB = *(*P); + BlockInfo[ThisBlock]->Pred = BlockInfo[PB.getNumber()]->Exit; + + for (P = std::next(P); P != E; P = std::next(P)) { + MachineBasicBlock *Pred = *P; + BlockInfo[ThisBlock]->Pred = BlockInfo[ThisBlock]->Pred.intersect(BlockInfo[Pred->getNumber()]->Exit); + } + } + Status TmpStatus = BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change); + if (BlockInfo[ThisBlock]->Exit != TmpStatus) { + BlockInfo[ThisBlock]->Exit = TmpStatus; + // Add the successors to the work list so we can propagate the changed exit + // status. + for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(), + E = MBB.succ_end(); + S != E; S = std::next(S)) { + MachineBasicBlock &B = *(*S); + Phase2List.push(&B); + } + } +} + +// In Phase 3 we revisit each block and if it has an insertion point defined we +// check whether the predecessor mode meets the block's entry requirements. If +// not we insert an appropriate setreg instruction to modify the Mode register. +void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB, + const SIInstrInfo *TII) { +// BlockData *BI = BlockInfo[MBB.getNumber()]; + unsigned ThisBlock = MBB.getNumber(); + if (!BlockInfo[ThisBlock]->Pred.isCompatible(BlockInfo[ThisBlock]->Require)) { + Status Delta = BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require); + if (BlockInfo[ThisBlock]->FirstInsertionPoint) + insertSetreg(MBB, BlockInfo[ThisBlock]->FirstInsertionPoint, TII, Delta); + else + insertSetreg(MBB, &MBB.instr_front(), TII, Delta); + } +} + +bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) { + BlockInfo.resize(MF.getNumBlockIDs()); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + // Processing is performed in a number of phases + + // Phase 1 - determine the initial mode required by each block, and add setreg + // instructions for intra block requirements. + for (MachineBasicBlock &BB : MF) + processBlockPhase1(BB, TII); + + // Phase 2 - determine the exit mode from each block. We add all blocks to the + // list here, but will also add any that need to be revisited during Phase 2 + // processing. + for (MachineBasicBlock &BB : MF) + Phase2List.push(&BB); + while (!Phase2List.empty()) { + processBlockPhase2(*Phase2List.front(), TII); + Phase2List.pop(); + } + + // Phase 3 - add an initial setreg to each block where the required entry mode + // is not satisfied by the exit mode of all its predecessors. + for (MachineBasicBlock &BB : MF) + processBlockPhase3(BB, TII); + + BlockInfo.clear(); + + return NumSetregInserted > 0; +} diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index e53d876f232b31f639e713625a39a6435a0a9fe3..c671fed34bdf14bfcc27dc34183f4889f238ca56 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -103,6 +103,122 @@ static MachineInstr* getOrExecSource(const MachineInstr &MI, return SaveExecInst; } +// Optimize sequence +// %sel = V_CNDMASK_B32_e64 0, 1, %cc +// %cmp = V_CMP_NE_U32 1, %1 +// $vcc = S_AND_B64 $exec, %cmp +// S_CBRANCH_VCC[N]Z +// => +// $vcc = S_ANDN2_B64 $exec, %cc +// S_CBRANCH_VCC[N]Z +// +// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the +// rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but +// only 3 first instructions are really needed. S_AND_B64 with exec is a +// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive +// lanes. +// +// Returns %cc register on success. +static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, + const GCNSubtarget &ST, + MachineRegisterInfo &MRI, + LiveIntervals *LIS) { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const unsigned AndOpc = AMDGPU::S_AND_B64; + const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64; + const unsigned CondReg = AMDGPU::VCC; + const unsigned ExecReg = AMDGPU::EXEC; + + auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return Opc == AMDGPU::S_CBRANCH_VCCZ || + Opc == AMDGPU::S_CBRANCH_VCCNZ; }); + if (I == MBB.terminators().end()) + return AMDGPU::NoRegister; + + auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, + *I, MRI, LIS); + if (!And || And->getOpcode() != AndOpc || + !And->getOperand(1).isReg() || !And->getOperand(2).isReg()) + return AMDGPU::NoRegister; + + MachineOperand *AndCC = &And->getOperand(1); + unsigned CmpReg = AndCC->getReg(); + unsigned CmpSubReg = AndCC->getSubReg(); + if (CmpReg == ExecReg) { + AndCC = &And->getOperand(2); + CmpReg = AndCC->getReg(); + CmpSubReg = AndCC->getSubReg(); + } else if (And->getOperand(2).getReg() != ExecReg) { + return AMDGPU::NoRegister; + } + + auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, MRI, LIS); + if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 || + Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) || + Cmp->getParent() != And->getParent()) + return AMDGPU::NoRegister; + + MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0); + MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1); + if (Op1->isImm() && Op2->isReg()) + std::swap(Op1, Op2); + if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1) + return AMDGPU::NoRegister; + + unsigned SelReg = Op1->getReg(); + auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS); + if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) + return AMDGPU::NoRegister; + + Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0); + Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1); + MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2); + if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() || + Op1->getImm() != 0 || Op2->getImm() != 1) + return AMDGPU::NoRegister; + + LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' + << *Cmp << '\t' << *And); + + unsigned CCReg = CC->getReg(); + LIS->RemoveMachineInstrFromMaps(*And); + MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(), + TII->get(Andn2Opc), And->getOperand(0).getReg()) + .addReg(ExecReg) + .addReg(CCReg, CC->getSubReg()); + And->eraseFromParent(); + LIS->InsertMachineInstrInMaps(*Andn2); + + LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n'); + + // Try to remove compare. Cmp value should not used in between of cmp + // and s_and_b64 if VCC or just unused if any other register. + if ((TargetRegisterInfo::isVirtualRegister(CmpReg) && + MRI.use_nodbg_empty(CmpReg)) || + (CmpReg == CondReg && + std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(), + [&](const MachineInstr &MI) { + return MI.readsRegister(CondReg, TRI); }))) { + LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n'); + + LIS->RemoveMachineInstrFromMaps(*Cmp); + Cmp->eraseFromParent(); + + // Try to remove v_cndmask_b32. + if (TargetRegisterInfo::isVirtualRegister(SelReg) && + MRI.use_nodbg_empty(SelReg)) { + LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n'); + + LIS->RemoveMachineInstrFromMaps(*Sel); + Sel->eraseFromParent(); + } + } + + return CCReg; +} + bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -117,6 +233,14 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { + if (unsigned Reg = optimizeVcndVcmpPair(MBB, ST, MRI, LIS)) { + RecalcRegs.insert(Reg); + RecalcRegs.insert(AMDGPU::VCC_LO); + RecalcRegs.insert(AMDGPU::VCC_HI); + RecalcRegs.insert(AMDGPU::SCC); + Changed = true; + } + // Try to remove unneeded instructions before s_endpgm. if (MBB.succ_empty()) { if (MBB.empty()) diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 0e000b72962eb0ea37df3363e5f30a2bd8e683f3..2d43d5d05ef6437a0f49038141978d1218646b5f 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -90,7 +90,9 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineBasicBlock &MBB); std::unique_ptr matchSDWAOperand(MachineInstr &MI); - bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const; + bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; + void pseudoOpConvertToVOP2(MachineInstr &MI, + const GCNSubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; @@ -854,7 +856,82 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { } } -bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, +// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and +// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA +// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa. +// +// We are transforming from a VOP3 into a VOP2 form of the instruction. +// %19:vgpr_32 = V_AND_B32_e32 255, +// killed %16:vgpr_32, implicit $exec +// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64 +// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec +// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 +// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec +// +// becomes +// %47:vgpr_32 = V_ADD_I32_sdwa +// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, +// implicit-def $vcc, implicit $exec +// %48:vgpr_32 = V_ADDC_U32_e32 +// 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec +void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, + const GCNSubtarget &ST) const { + int Opc = MI.getOpcode(); + assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) && + "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64"); + + // Can the candidate MI be shrunk? + if (!TII->canShrink(MI, *MRI)) + return; + Opc = AMDGPU::getVOPe32(Opc); + // Find the related ADD instruction. + const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + if (!Sdst) + return; + MachineOperand *NextOp = findSingleRegUse(Sdst, MRI); + if (!NextOp) + return; + MachineInstr &MISucc = *NextOp->getParent(); + // Can the successor be shrunk? + if (!TII->canShrink(MISucc, *MRI)) + return; + int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode()); + // Make sure the carry in/out are subsequently unused. + MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2); + if (!CarryIn) + return; + MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst); + if (!CarryOut) + return; + if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg())) + return; + // Make sure VCC or its subregs are dead before MI. + MachineBasicBlock &MBB = *MI.getParent(); + auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25); + if (Liveness != MachineBasicBlock::LQR_Dead) + return; + // Check if VCC is referenced in range of (MI,MISucc]. + for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator(); + I != E; ++I) { + if (I->modifiesRegister(AMDGPU::VCC, TRI)) + return; + } + // Make the two new e32 instruction variants. + // Replace MI with V_{SUB|ADD}_I32_e32 + auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc)); + NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst)); + NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); + NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)); + MI.eraseFromParent(); + // Replace MISucc with V_{SUBB|ADDC}_U32_e32 + auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc)); + NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst)); + NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0)); + NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1)); + MISucc.eraseFromParent(); +} + +bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const { // Check if this is already an SDWA instruction unsigned Opc = MI.getOpcode(); @@ -1127,6 +1204,22 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { bool Changed = false; do { + // Preprocess the ADD/SUB pairs so they could be SDWA'ed. + // Look for a possible ADD or SUB that resulted from a previously lowered + // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2 + // lowers the pair of instructions into e32 form. + matchSDWAOperands(MBB); + for (const auto &OperandPair : SDWAOperands) { + const auto &Operand = OperandPair.second; + MachineInstr *PotentialMI = Operand->potentialToConvert(TII); + if (PotentialMI && + (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 || + PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64)) + pseudoOpConvertToVOP2(*PotentialMI, ST); + } + SDWAOperands.clear(); + + // Generate potential match list. matchSDWAOperands(MBB); for (const auto &OperandPair : SDWAOperands) { diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 669a60fadd4f891e28b123d62f06b4ba2efef6b6..97cfde2b2354128982007949da015f1d03000fe0 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -18,9 +18,12 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -901,7 +904,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, .addImm(0) // glc .addMemOperand(MMO); - if (NumSubRegs > 1) + if (NumSubRegs > 1 && i == 0) MIB.addReg(SuperReg, RegState::ImplicitDefine); continue; @@ -915,7 +918,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, .addReg(Spill.VGPR) .addImm(Spill.Lane); - if (NumSubRegs > 1) + if (NumSubRegs > 1 && i == 0) MIB.addReg(SuperReg, RegState::ImplicitDefine); } else { if (OnlyToVGPR) @@ -1599,3 +1602,57 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, llvm_unreachable("not implemented"); } } + +// Find reaching register definition +MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg, + MachineInstr &Use, + MachineRegisterInfo &MRI, + LiveIntervals *LIS) const { + auto &MDT = LIS->getAnalysis(); + SlotIndex UseIdx = LIS->getInstructionIndex(Use); + SlotIndex DefIdx; + + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!LIS->hasInterval(Reg)) + return nullptr; + LiveInterval &LI = LIS->getInterval(Reg); + LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) + : MRI.getMaxLaneMaskForVReg(Reg); + VNInfo *V = nullptr; + if (LI.hasSubRanges()) { + for (auto &S : LI.subranges()) { + if ((S.LaneMask & SubLanes) == SubLanes) { + V = S.getVNInfoAt(UseIdx); + break; + } + } + } else { + V = LI.getVNInfoAt(UseIdx); + } + if (!V) + return nullptr; + DefIdx = V->def; + } else { + // Find last def. + for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) { + LiveRange &LR = LIS->getRegUnit(*Units); + if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { + if (!DefIdx.isValid() || + MDT.dominates(LIS->getInstructionFromIndex(DefIdx), + LIS->getInstructionFromIndex(V->def))) + DefIdx = V->def; + } else { + return nullptr; + } + } + } + + MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); + + if (!Def || !MDT.dominates(Def, &Use)) + return nullptr; + + assert(Def->modifiesRegister(Reg, this)); + + return Def; +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 5a51b67ca719c30c349582e1b201b9e14d91e6f8..b82fefde47e1344d7b21638328793fca2ef2b8b6 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -228,6 +228,12 @@ public: getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override; + // Find reaching register definition + MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg, + MachineInstr &Use, + MachineRegisterInfo &MRI, + LiveIntervals *LIS) const; + private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index f87a0763b353b999943cc42edc0f00af59ca29b2..80bc180ba6eea5a2dbafeb60858a96c8bfbb9594 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -435,7 +435,7 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, let AllocationPriority = 7; } -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 8; } @@ -444,13 +444,13 @@ def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add let isAllocatable = 0; } -def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, +def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 8; } -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 8; @@ -459,15 +459,15 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, // Requires 2 s_mov_b64 to copy let CopyCost = 2 in { -def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> { +def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v16i8, v2i64], 32, (add SGPR_128Regs)> { let AllocationPriority = 10; } -def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> { +def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v16i8, v2i64], 32, (add TTMP_128Regs)> { let isAllocatable = 0; } -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64, v2f64], 32, +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v16i8, v2i64, v2f64], 32, (add SGPR_128, TTMP_128)> { let AllocationPriority = 10; } diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 015773b110420505693912faa7d137630926ca3f..6ad7dd0e3a7ce9cb764b8bfc61f1fd3b45887ecc 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -212,6 +212,82 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { } } +/// Attempt to shink AND/OR/XOR operations requiring non-inlineable literals. +/// For AND or OR, try using S_BITSET{0,1} to clear or set bits. +/// If the inverse of the immediate is legal, use ANDN2, ORN2 or +/// XNOR (as a ^ b == ~(a ^ ~b)). +/// \returns true if the caller should continue the machine function iterator +static bool shrinkScalarLogicOp(const GCNSubtarget &ST, + MachineRegisterInfo &MRI, + const SIInstrInfo *TII, + MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + const MachineOperand *Dest = &MI.getOperand(0); + MachineOperand *Src0 = &MI.getOperand(1); + MachineOperand *Src1 = &MI.getOperand(2); + MachineOperand *SrcReg = Src0; + MachineOperand *SrcImm = Src1; + + if (SrcImm->isImm() && + !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) { + uint32_t Imm = static_cast(SrcImm->getImm()); + uint32_t NewImm = 0; + + if (Opc == AMDGPU::S_AND_B32) { + if (isPowerOf2_32(~Imm)) { + NewImm = countTrailingOnes(Imm); + Opc = AMDGPU::S_BITSET0_B32; + } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + NewImm = ~Imm; + Opc = AMDGPU::S_ANDN2_B32; + } + } else if (Opc == AMDGPU::S_OR_B32) { + if (isPowerOf2_32(Imm)) { + NewImm = countTrailingZeros(Imm); + Opc = AMDGPU::S_BITSET1_B32; + } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + NewImm = ~Imm; + Opc = AMDGPU::S_ORN2_B32; + } + } else if (Opc == AMDGPU::S_XOR_B32) { + if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + NewImm = ~Imm; + Opc = AMDGPU::S_XNOR_B32; + } + } else { + llvm_unreachable("unexpected opcode"); + } + + if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && + SrcImm == Src0) { + if (!TII->commuteInstruction(MI, false, 1, 2)) + NewImm = 0; + } + + if (NewImm != 0) { + if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) && + SrcReg->isReg()) { + MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); + MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); + return true; + } + + if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { + MI.setDesc(TII->get(Opc)); + if (Opc == AMDGPU::S_BITSET0_B32 || + Opc == AMDGPU::S_BITSET1_B32) { + Src0->ChangeToImmediate(NewImm); + MI.RemoveOperand(2); + } else { + SrcImm->setImm(NewImm); + } + } + } + } + + return false; +} + // This is the same as MachineInstr::readsRegister/modifiesRegister except // it takes subregs into account. static bool instAccessReg(iterator_range &&R, @@ -512,6 +588,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } + // Shrink scalar logic operations. + if (MI.getOpcode() == AMDGPU::S_AND_B32 || + MI.getOpcode() == AMDGPU::S_OR_B32 || + MI.getOpcode() == AMDGPU::S_XOR_B32) { + if (shrinkScalarLogicOp(ST, MRI, TII, MI)) + continue; + } + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) continue; diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 8bd7de7269b7ed650f9cae0e5ec0c8470e09aedf..8a063e1a4867328022854a2f8fb4d83add936fae 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -751,6 +751,12 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>; + +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>; } // End let AddedComplexity = 100 let OtherPredicates = [isSICI] in { diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index 5841dcb2b9c68fdb37c92ab40265b25bd492be83..ca5e981ac5c25c1f1f6c77a10d5c4e9d8a206795 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -336,6 +336,12 @@ class SOP2_64_32_32 pattern=[]> : SOP2_Pseudo < "$sdst, $src0, $src1", pattern >; +class UniformUnaryFrag : PatFrag < + (ops node:$src0), + (Op $src0), + [{ return !N->isDivergent(); }] +>; + class UniformBinFrag : PatFrag < (ops node:$src0, node:$src1), (Op $src0, $src1), @@ -421,16 +427,39 @@ def S_XNOR_B32 : SOP2_32 <"s_xnor_b32", def S_XNOR_B64 : SOP2_64 <"s_xnor_b64", [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))] >; + +def S_NAND_B32 : SOP2_32 <"s_nand_b32", + [(set i32:$sdst, (not (and_oneuse i32:$src0, i32:$src1)))] +>; + +def S_NAND_B64 : SOP2_64 <"s_nand_b64", + [(set i64:$sdst, (not (and_oneuse i64:$src0, i64:$src1)))] +>; + +def S_NOR_B32 : SOP2_32 <"s_nor_b32", + [(set i32:$sdst, (not (or_oneuse i32:$src0, i32:$src1)))] +>; + +def S_NOR_B64 : SOP2_64 <"s_nor_b64", + [(set i64:$sdst, (not (or_oneuse i64:$src0, i64:$src1)))] +>; } // End isCommutable = 1 -def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">; -def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">; -def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">; -def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">; -def S_NAND_B32 : SOP2_32 <"s_nand_b32">; -def S_NAND_B64 : SOP2_64 <"s_nand_b64">; -def S_NOR_B32 : SOP2_32 <"s_nor_b32">; -def S_NOR_B64 : SOP2_64 <"s_nor_b64">; +def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32", + [(set i32:$sdst, (UniformBinFrag i32:$src0, (UniformUnaryFrag i32:$src1)))] +>; + +def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64", + [(set i64:$sdst, (UniformBinFrag i64:$src0, (UniformUnaryFrag i64:$src1)))] +>; + +def S_ORN2_B32 : SOP2_32 <"s_orn2_b32", + [(set i32:$sdst, (UniformBinFrag i32:$src0, (UniformUnaryFrag i32:$src1)))] +>; + +def S_ORN2_B64 : SOP2_64 <"s_orn2_b64", + [(set i64:$sdst, (UniformBinFrag i64:$src0, (UniformUnaryFrag i64:$src1)))] +>; } // End Defs = [SCC] // Use added complexity so these patterns are preferred to the VALU patterns. diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 634ec8fcc3d11ad657327d32445e3a89f4aef55f..54c866bdc63ce060b036a09d86f226cbc02a0bf6 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -128,6 +128,49 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) { return NewInfo ? NewInfo->Opcode : -1; } +struct MUBUFInfo { + uint16_t Opcode; + uint16_t BaseOpcode; + uint8_t dwords; + bool has_vaddr; + bool has_srsrc; + bool has_soffset; +}; + +#define GET_MUBUFInfoTable_DECL +#define GET_MUBUFInfoTable_IMPL +#include "AMDGPUGenSearchableTables.inc" + +int getMUBUFBaseOpcode(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc); + return Info ? Info->BaseOpcode : -1; +} + +int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) { + const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords); + return Info ? Info->Opcode : -1; +} + +int getMUBUFDwords(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); + return Info ? Info->dwords : 0; +} + +bool getMUBUFHasVAddr(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); + return Info ? Info->has_vaddr : false; +} + +bool getMUBUFHasSrsrc(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); + return Info ? Info->has_srsrc : false; +} + +bool getMUBUFHasSoffset(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); + return Info ? Info->has_soffset : false; +} + // Wrapper for Tablegen'd function. enum Subtarget is not defined in any // header files, so we need to wrap it in a function that takes unsigned // instead. @@ -159,7 +202,8 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) { } bool hasCodeObjectV3(const MCSubtargetInfo *STI) { - return STI->getFeatureBits().test(FeatureCodeObjectV3); + return STI->getTargetTriple().getOS() == Triple::AMDHSA && + STI->getFeatureBits().test(FeatureCodeObjectV3); } unsigned getWavefrontSize(const MCSubtargetInfo *STI) { @@ -521,6 +565,14 @@ void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, Lgkmcnt = decodeLgkmcnt(Version, Waitcnt); } +Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) { + Waitcnt Decoded; + Decoded.VmCnt = decodeVmcnt(Version, Encoded); + Decoded.ExpCnt = decodeExpcnt(Version, Encoded); + Decoded.LgkmCnt = decodeLgkmcnt(Version, Encoded); + return Decoded; +} + unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Vmcnt) { Waitcnt = @@ -551,6 +603,10 @@ unsigned encodeWaitcnt(const IsaVersion &Version, return Waitcnt; } +unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) { + return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt); +} + unsigned getInitialPSInputAddr(const Function &F) { return getIntegerAttribute(F, "InitialPSInputAddr", 0); } @@ -754,6 +810,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::VS_64RegClassID: case AMDGPU::SReg_64RegClassID: case AMDGPU::VReg_64RegClassID: + case AMDGPU::SReg_64_XEXECRegClassID: return 64; case AMDGPU::VReg_96RegClassID: return 96; @@ -894,9 +951,12 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { // Given Imm, split it into the values to put into the SOffset and ImmOffset // fields in an MUBUF instruction. Return false if it is not possible (due to a // hardware bug needing a workaround). +// +// The required alignment ensures that individual address components remain +// aligned if they are aligned to begin with. It also ensures that additional +// offsets within the given alignment can be added to the resulting ImmOffset. bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, - const GCNSubtarget *Subtarget) { - const uint32_t Align = 4; + const GCNSubtarget *Subtarget, uint32_t Align) { const uint32_t MaxImm = alignDown(4095, Align); uint32_t Overflow = 0; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index d45f42498692ede8e565d1c8c63fc540493ba473..1ef7da328d31a85658b0a4ee3df9f613c7a34106 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -220,6 +220,24 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, LLVM_READONLY int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels); +LLVM_READONLY +int getMUBUFBaseOpcode(unsigned Opc); + +LLVM_READONLY +int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords); + +LLVM_READONLY +int getMUBUFDwords(unsigned Opc); + +LLVM_READONLY +bool getMUBUFHasVAddr(unsigned Opc); + +LLVM_READONLY +bool getMUBUFHasSrsrc(unsigned Opc); + +LLVM_READONLY +bool getMUBUFHasSoffset(unsigned Opc); + LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); @@ -258,6 +276,32 @@ std::pair getIntegerPairAttribute(const Function &F, std::pair Default, bool OnlyFirstRequired = false); +/// Represents the counter values to wait for in an s_waitcnt instruction. +/// +/// Large values (including the maximum possible integer) can be used to +/// represent "don't care" waits. +struct Waitcnt { + unsigned VmCnt = ~0u; + unsigned ExpCnt = ~0u; + unsigned LgkmCnt = ~0u; + + Waitcnt() {} + Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt) + : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt) {} + + static Waitcnt allZero() { return Waitcnt(0, 0, 0); } + + bool dominates(const Waitcnt &Other) const { + return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt && + LgkmCnt <= Other.LgkmCnt; + } + + Waitcnt combined(const Waitcnt &Other) const { + return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt), + std::min(LgkmCnt, Other.LgkmCnt)); + } +}; + /// \returns Vmcnt bit mask for given isa \p Version. unsigned getVmcntBitMask(const IsaVersion &Version); @@ -291,6 +335,8 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt); void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); +Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded); + /// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version. unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Vmcnt); @@ -318,6 +364,8 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt); +unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded); + unsigned getInitialPSInputAddr(const Function &F); LLVM_READNONE @@ -441,11 +489,8 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); /// not the encoded offset. bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); -// Given Imm, split it into the values to put into the SOffset and ImmOffset -// fields in an MUBUF instruction. Return false if it is not possible (due to a -// hardware bug needing a workaround). bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, - const GCNSubtarget *Subtarget); + const GCNSubtarget *Subtarget, uint32_t Align = 4); /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 9f4673cf7ab55e88c3e76f606ff3457c573ff37a..68446ab79720af70c856845bcefaffd1f19ca642 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -84,6 +84,10 @@ class VOP1_SDWA_Pseudo pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP1"; } +class VOP1_DPP_Pseudo pattern=[]> : + VOP_DPP_Pseudo { +} + class getVOP1Pat64 : LetDummies { list ret = !if(P.HasModifiers, @@ -103,6 +107,8 @@ multiclass VOP1Inst ; def _e64 : VOP3_Pseudo .ret>; def _sdwa : VOP1_SDWA_Pseudo ; + foreach _ = BoolToList.ret in + def _dpp : VOP1_DPP_Pseudo ; } // Special profile for instructions which have clamp @@ -173,7 +179,9 @@ defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>; defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>; defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>; defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>; +let FPDPRounding = 1 in { defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>; +} // End FPDPRounding = 1 defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>; defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; @@ -226,7 +234,9 @@ defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>; let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>; defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>; +let FPDPRounding = 1 in { defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>; +} // End FPDPRounding = 1 } // End SchedRW = [WriteDoubleAdd] defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>; @@ -333,8 +343,10 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>; let SubtargetPredicate = Has16BitInsts in { +let FPDPRounding = 1 in { defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>; defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; +} // End FPDPRounding = 1 defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; let SchedRW = [WriteQuarterRate32] in { @@ -352,7 +364,9 @@ defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>; defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>; defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>; defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>; +let FPDPRounding = 1 in { defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; +} // End FPDPRounding = 1 } @@ -500,13 +514,8 @@ defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>; // VI //===----------------------------------------------------------------------===// -class VOP1_DPP op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> : - VOP_DPP { - let Defs = ps.Defs; - let Uses = ps.Uses; - let SchedRW = ps.SchedRW; - let hasSideEffects = ps.hasSideEffects; - +class VOP1_DPPe op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : + VOP_DPPe

{ bits<8> vdst; let Inst{8-0} = 0xfa; // dpp let Inst{16-9} = op; @@ -544,9 +553,10 @@ multiclass VOP1_Real_vi op> { VOP_SDWA9_Real (NAME#"_sdwa")>, VOP1_SDWA9Ae (NAME#"_sdwa").Pfl>; - // For now left dpp only for asm/dasm - // TODO: add corresponding pseudo - def _dpp : VOP1_DPP(NAME#"_e32")>; + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_vi : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.VI>, + VOP1_DPPe(NAME#"_dpp")>; } defm V_NOP : VOP1_Real_vi <0x0>; @@ -717,9 +727,11 @@ multiclass VOP1_Real_gfx9 op> { VOP_SDWA9_Real (NAME#"_sdwa")>, VOP1_SDWA9Ae (NAME#"_sdwa").Pfl>; - // For now left dpp only for asm/dasm - // TODO: add corresponding pseudo - def _dpp : VOP1_DPP(NAME#"_e32")>; + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx9 : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe(NAME#"_dpp")>; + } defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index db031be7e558ed9d3b46ae5a7f9be9aaf5ed213a..e3fd7b5f9fadd9f645fabeab78b30a3d749f9e6d 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -105,6 +105,11 @@ class VOP2_SDWA_Pseudo pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP2"; } +class VOP2_DPP_Pseudo pattern=[]> : + VOP_DPP_Pseudo { +} + + class getVOP2Pat64 : LetDummies { list ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, @@ -155,7 +160,12 @@ multiclass VOP2Inst : VOP2Inst_e32, VOP2Inst_e64, - VOP2Inst_sdwa; + VOP2Inst_sdwa { + let renamedInGFX9 = GFX9Renamed in { + foreach _ = BoolToList.ret in + def _dpp : VOP2_DPP_Pseudo ; + } +} multiclass VOP2bInst { let AsmMatchConverter = "cvtSdwaVOP2b"; } + foreach _ = BoolToList.ret in + def _dpp : VOP2_DPP_Pseudo ; } def _e64 : VOP3_Pseudo .ret>, @@ -194,6 +206,9 @@ multiclass VOP2eInst { let AsmMatchConverter = "cvtSdwaVOP2b"; } + + foreach _ = BoolToList.ret in + def _dpp : VOP2_DPP_Pseudo ; } def _e64 : VOP3_Pseudo .ret>, @@ -233,9 +248,9 @@ class VOP_MAC : VOPProfile <[vt, vt, vt, vt]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64, 3, 0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; - let InsDPP = (ins DstRCDPP:$old, - Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, + VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); @@ -540,18 +555,23 @@ def : divergent_i64_BinOp ; let SubtargetPredicate = Has16BitInsts in { +let FPDPRounding = 1 in { def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; +defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; +} // End FPDPRounding = 1 + defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>; -defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; let isCommutable = 1 in { +let FPDPRounding = 1 in { defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; +} // End FPDPRounding = 1 defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; @@ -778,13 +798,8 @@ defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>; // VI //===----------------------------------------------------------------------===// -class VOP2_DPP op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> : - VOP_DPP { - let Defs = ps.Defs; - let Uses = ps.Uses; - let SchedRW = ps.SchedRW; - let hasSideEffects = ps.hasSideEffects; - +class VOP2_DPPe op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : + VOP_DPPe

{ bits<8> vdst; bits<8> src1; let Inst{8-0} = 0xfa; //dpp @@ -865,8 +880,13 @@ multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName VOP2_SDWA_Pseudo ps = !cast(OpName#"_sdwa"); let AsmString = AsmName # ps.AsmOperands; } - def _dpp : - VOP2_DPP(OpName#"_e32"), AsmName>; + foreach _ = BoolToList(OpName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_vi : + VOP_DPP_Real(OpName#"_dpp"), SIEncodingFamily.VI>, + VOP2_DPPe(OpName#"_dpp")> { + VOP2_DPP_Pseudo ps = !cast(OpName#"_dpp"); + let AsmString = AsmName # ps.AsmOperands; + } } } @@ -893,10 +913,14 @@ multiclass VOP2be_Real_e32e64_gfx9 op, string OpName, string AsmName> { VOP2_SDWA_Pseudo ps = !cast(OpName#"_sdwa"); let AsmString = AsmName # ps.AsmOperands; } - def _dpp_gfx9 : - VOP2_DPP(OpName#"_e32"), AsmName> { - let DecoderNamespace = "SDWA9"; - } + foreach _ = BoolToList(OpName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx9 : + VOP_DPP_Real(OpName#"_dpp"), SIEncodingFamily.GFX9>, + VOP2_DPPe(OpName#"_dpp")> { + VOP2_DPP_Pseudo ps = !cast(OpName#"_dpp"); + let AsmString = AsmName # ps.AsmOperands; + let DecoderNamespace = "SDWA9"; + } } multiclass VOP2_Real_e32e64_gfx9 op> { @@ -914,19 +938,23 @@ multiclass VOP2_Real_e32e64_gfx9 op> { VOP_SDWA9_Real (NAME#"_sdwa")>, VOP2_SDWA9Ae (NAME#"_sdwa").Pfl> { } - def _dpp_gfx9 : - VOP2_DPP(NAME#"_e32")> { - let DecoderNamespace = "SDWA9"; - } + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx9 : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP2_DPPe(NAME#"_dpp")> { + let DecoderNamespace = "SDWA9"; + } } } // AssemblerPredicates = [isGFX9] multiclass VOP2_Real_e32e64_vi op> : Base_VOP2_Real_e32e64_vi, VOP2_SDWA_Real, VOP2_SDWA9_Real { - // For now left dpp only for asm/dasm - // TODO: add corresponding pseudo - def _dpp : VOP2_DPP(NAME#"_e32")>; + + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_vi : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.VI>, + VOP2_DPPe(NAME#"_dpp")>; } defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 51bee3efeb2c4b923d87cd324aece67117c47672..4b8c1f208a0eda54003b5fb238f2c55e92279994 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -220,7 +220,8 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { // VOP3 INTERP //===----------------------------------------------------------------------===// -class VOP3Interp : VOP3_Pseudo { +class VOP3Interp pattern = []> : + VOP3_Pseudo { let AsmMatchConverter = "cvtVOP3Interp"; } @@ -292,9 +293,11 @@ def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile, fma>; def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile, int_amdgcn_lerp>; let SchedRW = [WriteDoubleAdd] in { +let FPDPRounding = 1 in { def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile, fma>; def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile, fadd, 1>; def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile, fmul, 1>; +} // End FPDPRounding = 1 def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile, fminnum_like, 1>; def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum_like, 1>; } // End SchedRW = [WriteDoubleAdd] @@ -324,6 +327,7 @@ def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, getVOP3VCC.ret> { let SchedRW = [WriteDouble]; + let FPDPRounding = 1; } } // End Uses = [VCC, EXEC] @@ -354,10 +358,10 @@ def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile, int_amdgcn_cvt_pk_u8_f32>; def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile, AMDGPUdiv_fixup>; -let SchedRW = [WriteDoubleAdd] in { +let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in { def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile, AMDGPUdiv_fixup>; def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile, AMDGPUldexp, 1>; -} // End SchedRW = [WriteDoubleAdd] +} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> { let SchedRW = [WriteFloatFMA, WriteSALU]; @@ -368,6 +372,7 @@ def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> { let SchedRW = [WriteDouble, WriteSALU]; let AsmMatchConverter = ""; + let FPDPRounding = 1; } def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile>; @@ -431,33 +436,51 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile, AMDGPUdiv_fixup> { let Predicates = [Has16BitInsts, isVIOnly]; + let FPDPRounding = 1; } def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile, AMDGPUdiv_fixup> { let renamedInGFX9 = 1; let Predicates = [Has16BitInsts, isGFX9]; + let FPDPRounding = 1; +} + +def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, fma> { + let Predicates = [Has16BitInsts, isVIOnly]; + let FPDPRounding = 1; +} +def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile, fma> { + let renamedInGFX9 = 1; + let Predicates = [Has16BitInsts, isGFX9]; + let FPDPRounding = 1; } let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in { let renamedInGFX9 = 1 in { -def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile, fmad>; def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; -def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, fma>; +let FPDPRounding = 1 in { +def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile, fmad>; +let Uses = [M0, EXEC] in { def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>; -} +} // End Uses = [M0, EXEC] +} // End FPDPRounding = 1 +} // End renamedInGFX9 = 1 let SubtargetPredicate = isGFX9 in { -def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile>; +def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile> { + let FPDPRounding = 1; +} def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile>; def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile>; -def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile>; def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; } // End SubtargetPredicate = isGFX9 +let Uses = [M0, EXEC], FPDPRounding = 1 in { def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>; def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; +} // End Uses = [M0, EXEC], FPDPRounding = 1 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 @@ -485,6 +508,37 @@ defm: Ternary_i16_Pats; } // End Predicates = [Has16BitInsts] +class ThreeOpFrag : PatFrag< + (ops node:$x, node:$y, node:$z), + // When the inner operation is used multiple times, selecting 3-op + // instructions may still be beneficial -- if the other users can be + // combined similarly. Let's be conservative for now. + (op2 (HasOneUseBinOp node:$x, node:$y), node:$z), + [{ + // Only use VALU ops when the result is divergent. + if (!N->isDivergent()) + return false; + + // Check constant bus limitations. + // + // Note: Use !isDivergent as a conservative proxy for whether the value + // is in an SGPR (uniform values can end up in VGPRs as well). + unsigned ConstantBusUses = 0; + for (unsigned i = 0; i < 3; ++i) { + if (!Operands[i]->isDivergent() && + !isInlineImmediate(Operands[i].getNode())) { + ConstantBusUses++; + if (ConstantBusUses >= 2) + return false; + } + } + + return true; + }] +> { + let PredicateCodeUsesOperands = 1; +} + let SubtargetPredicate = isGFX9 in { def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile>; def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile>; @@ -519,6 +573,22 @@ def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile>; def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile>; + + +class ThreeOp_i32_Pats : GCNPat < + // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions. + (ThreeOpFrag i32:$src0, i32:$src1, i32:$src2), + (inst i32:$src0, i32:$src1, i32:$src2) +>; + +def : ThreeOp_i32_Pats; +def : ThreeOp_i32_Pats; +def : ThreeOp_i32_Pats; +def : ThreeOp_i32_Pats; +def : ThreeOp_i32_Pats; +def : ThreeOp_i32_Pats; +def : ThreeOp_i32_Pats; + } // End SubtargetPredicate = isGFX9 //===----------------------------------------------------------------------===// @@ -792,12 +862,15 @@ defm V_FMA_F16 : VOP3_F16_Real_vi <0x1ee>; defm V_DIV_FIXUP_F16 : VOP3_F16_Real_vi <0x1ef>; defm V_INTERP_P2_F16 : VOP3Interp_F16_Real_vi <0x276>; +let FPDPRounding = 1 in { defm V_MAD_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ea, "V_MAD_F16", "v_mad_legacy_f16">; -defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; -defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; defm V_FMA_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ee, "V_FMA_F16", "v_fma_legacy_f16">; defm V_DIV_FIXUP_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ef, "V_DIV_FIXUP_F16", "v_div_fixup_legacy_f16">; defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16", "v_interp_p2_legacy_f16">; +} // End FPDPRounding = 1 + +defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; +defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">; defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index 2efd28b9cd8bc1b0c76ca85253a4adff3abaa79f..0d25a86da3266480619df72c85e6d6f0610ca7da 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -42,12 +42,14 @@ class VOP3_VOP3PInst, fma>; def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile>; def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile>; +let FPDPRounding = 1 in { +def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile, fma>; def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile, fadd>; def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile, fmul>; +} // End FPDPRounding = 1 def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile, fmaxnum_like>; def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile, fminnum_like>; @@ -137,12 +139,14 @@ let SubtargetPredicate = HasMadMixInsts in { let isCommutable = 1 in { def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile>; +let FPDPRounding = 1 in { // Clamp modifier is applied after conversion to f16. def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile, 1>; let ClampLo = 0, ClampHi = 1 in { def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile, 1>; } +} // End FPDPRounding = 1 } defm : MadFmaMixPats; @@ -154,12 +158,14 @@ let SubtargetPredicate = HasFmaMixInsts in { let isCommutable = 1 in { def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile>; +let FPDPRounding = 1 in { // Clamp modifier is applied after conversion to f16. def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile, 1>; let ClampLo = 0, ClampHi = 1 in { def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile, 1>; } +} // End FPDPRounding = 1 } defm : MadFmaMixPats; diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index e177b2fd081f5a7a37e8101528a34b179243b7da..7de7d90d27b3add0bb70891242e856098492d38a 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -505,9 +505,14 @@ class VOP_DPPe : Enc64 { let Inst{63-60} = row_mask; } -class VOP_DPP : - InstSI , - VOP_DPPe

{ +class VOP_DPP_Pseudo pattern=[]> : + InstSI , + VOP , + SIMCInstr , + MnemonicAlias { + + let isPseudo = 1; + let isCodeGenOnly = 1; let mayLoad = 0; let mayStore = 0; @@ -517,6 +522,11 @@ class VOP_DPP : let VALU = 1; let DPP = 1; let Size = 8; + let Uses = [EXEC]; + let isConvergent = 1; + + string Mnemonic = OpName; + string AsmOperands = P.AsmDPP; let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", ""); let SubtargetPredicate = HasDPP; @@ -526,6 +536,36 @@ class VOP_DPP : let Constraints = !if(P.NumSrcArgs, "$old = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, "$old", ""); let DecoderNamespace = "DPP"; + + VOPProfile Pfl = P; +} + +class VOP_DPP_Real : + InstSI , + SIMCInstr { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Defs = ps.Defs; + let Uses = ps.Uses; + let SchedRW = ps.SchedRW; + let hasSideEffects = ps.hasSideEffects; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + // Copy relevant pseudo op flags + let isConvergent = ps.isConvergent; + let SubtargetPredicate = ps.SubtargetPredicate; + let AssemblerPredicate = ps.AssemblerPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let AsmVariantName = ps.AsmVariantName; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let DecoderNamespace = ps.DecoderNamespace; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; } class getNumNodeArgs { diff --git a/lib/Target/ARC/ARCTargetMachine.cpp b/lib/Target/ARC/ARCTargetMachine.cpp index 1acae3a88870f6f3a15406a57a28d550ac891292..6f5bbd3b4ef3690584a115112dbcf80821d73e1f 100644 --- a/lib/Target/ARC/ARCTargetMachine.cpp +++ b/lib/Target/ARC/ARCTargetMachine.cpp @@ -26,12 +26,6 @@ static Reloc::Model getRelocModel(Optional RM) { return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - /// ARCTargetMachine ctor - Create an ILP32 architecture model ARCTargetMachine::ARCTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -43,7 +37,7 @@ ARCTargetMachine::ARCTargetMachine(const Target &T, const Triple &TT, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-" "f32:32:32-i64:32-f64:32-a:0:32-n32", TT, CPU, FS, Options, getRelocModel(RM), - getEffectiveCodeModel(CM), OL), + getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index deab039772c0bff4fe7d36b0173de47651fddbc4..8e80c32bcf89183425809fe85ac320f4255535d3 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -429,7 +429,7 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, auto &TLI = *getTLI(); auto Subtarget = TLI.getSubtarget(); - if (Subtarget->isThumb()) + if (Subtarget->isThumb1Only()) return false; // Quick exit if there aren't any args @@ -500,6 +500,22 @@ struct CallReturnHandler : public IncomingValueHandler { MachineInstrBuilder MIB; }; +// FIXME: This should move to the ARMSubtarget when it supports all the opcodes. +unsigned getCallOpcode(const ARMSubtarget &STI, bool isDirect) { + if (isDirect) + return STI.isThumb() ? ARM::tBL : ARM::BL; + + if (STI.isThumb()) + return ARM::tBLXr; + + if (STI.hasV5TOps()) + return ARM::BLX; + + if (STI.hasV4TOps()) + return ARM::BX_CALL; + + return ARM::BMOVPCRX_CALL; +} } // end anonymous namespace bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, @@ -517,27 +533,34 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (STI.genLongCalls()) return false; + if (STI.isThumb1Only()) + return false; + auto CallSeqStart = MIRBuilder.buildInstr(ARM::ADJCALLSTACKDOWN); // Create the call instruction so we can add the implicit uses of arg // registers, but don't insert it yet. bool isDirect = !Callee.isReg(); - auto CallOpcode = - isDirect ? ARM::BL - : STI.hasV5TOps() - ? ARM::BLX - : STI.hasV4TOps() ? ARM::BX_CALL : ARM::BMOVPCRX_CALL; - auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode) - .add(Callee) - .addRegMask(TRI->getCallPreservedMask(MF, CallConv)); - if (Callee.isReg()) { + auto CallOpcode = getCallOpcode(STI, isDirect); + auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode); + + bool isThumb = STI.isThumb(); + if (isThumb) + MIB.add(predOps(ARMCC::AL)); + + MIB.add(Callee); + if (!isDirect) { auto CalleeReg = Callee.getReg(); - if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) - MIB->getOperand(0).setReg(constrainOperandRegClass( + if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) { + unsigned CalleeIdx = isThumb ? 2 : 0; + MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass( MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(), - *MIB.getInstr(), MIB->getDesc(), Callee, 0)); + *MIB.getInstr(), MIB->getDesc(), Callee, CalleeIdx)); + } } + MIB.addRegMask(TRI->getCallPreservedMask(MF, CallConv)); + SmallVector ArgInfos; for (auto Arg : OrigArgs) { if (!isSupportedType(DL, TLI, Arg.Ty)) diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp index 0bd1f9ca63918d253fe44f11c67198da294ad357..b631c2bc687b05cd3e63f3a94dbf31489242598d 100644 --- a/lib/Target/ARM/ARMCodeGenPrepare.cpp +++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp @@ -109,24 +109,25 @@ EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false), namespace { class IRPromoter { SmallPtrSet NewInsts; - SmallVector InstsToRemove; - DenseMap TruncTysMap; + SmallPtrSet InstsToRemove; + DenseMap> TruncTysMap; SmallPtrSet Promoted; Module *M = nullptr; LLVMContext &Ctx; IntegerType *ExtTy = nullptr; IntegerType *OrigTy = nullptr; - - void PrepareConstants(SmallPtrSetImpl &Visited, - SmallPtrSetImpl &SafeToPromote); - void ExtendSources(SmallPtrSetImpl &Sources); - void PromoteTree(SmallPtrSetImpl &Visited, - SmallPtrSetImpl &Sources, - SmallPtrSetImpl &Sinks, - SmallPtrSetImpl &SafeToPromote); - void TruncateSinks(SmallPtrSetImpl &Sources, - SmallPtrSetImpl &Sinks); - void Cleanup(SmallPtrSetImpl &Sinks); + SmallPtrSetImpl *Visited; + SmallPtrSetImpl *Sources; + SmallPtrSetImpl *Sinks; + SmallPtrSetImpl *SafeToPromote; + + void ReplaceAllUsersOfWith(Value *From, Value *To); + void PrepareConstants(void); + void ExtendSources(void); + void ConvertTruncs(void); + void PromoteTree(void); + void TruncateSinks(void); + void Cleanup(void); public: IRPromoter(Module *M) : M(M), Ctx(M->getContext()), @@ -180,6 +181,22 @@ static bool generateSignBits(Value *V) { Opc == Instruction::SRem; } +static bool EqualTypeSize(Value *V) { + return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize; +} + +static bool LessOrEqualTypeSize(Value *V) { + return V->getType()->getScalarSizeInBits() <= ARMCodeGenPrepare::TypeSize; +} + +static bool GreaterThanTypeSize(Value *V) { + return V->getType()->getScalarSizeInBits() > ARMCodeGenPrepare::TypeSize; +} + +static bool LessThanTypeSize(Value *V) { + return V->getType()->getScalarSizeInBits() < ARMCodeGenPrepare::TypeSize; +} + /// Some instructions can use 8- and 16-bit operands, and we don't need to /// promote anything larger. We disallow booleans to make life easier when /// dealing with icmps but allow any other integer that is <= 16 bits. Void @@ -194,15 +211,15 @@ static bool isSupportedType(Value *V) { if (auto *Ld = dyn_cast(V)) Ty = cast(Ld->getPointerOperandType())->getElementType(); - const IntegerType *IntTy = dyn_cast(Ty); - if (!IntTy) + if (!isa(Ty) || + cast(V->getType())->getBitWidth() == 1) return false; - return IntTy->getBitWidth() == ARMCodeGenPrepare::TypeSize; + return LessOrEqualTypeSize(V); } /// Return true if the given value is a source in the use-def chain, producing -/// a narrow (i8, i16) value. These values will be zext to start the promotion +/// a narrow 'TypeSize' value. These values will be zext to start the promotion /// of the tree to i32. We guarantee that these won't populate the upper bits /// of the register. ZExt on the loads will be free, and the same for call /// return values because we only accept ones that guarantee a zeroext ret val. @@ -211,6 +228,7 @@ static bool isSupportedType(Value *V) { static bool isSource(Value *V) { if (!isa(V->getType())) return false; + // TODO Allow zext to be sources. if (isa(V)) return true; @@ -221,7 +239,7 @@ static bool isSource(Value *V) { else if (auto *Call = dyn_cast(V)) return Call->hasRetAttr(Attribute::AttrKind::ZExt); else if (auto *Trunc = dyn_cast(V)) - return isSupportedType(Trunc); + return EqualTypeSize(Trunc); return false; } @@ -232,20 +250,23 @@ static bool isSink(Value *V) { // TODO The truncate also isn't actually necessary because we would already // proved that the data value is kept within the range of the original data // type. - auto UsesNarrowValue = [](Value *V) { - return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize; - }; + // Sinks are: + // - points where the value in the register is being observed, such as an + // icmp, switch or store. + // - points where value types have to match, such as calls and returns. + // - zext are included to ease the transformation and are generally removed + // later on. if (auto *Store = dyn_cast(V)) - return UsesNarrowValue(Store->getValueOperand()); + return LessOrEqualTypeSize(Store->getValueOperand()); if (auto *Return = dyn_cast(V)) - return UsesNarrowValue(Return->getReturnValue()); - if (auto *Trunc = dyn_cast(V)) - return UsesNarrowValue(Trunc->getOperand(0)); + return LessOrEqualTypeSize(Return->getReturnValue()); if (auto *ZExt = dyn_cast(V)) - return UsesNarrowValue(ZExt->getOperand(0)); + return GreaterThanTypeSize(ZExt); + if (auto *Switch = dyn_cast(V)) + return LessThanTypeSize(Switch->getCondition()); if (auto *ICmp = dyn_cast(V)) - return ICmp->isSigned(); + return ICmp->isSigned() || LessThanTypeSize(ICmp->getOperand(0)); return isa(V); } @@ -416,23 +437,32 @@ static Intrinsic::ID getNarrowIntrinsic(Instruction *I) { llvm_unreachable("unhandled opcode for narrow intrinsic"); } -static void ReplaceAllUsersOfWith(Value *From, Value *To) { +void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) { SmallVector Users; Instruction *InstTo = dyn_cast(To); + bool ReplacedAll = true; + + LLVM_DEBUG(dbgs() << "ARM CGP: Replacing " << *From << " with " << *To + << "\n"); + for (Use &U : From->uses()) { auto *User = cast(U.getUser()); - if (InstTo && User->isIdenticalTo(InstTo)) + if (InstTo && User->isIdenticalTo(InstTo)) { + ReplacedAll = false; continue; + } Users.push_back(User); } for (auto *U : Users) U->replaceUsesOfWith(From, To); + + if (ReplacedAll) + if (auto *I = dyn_cast(From)) + InstsToRemove.insert(I); } -void -IRPromoter::PrepareConstants(SmallPtrSetImpl &Visited, - SmallPtrSetImpl &SafeToPromote) { +void IRPromoter::PrepareConstants() { IRBuilder<> Builder{Ctx}; // First step is to prepare the instructions for mutation. Most constants // just need to be zero extended into their new type, but complications arise @@ -453,12 +483,12 @@ IRPromoter::PrepareConstants(SmallPtrSetImpl &Visited, // immediate as operand 1, we create an equivalent instruction using a // positive immediate. That positive immediate can then be zext along with // all the other immediates later. - for (auto *V : Visited) { + for (auto *V : *Visited) { if (!isa(V)) continue; auto *I = cast(V); - if (SafeToPromote.count(I)) { + if (SafeToPromote->count(I)) { if (!isa(I)) continue; @@ -483,16 +513,16 @@ IRPromoter::PrepareConstants(SmallPtrSetImpl &Visited, NewInst->copyIRFlags(I); NewInsts.insert(NewInst); } - InstsToRemove.push_back(I); + InstsToRemove.insert(I); I->replaceAllUsesWith(NewVal); } } } for (auto *I : NewInsts) - Visited.insert(I); + Visited->insert(I); } -void IRPromoter::ExtendSources(SmallPtrSetImpl &Sources) { +void IRPromoter::ExtendSources() { IRBuilder<> Builder{Ctx}; auto InsertZExt = [&](Value *V, Instruction *InsertPt) { @@ -510,13 +540,13 @@ void IRPromoter::ExtendSources(SmallPtrSetImpl &Sources) { I->moveAfter(InsertPt); NewInsts.insert(I); } + ReplaceAllUsersOfWith(V, ZExt); - TruncTysMap[ZExt] = TruncTysMap[V]; }; // Now, insert extending instructions between the sources and their users. LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n"); - for (auto V : Sources) { + for (auto V : *Sources) { LLVM_DEBUG(dbgs() << " - " << *V << "\n"); if (auto *I = dyn_cast(V)) InsertZExt(I, I); @@ -530,22 +560,19 @@ void IRPromoter::ExtendSources(SmallPtrSetImpl &Sources) { } } -void IRPromoter::PromoteTree(SmallPtrSetImpl &Visited, - SmallPtrSetImpl &Sources, - SmallPtrSetImpl &Sinks, - SmallPtrSetImpl &SafeToPromote) { +void IRPromoter::PromoteTree() { LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n"); IRBuilder<> Builder{Ctx}; // Mutate the types of the instructions within the tree. Here we handle // constant operands. - for (auto *V : Visited) { - if (Sources.count(V)) + for (auto *V : *Visited) { + if (Sources->count(V)) continue; auto *I = cast(V); - if (Sinks.count(I)) + if (Sinks->count(I)) continue; for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) { @@ -568,15 +595,15 @@ void IRPromoter::PromoteTree(SmallPtrSetImpl &Visited, // Finally, any instructions that should be promoted but haven't yet been, // need to be handled using intrinsics. - for (auto *V : Visited) { + for (auto *V : *Visited) { auto *I = dyn_cast(V); if (!I) continue; - if (Sources.count(I) || Sinks.count(I)) + if (Sources->count(I) || Sinks->count(I)) continue; - if (!shouldPromote(I) || SafeToPromote.count(I) || NewInsts.count(I)) + if (!shouldPromote(I) || SafeToPromote->count(I) || NewInsts.count(I)) continue; assert(EnableDSP && "DSP intrinisc insertion not enabled!"); @@ -590,29 +617,21 @@ void IRPromoter::PromoteTree(SmallPtrSetImpl &Visited, Builder.SetCurrentDebugLocation(I->getDebugLoc()); Value *Args[] = { I->getOperand(0), I->getOperand(1) }; CallInst *Call = Builder.CreateCall(DSPInst, Args); - ReplaceAllUsersOfWith(I, Call); - InstsToRemove.push_back(I); NewInsts.insert(Call); - TruncTysMap[Call] = OrigTy; + ReplaceAllUsersOfWith(I, Call); } } -void IRPromoter::TruncateSinks(SmallPtrSetImpl &Sources, - SmallPtrSetImpl &Sinks) { +void IRPromoter::TruncateSinks() { LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n"); IRBuilder<> Builder{Ctx}; - auto InsertTrunc = [&](Value *V) -> Instruction* { + auto InsertTrunc = [&](Value *V, Type *TruncTy) -> Instruction* { if (!isa(V) || !isa(V->getType())) return nullptr; - if ((!Promoted.count(V) && !NewInsts.count(V)) || !TruncTysMap.count(V) || - Sources.count(V)) - return nullptr; - - Type *TruncTy = TruncTysMap[V]; - if (TruncTy == ExtTy) + if ((!Promoted.count(V) && !NewInsts.count(V)) || Sources->count(V)) return nullptr; LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for " @@ -626,14 +645,15 @@ void IRPromoter::TruncateSinks(SmallPtrSetImpl &Sources, // Fix up any stores or returns that use the results of the promoted // chain. - for (auto I : Sinks) { - LLVM_DEBUG(dbgs() << " - " << *I << "\n"); + for (auto I : *Sinks) { + LLVM_DEBUG(dbgs() << "ARM CGP: For Sink: " << *I << "\n"); // Handle calls separately as we need to iterate over arg operands. if (auto *Call = dyn_cast(I)) { for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) { Value *Arg = Call->getArgOperand(i); - if (Instruction *Trunc = InsertTrunc(Arg)) { + Type *Ty = TruncTysMap[Call][i]; + if (Instruction *Trunc = InsertTrunc(Arg, Ty)) { Trunc->moveBefore(Call); Call->setArgOperand(i, Trunc); } @@ -641,44 +661,53 @@ void IRPromoter::TruncateSinks(SmallPtrSetImpl &Sources, continue; } + // Special case switches because we need to truncate the condition. + if (auto *Switch = dyn_cast(I)) { + Type *Ty = TruncTysMap[Switch][0]; + if (Instruction *Trunc = InsertTrunc(Switch->getCondition(), Ty)) { + Trunc->moveBefore(Switch); + Switch->setCondition(Trunc); + } + continue; + } + // Now handle the others. for (unsigned i = 0; i < I->getNumOperands(); ++i) { - if (Instruction *Trunc = InsertTrunc(I->getOperand(i))) { + Type *Ty = TruncTysMap[I][i]; + if (Instruction *Trunc = InsertTrunc(I->getOperand(i), Ty)) { Trunc->moveBefore(I); I->setOperand(i, Trunc); } } } - } -void IRPromoter::Cleanup(SmallPtrSetImpl &Sinks) { - // Some zext sinks will now have become redundant, along with their trunc - // operands, so remove them. - for (auto I : Sinks) { - if (auto *ZExt = dyn_cast(I)) { - if (ZExt->getDestTy() != ExtTy) - continue; +void IRPromoter::Cleanup() { + // Some zexts will now have become redundant, along with their trunc + // operands, so remove them + for (auto V : *Visited) { + if (!isa(V)) + continue; - Value *Src = ZExt->getOperand(0); - if (ZExt->getSrcTy() == ZExt->getDestTy()) { - LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary zext\n"); - ReplaceAllUsersOfWith(ZExt, Src); - InstsToRemove.push_back(ZExt); - continue; - } + auto ZExt = cast(V); + if (ZExt->getDestTy() != ExtTy) + continue; - // For any truncs that we insert to handle zexts, we can replace the - // result of the zext with the input to the trunc. - if (NewInsts.count(Src) && isa(Src)) { - auto *Trunc = cast(Src); - assert(Trunc->getOperand(0)->getType() == ExtTy && - "expected inserted trunc to be operating on i32"); - LLVM_DEBUG(dbgs() << "ARM CGP: Replacing zext with trunc operand: " - << *Trunc->getOperand(0)); - ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0)); - InstsToRemove.push_back(ZExt); - } + Value *Src = ZExt->getOperand(0); + if (ZExt->getSrcTy() == ZExt->getDestTy()) { + LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast: " << *ZExt + << "\n"); + ReplaceAllUsersOfWith(ZExt, Src); + continue; + } + + // For any truncs that we insert to handle zexts, we can replace the + // result of the zext with the input to the trunc. + if (NewInsts.count(Src) && isa(V) && isa(Src)) { + auto *Trunc = cast(Src); + assert(Trunc->getOperand(0)->getType() == ExtTy && + "expected inserted trunc to be operating on i32"); + ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0)); } } @@ -694,6 +723,29 @@ void IRPromoter::Cleanup(SmallPtrSetImpl &Sinks) { Promoted.clear(); } +void IRPromoter::ConvertTruncs() { + IRBuilder<> Builder{Ctx}; + + for (auto *V : *Visited) { + if (!isa(V) || Sources->count(V)) + continue; + + auto *Trunc = cast(V); + assert(LessThanTypeSize(Trunc) && "expected narrow trunc"); + + Builder.SetInsertPoint(Trunc); + unsigned NumBits = + cast(Trunc->getType())->getScalarSizeInBits(); + ConstantInt *Mask = ConstantInt::get(Ctx, APInt::getMaxValue(NumBits)); + Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask); + + if (auto *I = dyn_cast(Masked)) + NewInsts.insert(I); + + ReplaceAllUsersOfWith(Trunc, Masked); + } +} + void IRPromoter::Mutate(Type *OrigTy, SmallPtrSetImpl &Visited, SmallPtrSetImpl &Sources, @@ -707,39 +759,49 @@ void IRPromoter::Mutate(Type *OrigTy, assert(OrigTy->getPrimitiveSizeInBits() < ExtTy->getPrimitiveSizeInBits() && "original type not smaller than extended type"); - // Cache original types. - for (auto *V : Visited) - TruncTysMap[V] = V->getType(); + this->Visited = &Visited; + this->Sources = &Sources; + this->Sinks = &Sinks; + this->SafeToPromote = &SafeToPromote; + + // Cache original types of the values that will likely need truncating + for (auto *I : Sinks) { + if (auto *Call = dyn_cast(I)) { + for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) { + Value *Arg = Call->getArgOperand(i); + TruncTysMap[Call].push_back(Arg->getType()); + } + } else if (auto *Switch = dyn_cast(I)) + TruncTysMap[I].push_back(Switch->getCondition()->getType()); + else { + for (unsigned i = 0; i < I->getNumOperands(); ++i) + TruncTysMap[I].push_back(I->getOperand(i)->getType()); + } + } // Convert adds and subs using negative immediates to equivalent instructions // that use positive constants. - PrepareConstants(Visited, SafeToPromote); + PrepareConstants(); // Insert zext instructions between sources and their users. - ExtendSources(Sources); + ExtendSources(); + + // Convert any truncs, that aren't sources, into AND masks. + ConvertTruncs(); // Promote visited instructions, mutating their types in place. Also insert // DSP intrinsics, if enabled, for adds and subs which would be unsafe to // promote. - PromoteTree(Visited, Sources, Sinks, SafeToPromote); + PromoteTree(); // Insert trunc instructions for use by calls, stores etc... - TruncateSinks(Sources, Sinks); + TruncateSinks(); // Finally, remove unecessary zexts and truncs, delete old instructions and // clear the data structures. - Cleanup(Sinks); + Cleanup(); - LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete:\n"); - LLVM_DEBUG(dbgs(); - for (auto *V : Sources) - V->dump(); - for (auto *I : NewInsts) - I->dump(); - for (auto *V : Visited) { - if (!Sources.count(V)) - V->dump(); - }); + LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete\n"); } /// We accept most instructions, as well as Arguments and ConstantInsts. We @@ -747,8 +809,15 @@ void IRPromoter::Mutate(Type *OrigTy, /// return value is zeroext. We don't allow opcodes that can introduce sign /// bits. bool ARMCodeGenPrepare::isSupportedValue(Value *V) { - if (isa(V)) - return true; + if (auto *I = dyn_cast(V)) { + // Now that we allow small types than TypeSize, only allow icmp of + // TypeSize because they will require a trunc to be legalised. + // TODO: Allow icmp of smaller types, and calculate at the end + // whether the transform would be beneficial. + if (isa(I->getOperand(0)->getType())) + return true; + return EqualTypeSize(I->getOperand(0)); + } // Memory instructions if (isa(V) || isa(V)) @@ -766,12 +835,11 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) { isa(V)) return isSupportedType(V); - // Truncs can be either sources or sinks. - if (auto *Trunc = dyn_cast(V)) - return isSupportedType(Trunc) || isSupportedType(Trunc->getOperand(0)); + if (isa(V)) + return false; - if (isa(V) && !isa(V)) - return isSupportedType(cast(V)->getOperand(0)); + if (auto *Cast = dyn_cast(V)) + return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0)); // Special cases for calls as we need to check for zeroext // TODO We should accept calls even if they don't have zeroext, as they can @@ -901,13 +969,17 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) { // Calls can be both sources and sinks. if (isSink(V)) Sinks.insert(cast(V)); + if (isSource(V)) Sources.insert(V); - else if (auto *I = dyn_cast(V)) { - // Visit operands of any instruction visited. - for (auto &U : I->operands()) { - if (!AddLegalInst(U)) - return false; + + if (!isSink(V) && !isSource(V)) { + if (auto *I = dyn_cast(V)) { + // Visit operands of any instruction visited. + for (auto &U : I->operands()) { + if (!AddLegalInst(U)) + return false; + } } } @@ -973,6 +1045,8 @@ bool ARMCodeGenPrepare::runOnFunction(Function &F) { if (CI.isSigned() || !isa(CI.getOperand(0)->getType())) continue; + LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n"); + for (auto &Op : CI.operands()) { if (auto *I = dyn_cast(Op)) MadeChange |= TryToPromote(I); diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index a66cd7053c0aa2be3ee31b0844fec7b455b73cb7..a50abfdbee446188465f9c215fdf1725a218a9b6 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -2951,7 +2951,8 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, unsigned ResultReg = MI->getOperand(0).getReg(); if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false)) return false; - MI->eraseFromParent(); + MachineBasicBlock::iterator I(MI); + removeDeadCode(I, std::next(I)); return true; } @@ -2970,12 +2971,16 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, unsigned ConstAlign = MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context)); unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign); + MachineMemOperand *CPMMO = + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp; MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg) - .addConstantPoolIndex(Idx); + .addConstantPoolIndex(Idx) + .addMemOperand(CPMMO); if (Opc == ARM::LDRcp) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); @@ -2988,6 +2993,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) .addReg(TempReg) .addImm(ARMPCLabelIndex); + if (!Subtarget->isThumb()) MIB.add(predOps(ARMCC::AL)); diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 3e057e953d86a2a94b762b8ca3cd7f1485fc63fe..2417166ca962149fc429675f4934228763cac98f 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -911,6 +911,7 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, assert(RegInfo->hasBasePointer(MF) && "VLAs and dynamic stack alignment, but missing base pointer!"); FrameReg = RegInfo->getBaseRegister(); + Offset -= SPAdj; } return Offset; } @@ -2157,9 +2158,15 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Do not generate a prologue for leaf functions with a stack of size zero. // For non-leaf functions we have to allow for the possibility that the - // call is to a non-split function, as in PR37807. - if (StackSize == 0 && !MFI.hasTailCall()) + // callis to a non-split function, as in PR37807. This function could also + // take the address of a non-split function. When the linker tries to adjust + // its non-existent prologue, it would fail with an error. Mark the object + // file so that such failures are not errors. See this Go language bug-report + // https://go-review.googlesource.com/c/go/+/148819/ + if (StackSize == 0 && !MFI.hasTailCall()) { + MF.getMMI().setHasNosplitStack(true); return; + } // Use R4 and R5 as scratch registers. // We save R4 and R5 before use and restore them before leaving the function. diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 77622567d4b9b8ec357f1d4b06d592036676ce18..8e0e82388251ae6499998fd6c32cc2d8c875cfa3 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1763,12 +1763,14 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, default: llvm_unreachable("unhandled vld type"); // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4f16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; case MVT::v1i64: OpcodeIndex = 3; break; // Quad-register operations: case MVT::v16i8: OpcodeIndex = 0; break; + case MVT::v8f16: case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 56d2e510cb719761d1b0ed3200179a491148902a..8d5cf0a2c1c2180bc9172f705791628724464eac 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -984,7 +984,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // On v8, we have particularly efficient implementations of atomic fences // if they can be combined with nearby atomic loads and stores. - if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) { + if (!Subtarget->hasAcquireRelease() || + getTargetMachine().getOptLevel() == 0) { // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. InsertFencesForAtomic = true; } @@ -1282,6 +1283,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; case ARMISD::CMOV: return "ARMISD::CMOV"; + case ARMISD::SUBS: return "ARMISD::SUBS"; case ARMISD::SSAT: return "ARMISD::SSAT"; case ARMISD::USAT: return "ARMISD::USAT"; @@ -7794,6 +7796,50 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, return LowerCallTo(CLI).first; } +// This is a code size optimisation: return the original SDIV node to +// DAGCombiner when we don't want to expand SDIV into a sequence of +// instructions, and an empty node otherwise which will cause the +// SDIV to be expanded in DAGCombine. +SDValue +ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl &Created) const { + // TODO: Support SREM + if (N->getOpcode() != ISD::SDIV) + return SDValue(); + + const auto &ST = static_cast(DAG.getSubtarget()); + const auto &MF = DAG.getMachineFunction(); + const bool MinSize = MF.getFunction().optForMinSize(); + const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() + : ST.hasDivideInARMMode(); + + // Don't touch vector types; rewriting this may lead to scalarizing + // the int divs. + if (N->getOperand(0).getValueType().isVector()) + return SDValue(); + + // Bail if MinSize is not set, and also for both ARM and Thumb mode we need + // hwdiv support for this to be really profitable. + if (!(MinSize && HasDivide)) + return SDValue(); + + // ARM mode is a bit simpler than Thumb: we can handle large power + // of 2 immediates with 1 mov instruction; no further checks required, + // just return the sdiv node. + if (!ST.isThumb()) + return SDValue(N, 0); + + // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, + // and thus lose the code size benefits of a MOVS that requires only 2. + // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here, + // but as it's doing exactly this, it's not worth the trouble to get TTI. + if (Divisor.sgt(128)) + return SDValue(); + + return SDValue(N, 0); +} + SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const { assert(Op.getValueType() == MVT::i32 && @@ -12663,30 +12709,38 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1)); Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry); } - } else if (CC == ARMCC::NE && LHS != RHS && + } else if (CC == ARMCC::NE && !isNullConstant(RHS) && (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { // This seems pointless but will allow us to combine it further below. - // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y) - SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); + // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 + SDValue Sub = + DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); + SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, + Sub.getValue(1), SDValue()); Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, - N->getOperand(3), Cmp); + N->getOperand(3), CPSRGlue.getValue(1)); + FalseVal = Sub; } } else if (isNullConstant(TrueVal)) { - if (CC == ARMCC::EQ && LHS != RHS && + if (CC == ARMCC::EQ && !isNullConstant(RHS) && (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { // This seems pointless but will allow us to combine it further below // Note that we change == for != as this is the dual for the case above. - // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y) - SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); + // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 + SDValue Sub = + DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); + SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, + Sub.getValue(1), SDValue()); Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, DAG.getConstant(ARMCC::NE, dl, MVT::i32), - N->getOperand(3), Cmp); + N->getOperand(3), CPSRGlue.getValue(1)); + FalseVal = Sub; } } // On Thumb1, the DAG above may be further combined if z is a power of 2 // (z == 2 ^ K). - // CMOV (SUB x, y), z, !=, (CMPZ x, y) -> + // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> // merge t3, t4 // where t1 = (SUBCARRY (SUB x, y), z, 0) // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) @@ -12694,8 +12748,8 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ] const APInt *TrueConst; if (Subtarget->isThumb1Only() && CC == ARMCC::NE && - (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) && - (FalseVal.getOperand(1) == RHS) && + (FalseVal.getOpcode() == ARMISD::SUBS) && + (FalseVal.getOperand(0) == LHS) && (FalseVal.getOperand(1) == RHS) && (TrueConst = isPowerOf2Constant(TrueVal))) { SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned ShiftAmount = TrueConst->logBase2(); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index be53987a2af93fe939e920c583542e826dc53847..ccee7fa258ab4978bc7b3f8f4b4a5f3d2bd848b4 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -85,6 +85,7 @@ class VectorType; FMSTAT, // ARM fmstat instruction. CMOV, // ARM conditional move instructions. + SUBS, // Flag-setting subtraction. SSAT, // Signed saturation USAT, // Unsigned saturation @@ -694,6 +695,9 @@ class VectorType; unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + SmallVectorImpl &Created) const override; + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be /// expanded to FMAs when this method returns true, otherwise fmuladd is diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 76f8414e8f0b87ef2bbded872d1f7952ed7f5f43..488af7bf12c76a62724244eafb96784fde21aa81 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -144,6 +144,7 @@ def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, [SDNPInGlue]>; +def ARMsubs : SDNode<"ARMISD::SUBS", SDTIntBinOp, [SDNPOutGlue]>; def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; @@ -3334,7 +3335,7 @@ multiclass arm_ldst_mult, ComplexDeprecationPredicate<"ARMLoad">; @@ -3641,6 +3642,14 @@ let isAdd = 1 in defm ADDS : AsI1_bin_s_irs; defm SUBS : AsI1_bin_s_irs; +def : ARMPat<(ARMsubs GPR:$Rn, mod_imm:$imm), (SUBSri $Rn, mod_imm:$imm)>; +def : ARMPat<(ARMsubs GPR:$Rn, GPR:$Rm), (SUBSrr $Rn, $Rm)>; +def : ARMPat<(ARMsubs GPR:$Rn, so_reg_imm:$shift), + (SUBSrsi $Rn, so_reg_imm:$shift)>; +def : ARMPat<(ARMsubs GPR:$Rn, so_reg_reg:$shift), + (SUBSrsr $Rn, so_reg_reg:$shift)>; + + let isAdd = 1 in defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>; defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>; diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 3c153625b01684be5ff8fd55963738c71d47af3f..b20b34eaa6a90b28ca3ca7fc2641f656b96c05c7 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -781,7 +781,7 @@ defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr, // These require base address to be written back or one of the loaded regs. let hasSideEffects = 0 in { -let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops), IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> { bits<3> Rn; @@ -826,7 +826,8 @@ def : InstAlias<"ldm${p} $Rn!, $regs", (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs), 0>, Requires<[IsThumb, IsThumb1Only]>; -let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in +let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1, + variadicOpsAreDefs = 1 in def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), IIC_iPop, "pop${p}\t$regs", []>, @@ -1351,6 +1352,12 @@ let hasPostISelHook = 1, Defs = [CPSR] in { Sched<[WriteALU]>; } + +def : T1Pat<(ARMsubs tGPR:$Rn, tGPR:$Rm), (tSUBSrr $Rn, $Rm)>; +def : T1Pat<(ARMsubs tGPR:$Rn, imm0_7:$imm3), (tSUBSi3 $Rn, imm0_7:$imm3)>; +def : T1Pat<(ARMsubs tGPR:$Rn, imm0_255:$imm8), (tSUBSi8 $Rn, imm0_255:$imm8)>; + + // Sign-extend byte def tSXTB : // A8.6.222 T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 0c5720e0267f584d7e7bd36b047cc041de5c1e06..4130dbd884572d341b80177bc0c67058b835b6a2 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -1775,7 +1775,7 @@ multiclass thumb2_ld_mult; multiclass thumb2_st_mult; defm t2ADDS : T2I_bin_s_irs ; defm t2SUBS : T2I_bin_s_irs ; +def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_imm:$imm), + (t2SUBSri $Rn, t2_so_imm:$imm)>; +def : T2Pat<(ARMsubs GPRnopc:$Rn, rGPR:$Rm), (t2SUBSrr $Rn, $Rm)>; +def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_reg:$ShiftedRm), + (t2SUBSrs $Rn, t2_so_reg:$ShiftedRm)>; + let hasPostISelHook = 1 in { defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1>; defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", ARMsube>; @@ -4445,13 +4451,13 @@ def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val), def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val), (t2STRs GPR:$val, t2addrmode_so_reg:$addr)>; -let AddedComplexity = 8 in { - def : T2Pat<(atomic_load_acquire_8 addr_offset_none:$addr), (t2LDAB addr_offset_none:$addr)>; - def : T2Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>; - def : T2Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA addr_offset_none:$addr)>; - def : T2Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (t2STLB GPR:$val, addr_offset_none:$addr)>; - def : T2Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>; - def : T2Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL GPR:$val, addr_offset_none:$addr)>; +let AddedComplexity = 8, Predicates = [IsThumb, HasAcquireRelease, HasV7Clrex] in { + def : Pat<(atomic_load_acquire_8 addr_offset_none:$addr), (t2LDAB addr_offset_none:$addr)>; + def : Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>; + def : Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA addr_offset_none:$addr)>; + def : Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (t2STLB GPR:$val, addr_offset_none:$addr)>; + def : Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>; + def : Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL GPR:$val, addr_offset_none:$addr)>; } diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 6692a4d4142052b2d2e4cf754b432a0c39ccbc87..293e734c97cd6a7c600077151d46816515d9cf9d 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -76,6 +76,42 @@ private: const ARMRegisterBankInfo &RBI; const ARMSubtarget &STI; + // Store the opcodes that we might need, so we don't have to check what kind + // of subtarget (ARM vs Thumb) we have all the time. + struct OpcodeCache { + unsigned ZEXT16; + unsigned SEXT16; + + unsigned ZEXT8; + unsigned SEXT8; + + // Used for implementing ZEXT/SEXT from i1 + unsigned AND; + unsigned RSB; + + unsigned STORE32; + unsigned LOAD32; + + unsigned STORE16; + unsigned LOAD16; + + unsigned STORE8; + unsigned LOAD8; + + OpcodeCache(const ARMSubtarget &STI); + } const Opcodes; + + // Select the opcode for simple extensions (that translate to a single SXT/UXT + // instruction). Extension operations more complicated than that should not + // invoke this. Returns the original opcode if it doesn't know how to select a + // better one. + unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) const; + + // Select the opcode for simple loads and stores. Returns the original opcode + // if it doesn't know how to select a better one. + unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank, + unsigned Size) const; + #define GET_GLOBALISEL_PREDICATES_DECL #include "ARMGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_DECL @@ -107,7 +143,7 @@ ARMInstructionSelector::ARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI, const ARMRegisterBankInfo &RBI) : InstructionSelector(), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), STI(STI), + TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), STI(STI), Opcodes(STI), #define GET_GLOBALISEL_PREDICATES_INIT #include "ARMGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_INIT @@ -225,41 +261,63 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB, return true; } -/// Select the opcode for simple extensions (that translate to a single SXT/UXT -/// instruction). Extension operations more complicated than that should not -/// invoke this. Returns the original opcode if it doesn't know how to select a -/// better one. -static unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) { +ARMInstructionSelector::OpcodeCache::OpcodeCache(const ARMSubtarget &STI) { + bool isThumb = STI.isThumb(); + + using namespace TargetOpcode; + +#define STORE_OPCODE(VAR, OPC) VAR = isThumb ? ARM::t2##OPC : ARM::OPC + STORE_OPCODE(SEXT16, SXTH); + STORE_OPCODE(ZEXT16, UXTH); + + STORE_OPCODE(SEXT8, SXTB); + STORE_OPCODE(ZEXT8, UXTB); + + STORE_OPCODE(AND, ANDri); + STORE_OPCODE(RSB, RSBri); + + STORE_OPCODE(STORE32, STRi12); + STORE_OPCODE(LOAD32, LDRi12); + + // LDRH/STRH are special... + STORE16 = isThumb ? ARM::t2STRHi12 : ARM::STRH; + LOAD16 = isThumb ? ARM::t2LDRHi12 : ARM::LDRH; + + STORE_OPCODE(STORE8, STRBi12); + STORE_OPCODE(LOAD8, LDRBi12); +#undef MAP_OPCODE +} + +unsigned ARMInstructionSelector::selectSimpleExtOpc(unsigned Opc, + unsigned Size) const { using namespace TargetOpcode; if (Size != 8 && Size != 16) return Opc; if (Opc == G_SEXT) - return Size == 8 ? ARM::SXTB : ARM::SXTH; + return Size == 8 ? Opcodes.SEXT8 : Opcodes.SEXT16; if (Opc == G_ZEXT) - return Size == 8 ? ARM::UXTB : ARM::UXTH; + return Size == 8 ? Opcodes.ZEXT8 : Opcodes.ZEXT16; return Opc; } -/// Select the opcode for simple loads and stores. For types smaller than 32 -/// bits, the value will be zero extended. Returns the original opcode if it -/// doesn't know how to select a better one. -static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank, - unsigned Size) { +unsigned ARMInstructionSelector::selectLoadStoreOpCode(unsigned Opc, + unsigned RegBank, + unsigned Size) const { bool isStore = Opc == TargetOpcode::G_STORE; if (RegBank == ARM::GPRRegBankID) { switch (Size) { case 1: case 8: - return isStore ? ARM::STRBi12 : ARM::LDRBi12; + return isStore ? Opcodes.STORE8 : Opcodes.LOAD8; case 16: - return isStore ? ARM::STRH : ARM::LDRH; + return isStore ? Opcodes.STORE16 : Opcodes.LOAD16; case 32: - return isStore ? ARM::STRi12 : ARM::LDRi12; + return isStore ? Opcodes.STORE32 : Opcodes.LOAD32; default: return Opc; } @@ -702,7 +760,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, switch (SrcSize) { case 1: { // ZExt boils down to & 0x1; for SExt we also subtract that from 0 - I.setDesc(TII.get(ARM::ANDri)); + I.setDesc(TII.get(Opcodes.AND)); MIB.addImm(1).add(predOps(ARMCC::AL)).add(condCodeOp()); if (isSExt) { @@ -714,7 +772,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, auto InsertBefore = std::next(I.getIterator()); auto SubI = - BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::RSBri)) + BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(Opcodes.RSB)) .addDef(SExtResult) .addUse(AndResult) .addImm(0) diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 891418306903c5f19cb970bfee98329e6395024d..4a0c24d58474c73e2478a52744ef0a36831a6ad0 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -75,13 +75,48 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); - getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); - getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); + if (ST.isThumb1Only()) { + // Thumb1 is not supported yet. + computeTables(); + verify(*ST.getInstrInfo()); + return; + } + + getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) + .legalForCartesianProduct({s32}, {s1, s8, s16}); getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) .legalFor({s32}) .minScalar(0, s32); + getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}}); + getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}}); + + getActionDefinitionsBuilder(G_CONSTANT) + .legalFor({s32, p0}) + .clampScalar(0, s32, s32); + + // We're keeping these builders around because we'll want to add support for + // floating point to them. + auto &LoadStoreBuilder = + getActionDefinitionsBuilder({G_LOAD, G_STORE}) + .legalForTypesWithMemSize({ + {s1, p0, 8}, + {s8, p0, 8}, + {s16, p0, 16}, + {s32, p0, 32}, + {p0, p0, 32}}); + + if (ST.isThumb()) { + // FIXME: merge with the code for non-Thumb. + computeTables(); + verify(*ST.getInstrInfo()); + return; + } + + getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); + getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); + if (ST.hasDivideInARMMode()) getActionDefinitionsBuilder({G_SDIV, G_UDIV}) .legalFor({s32}) @@ -101,14 +136,24 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, s32}, Libcall); } - getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) - .legalForCartesianProduct({s32}, {s1, s8, s16}); - - getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}}); - getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}}); - getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL}).legalFor({s32}); + if (ST.hasV5TOps()) { + getActionDefinitionsBuilder(G_CTLZ) + .legalFor({s32}) + .clampScalar(0, s32, s32); + getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) + .lowerFor({s32}) + .clampScalar(0, s32, s32); + } else { + getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) + .libcallFor({s32}) + .clampScalar(0, s32, s32); + getActionDefinitionsBuilder(G_CTLZ) + .lowerFor({s32}) + .clampScalar(0, s32, s32); + } + getActionDefinitionsBuilder(G_GEP).legalFor({{p0, s32}}); getActionDefinitionsBuilder(G_SELECT).legalForCartesianProduct({s32, p0}, @@ -116,20 +161,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { getActionDefinitionsBuilder(G_BRCOND).legalFor({s1}); - getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({s32, p0}) - .clampScalar(0, s32, s32); - getActionDefinitionsBuilder(G_ICMP) .legalForCartesianProduct({s1}, {s32, p0}) .minScalar(1, s32); // We're keeping these builders around because we'll want to add support for // floating point to them. - auto &LoadStoreBuilder = - getActionDefinitionsBuilder({G_LOAD, G_STORE}) - .legalForCartesianProduct({s1, s8, s16, s32, p0}, {p0}); - auto &PhiBuilder = getActionDefinitionsBuilder(G_PHI).legalFor({s32, p0}).minScalar(0, s32); @@ -302,7 +339,8 @@ ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate, bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { using namespace TargetOpcode; MIRBuilder.setInstr(MI); diff --git a/lib/Target/ARM/ARMLegalizerInfo.h b/lib/Target/ARM/ARMLegalizerInfo.h index 78ab9412c04bae3eaafdb670350694964c247f99..527bf87f1093043191f1e36951b48ab7e3118c5e 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.h +++ b/lib/Target/ARM/ARMLegalizerInfo.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H #include "llvm/ADT/IndexedMap.h" +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/IR/Instructions.h" @@ -29,7 +30,8 @@ public: ARMLegalizerInfo(const ARMSubtarget &ST); bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const override; + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const override; private: void setFCmpLibcallsGNU(); diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp index 3ab9298c1108ca00b60b5752b3b6619080e62b6d..fc3258914f92b4069f2a90db48e67f1e733a1ae0 100644 --- a/lib/Target/ARM/ARMParallelDSP.cpp +++ b/lib/Target/ARM/ARMParallelDSP.cpp @@ -99,6 +99,8 @@ namespace { for (auto *V : RHS) AllValues.push_back(V); } + + bool AreSymmetrical(BinOpChain *Other); }; struct Reduction { @@ -106,9 +108,9 @@ namespace { // pattern matching. Instruction *AccIntAdd; // The accumulating integer add statement, // i.e, the reduction statement. - OpChainList MACCandidates; // The MAC candidates associated with // this reduction statement. + PMACPairList PMACPairs; Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { }; }; @@ -121,10 +123,13 @@ namespace { Loop *L; const DataLayout *DL; Module *M; + std::map LoadPairs; + std::map> SequentialLoads; - bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs); + bool RecordSequentialLoads(BasicBlock *Header); + bool InsertParallelMACs(Reduction &Reduction); bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem); - PMACPairList CreateParallelMACPairs(OpChainList &Candidates); + void CreateParallelMACPairs(Reduction &R); Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1, Instruction *Acc, bool Exchange, Instruction *InsertAfter); @@ -202,6 +207,12 @@ namespace { LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n"); LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n"); + + if (!RecordSequentialLoads(Header)) { + LLVM_DEBUG(dbgs() << " - No sequential loads found.\n"); + return false; + } + Changes = MatchSMLAD(F); return Changes; } @@ -254,58 +265,14 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) { return false; } -// Element-by-element comparison of Value lists returning true if they are -// instructions with the same opcode or constants with the same value. -static bool AreSymmetrical(const ValueList &VL0, - const ValueList &VL1) { - if (VL0.size() != VL1.size()) { - LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: " - << VL0.size() << " != " << VL1.size() << "\n"); - return false; - } - - const unsigned Pairs = VL0.size(); - LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n"); - - for (unsigned i = 0; i < Pairs; ++i) { - const Value *V0 = VL0[i]; - const Value *V1 = VL1[i]; - const auto *Inst0 = dyn_cast(V0); - const auto *Inst1 = dyn_cast(V1); - - LLVM_DEBUG(dbgs() << "Pair " << i << ":\n"; - dbgs() << "mul1: "; V0->dump(); - dbgs() << "mul2: "; V1->dump()); - - if (!Inst0 || !Inst1) - return false; - - if (Inst0->isSameOperationAs(Inst1)) { - LLVM_DEBUG(dbgs() << "OK: same operation found!\n"); - continue; - } - - const APInt *C0, *C1; - if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1)) - return false; - } - - LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n"); - return true; -} - template static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1, - MemInstList &VecMem, const DataLayout &DL, - ScalarEvolution &SE) { + const DataLayout &DL, ScalarEvolution &SE) { if (!MemOp0->isSimple() || !MemOp1->isSimple()) { LLVM_DEBUG(dbgs() << "No, not touching volatile access\n"); return false; } if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) { - VecMem.clear(); - VecMem.push_back(MemOp0); - VecMem.push_back(MemOp1); LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n"); return true; } @@ -328,16 +295,106 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, return false; } - return AreSequentialAccesses(Ld0, Ld1, VecMem, *DL, *SE); + if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1) + return false; + + VecMem.clear(); + VecMem.push_back(Ld0); + VecMem.push_back(Ld1); + return true; } -PMACPairList -ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) { +/// Iterate through the block and record base, offset pairs of loads as well as +/// maximal sequences of sequential loads. +bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) { + SmallVector Loads; + for (auto &I : *Header) { + auto *Ld = dyn_cast(&I); + if (!Ld) + continue; + Loads.push_back(Ld); + } + + std::map BaseLoads; + + for (auto *Ld0 : Loads) { + for (auto *Ld1 : Loads) { + if (Ld0 == Ld1) + continue; + + if (AreSequentialAccesses(Ld0, Ld1, *DL, *SE)) { + LoadPairs[Ld0] = Ld1; + if (BaseLoads.count(Ld0)) { + LoadInst *Base = BaseLoads[Ld0]; + BaseLoads[Ld1] = Base; + SequentialLoads[Base].push_back(Ld1); + } else { + BaseLoads[Ld1] = Ld0; + SequentialLoads[Ld0].push_back(Ld1); + } + } + } + } + return LoadPairs.size() > 1; +} + +void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) { + OpChainList &Candidates = R.MACCandidates; + PMACPairList &PMACPairs = R.PMACPairs; const unsigned Elems = Candidates.size(); - PMACPairList PMACPairs; if (Elems < 2) - return PMACPairs; + return; + + auto CanPair = [&](BinOpChain *PMul0, BinOpChain *PMul1) { + if (!PMul0->AreSymmetrical(PMul1)) + return false; + + // The first elements of each vector should be loads with sexts. If we + // find that its two pairs of consecutive loads, then these can be + // transformed into two wider loads and the users can be replaced with + // DSP intrinsics. + for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) { + auto *Ld0 = dyn_cast(PMul0->LHS[x]); + auto *Ld1 = dyn_cast(PMul1->LHS[x]); + auto *Ld2 = dyn_cast(PMul0->RHS[x]); + auto *Ld3 = dyn_cast(PMul1->RHS[x]); + + if (!Ld0 || !Ld1 || !Ld2 || !Ld3) + return false; + + LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n" + << "\t Ld0: " << *Ld0 << "\n" + << "\t Ld1: " << *Ld1 << "\n" + << "and operands " << x + 2 << ":\n" + << "\t Ld2: " << *Ld2 << "\n" + << "\t Ld3: " << *Ld3 << "\n"); + + if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { + if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + PMACPairs.push_back(std::make_pair(PMul0, PMul1)); + return true; + } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); + PMul1->Exchange = true; + PMACPairs.push_back(std::make_pair(PMul0, PMul1)); + return true; + } + } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) && + AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n"); + LLVM_DEBUG(dbgs() << " and swapping muls\n"); + PMul0->Exchange = true; + // Only the second operand can be exchanged, so swap the muls. + PMACPairs.push_back(std::make_pair(PMul1, PMul0)); + return true; + } + } + return false; + }; SmallPtrSet Paired; for (unsigned i = 0; i < Elems; ++i) { @@ -364,77 +421,21 @@ ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) { dbgs() << "- "; Mul0->dump(); dbgs() << "- "; Mul1->dump()); - const ValueList &Mul0_LHS = PMul0->LHS; - const ValueList &Mul0_RHS = PMul0->RHS; - const ValueList &Mul1_LHS = PMul1->LHS; - const ValueList &Mul1_RHS = PMul1->RHS; - - if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) || - !AreSymmetrical(Mul0_RHS, Mul1_RHS)) - continue; - LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n"); - // The first elements of each vector should be loads with sexts. If we - // find that its two pairs of consecutive loads, then these can be - // transformed into two wider loads and the users can be replaced with - // DSP intrinsics. - bool Found = false; - for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) { - auto *Ld0 = dyn_cast(Mul0_LHS[x]); - auto *Ld1 = dyn_cast(Mul1_LHS[x]); - auto *Ld2 = dyn_cast(Mul0_RHS[x]); - auto *Ld3 = dyn_cast(Mul1_RHS[x]); - - if (!Ld0 || !Ld1 || !Ld2 || !Ld3) - continue; - - LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n" - << "\t Ld0: " << *Ld0 << "\n" - << "\t Ld1: " << *Ld1 << "\n" - << "and operands " << x + 2 << ":\n" - << "\t Ld2: " << *Ld2 << "\n" - << "\t Ld3: " << *Ld3 << "\n"); - - if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { - if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { - LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - PMACPairs.push_back(std::make_pair(PMul0, PMul1)); - Found = true; - } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { - LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); - PMul1->Exchange = true; - PMACPairs.push_back(std::make_pair(PMul0, PMul1)); - Found = true; - } - } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd)) { - if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { - LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n"); - LLVM_DEBUG(dbgs() << " and swapping muls\n"); - PMul0->Exchange = true; - // Only the second operand can be exchanged, so swap the muls. - PMACPairs.push_back(std::make_pair(PMul1, PMul0)); - Found = true; - } - } - } - if (Found) { + if (CanPair(PMul0, PMul1)) { Paired.insert(Mul0); Paired.insert(Mul1); break; } } } - return PMACPairs; } -bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction, - PMACPairList &PMACPairs) { +bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) { Instruction *Acc = Reduction.Phi; Instruction *InsertAfter = Reduction.AccIntAdd; - for (auto &Pair : PMACPairs) { + for (auto &Pair : Reduction.PMACPairs) { BinOpChain *PMul0 = Pair.first; BinOpChain *PMul1 = Pair.second; LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n"; @@ -524,26 +525,20 @@ static void MatchParallelMACSequences(Reduction &R, if (!I) return false; - Value *MulOp0, *MulOp1; - switch (I->getOpcode()) { case Instruction::Add: if (Match(I->getOperand(0)) || (Match(I->getOperand(1)))) return true; break; - case Instruction::Mul: - if (match (I, (m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) { + case Instruction::Mul: { + Value *MulOp0 = I->getOperand(0); + Value *MulOp1 = I->getOperand(1); + if (isa(MulOp0) && isa(MulOp1)) AddMACCandidate(Candidates, I, MulOp0, MulOp1); - return false; - } - break; + return false; + } case Instruction::SExt: - if (match (I, (m_SExt(m_Mul(m_Value(MulOp0), m_Value(MulOp1)))))) { - Instruction *Mul = cast(I->getOperand(0)); - AddMACCandidate(Candidates, Mul, MulOp0, MulOp1); - return false; - } - break; + return Match(I->getOperand(0)); } return false; }; @@ -685,8 +680,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) { for (auto &R : Reductions) { if (AreAliased(AA, Reads, Writes, R.MACCandidates)) return false; - PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates); - Changed |= InsertParallelMACs(R, PMACPairs); + CreateParallelMACPairs(R); + Changed |= InsertParallelMACs(R); } LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump();); @@ -733,6 +728,52 @@ Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1, return Call; } +// Compare the value lists in Other to this chain. +bool BinOpChain::AreSymmetrical(BinOpChain *Other) { + // Element-by-element comparison of Value lists returning true if they are + // instructions with the same opcode or constants with the same value. + auto CompareValueList = [](const ValueList &VL0, + const ValueList &VL1) { + if (VL0.size() != VL1.size()) { + LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: " + << VL0.size() << " != " << VL1.size() << "\n"); + return false; + } + + const unsigned Pairs = VL0.size(); + LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n"); + + for (unsigned i = 0; i < Pairs; ++i) { + const Value *V0 = VL0[i]; + const Value *V1 = VL1[i]; + const auto *Inst0 = dyn_cast(V0); + const auto *Inst1 = dyn_cast(V1); + + LLVM_DEBUG(dbgs() << "Pair " << i << ":\n"; + dbgs() << "mul1: "; V0->dump(); + dbgs() << "mul2: "; V1->dump()); + + if (!Inst0 || !Inst1) + return false; + + if (Inst0->isSameOperationAs(Inst1)) { + LLVM_DEBUG(dbgs() << "OK: same operation found!\n"); + continue; + } + + const APInt *C0, *C1; + if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1)) + return false; + } + + LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n"); + return true; + }; + + return CompareValueList(LHS, Other->LHS) && + CompareValueList(RHS, Other->RHS); +} + Pass *llvm::createARMParallelDSPPass() { return new ARMParallelDSP(); } diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index 0e16d6bcfe2b27fc3df8ccde6bca7cd30e4c9249..4f28f2dafc70ab4e31c978443a1d28191fe250cf 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -234,6 +234,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_GEP: case G_INTTOPTR: case G_PTRTOINT: + case G_CTLZ: // FIXME: We're abusing the fact that everything lives in a GPR for now; in // the real world we would use different mappings. OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx]; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 519f789fc2156607f14e73500a59b2ebde81b99a..ec02c840d5e17e75b5e2962cc38b1a9f996258dd 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -194,12 +194,6 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - /// Create an ARM architecture model. /// ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, @@ -210,7 +204,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL, bool isLittle) : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), - getEffectiveCodeModel(CM), OL), + getEffectiveCodeModel(CM, CodeModel::Small), OL), TargetABI(computeTargetABI(TT, CPU, Options)), TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) { diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index f837db56264c5970d94950a0824656a3d4ac7a50..3832b0112b87b627a41abbad7821dbb43f883b59 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "ARMFeatures.h" +#include "InstPrinter/ARMInstPrinter.h" #include "Utils/ARMBaseInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMBaseInfo.h" @@ -3205,17 +3206,26 @@ public: } // end anonymous namespace. void ARMOperand::print(raw_ostream &OS) const { + auto RegName = [](unsigned Reg) { + if (Reg) + return ARMInstPrinter::getRegisterName(Reg); + else + return "noreg"; + }; + switch (Kind) { case k_CondCode: OS << ""; break; case k_CCOut: - OS << ""; + OS << ""; break; case k_ITCondMask: { static const char *const MaskStr[] = { - "()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)", - "(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)" + "(invalid)", "(teee)", "(tee)", "(teet)", + "(te)", "(tete)", "(tet)", "(tett)", + "(t)", "(ttee)", "(tte)", "(ttet)", + "(tt)", "(ttte)", "(ttt)", "(tttt)" }; assert((ITMask.Mask & 0xf) == ITMask.Mask); OS << ""; @@ -3249,13 +3259,25 @@ void ARMOperand::print(raw_ostream &OS) const { OS << ""; break; case k_Memory: - OS << ""; break; case k_PostIndexRegister: OS << "post-idx register " << (PostIdxReg.isAdd ? "" : "-") - << PostIdxReg.RegNum; + << RegName(PostIdxReg.RegNum); if (PostIdxReg.ShiftTy != ARM_AM::no_shift) OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " " << PostIdxReg.ShiftImm; @@ -3271,23 +3293,21 @@ void ARMOperand::print(raw_ostream &OS) const { break; } case k_Register: - OS << ""; + OS << ""; break; case k_ShifterImmediate: OS << ""; break; case k_ShiftedRegister: - OS << ""; + OS << ""; break; case k_ShiftedImmediate: - OS << ""; + OS << ""; break; case k_RotateImmediate: OS << ""; @@ -3311,7 +3331,7 @@ void ARMOperand::print(raw_ostream &OS) const { const SmallVectorImpl &RegList = getRegList(); for (SmallVectorImpl::const_iterator I = RegList.begin(), E = RegList.end(); I != E; ) { - OS << *I; + OS << RegName(*I); if (++I < E) OS << ", "; } @@ -3320,15 +3340,15 @@ void ARMOperand::print(raw_ostream &OS) const { } case k_VectorList: OS << ""; + << RegName(VectorList.RegNum) << ">"; break; case k_VectorListAllLanes: OS << ""; + << RegName(VectorList.RegNum) << ">"; break; case k_VectorListIndexed: OS << ""; + << VectorList.Count << " * " << RegName(VectorList.RegNum) << ">"; break; case k_Token: OS << "'" << getToken() << "'"; @@ -9157,33 +9177,9 @@ bool ARMAsmParser::isITBlockTerminator(MCInst &Inst) const { // Any arithmetic instruction which writes to the PC also terminates the IT // block. - for (unsigned OpIdx = 0; OpIdx < MCID.getNumDefs(); ++OpIdx) { - MCOperand &Op = Inst.getOperand(OpIdx); - if (Op.isReg() && Op.getReg() == ARM::PC) - return true; - } - - if (MCID.hasImplicitDefOfPhysReg(ARM::PC, MRI)) + if (MCID.hasDefOfPhysReg(Inst, ARM::PC, *MRI)) return true; - // Instructions with variable operand lists, which write to the variable - // operands. We only care about Thumb instructions here, as ARM instructions - // obviously can't be in an IT block. - switch (Inst.getOpcode()) { - case ARM::tLDMIA: - case ARM::t2LDMIA: - case ARM::t2LDMIA_UPD: - case ARM::t2LDMDB: - case ARM::t2LDMDB_UPD: - if (listContainsReg(Inst, 3, ARM::PC)) - return true; - break; - case ARM::tPOP: - if (listContainsReg(Inst, 2, ARM::PC)) - return true; - break; - } - return false; } @@ -9290,6 +9286,10 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (MatchResult) { case Match_Success: + LLVM_DEBUG(dbgs() << "Parsed as: "; + Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode())); + dbgs() << "\n"); + // Context sensitive operand constraints aren't handled by the matcher, // so check them here. if (validateInstruction(Inst, Operands)) { @@ -9307,7 +9307,9 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // individual transformations can chain off each other. E.g., // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8) while (processInstruction(Inst, Operands, Out)) - ; + LLVM_DEBUG(dbgs() << "Changed to: "; + Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode())); + dbgs() << "\n"); // Only after the instruction is fully processed, we can validate it if (wasInITBlock && hasV8Ops() && isThumb() && diff --git a/lib/Target/ARM/AsmParser/LLVMBuild.txt b/lib/Target/ARM/AsmParser/LLVMBuild.txt index 061885ad4fac836d2e9f8a681f8fb7b3e5778d04..19058aeadb956c886dbfb857ded878c07c55daae 100644 --- a/lib/Target/ARM/AsmParser/LLVMBuild.txt +++ b/lib/Target/ARM/AsmParser/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = ARMAsmParser parent = ARM -required_libraries = ARMDesc ARMInfo MC MCParser Support ARMUtils +required_libraries = ARMDesc ARMInfo ARMAsmPrinter MC MCParser Support ARMUtils add_to_library_groups = ARM diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 4b4956e914f267bf8e8f131d76f2ae16df6b14f5..0ced8195790d6c5ba78cd0fde12f08e9648b31c4 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -22,6 +22,8 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ScopedPrinter.h" + using namespace llvm; namespace { @@ -144,6 +146,15 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, MCValue Target, uint64_t &FixedValue) { uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + + if (FixupOffset & 0xff000000) { + Asm.getContext().reportError(Fixup.getLoc(), + "can not encode offset '0x" + + to_hexString(FixupOffset) + + "' in resulting scattered relocation."); + return; + } + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); unsigned Type = MachO::ARM_RELOC_HALF; @@ -250,6 +261,15 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, unsigned Log2Size, uint64_t &FixedValue) { uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + + if (FixupOffset & 0xff000000) { + Asm.getContext().reportError(Fixup.getLoc(), + "can not encode offset '0x" + + to_hexString(FixupOffset) + + "' in resulting scattered relocation."); + return; + } + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); // See . diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 1a91a7030657918e16bc30d91609b30756b8dfc8..d567d333904922f285b765ba6581decab970d920 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -146,9 +146,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); - if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || - RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || - RC == &ARM::GPRnopcRegClass) { + if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::t2STRi12)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI) @@ -190,9 +188,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || - RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || - RC == &ARM::GPRnopcRegClass) { + if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg) .addFrameIndex(FI) .addImm(0) diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp index 74300d9a451c372dce86787652dc94e5e1c9311a..9828cdab68c313465e7be17f073892b3000adfb8 100644 --- a/lib/Target/AVR/AVRTargetMachine.cpp +++ b/lib/Target/AVR/AVRTargetMachine.cpp @@ -40,12 +40,6 @@ static Reloc::Model getEffectiveRelocModel(Optional RM) { return RM.hasValue() ? *RM : Reloc::Static; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -53,8 +47,8 @@ AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT, Optional CM, CodeGenOpt::Level OL, bool JIT) : LLVMTargetMachine(T, AVRDataLayout, TT, getCPU(CPU), FS, Options, - getEffectiveRelocModel(RM), getEffectiveCodeModel(CM), - OL), + getEffectiveRelocModel(RM), + getEffectiveCodeModel(CM, CodeModel::Small), OL), SubTarget(TT, getCPU(CPU), FS, *this) { this->TLOF = make_unique(); initAsmInfo(); diff --git a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index d57cc098497fce7fe80f28058954e7e2e460c5aa..f2bb59265271b4625115e9a051cfbd47453e5c71 100644 --- a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -34,8 +34,9 @@ #define DEBUG_TYPE "avr-asm-parser" -namespace llvm { +using namespace llvm; +namespace { /// Parses AVR assembly from a stream. class AVRAsmParser : public MCTargetAsmParser { const MCSubtargetInfo &STI; @@ -245,6 +246,8 @@ public: } }; +} // end anonymous namespace. + // Auto-generated Match Functions /// Maps from the set of all register names to a register number. @@ -510,6 +513,7 @@ bool AVRAsmParser::parseOperand(OperandVector &Operands) { case AsmToken::Real: if (!tryParseExpression(Operands)) return false; + break; default: break; } @@ -708,5 +712,3 @@ unsigned AVRAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, } return Match_InvalidOperand; } - -} // end of namespace llvm diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp index 52b1bb00316a931eee9d142bcbba848ac08498c1..350465b118ed227a88f2637fcbe4b0d19c6cafc1 100644 --- a/lib/Target/BPF/BPFTargetMachine.cpp +++ b/lib/Target/BPF/BPFTargetMachine.cpp @@ -51,12 +51,6 @@ static Reloc::Model getEffectiveRelocModel(Optional RM) { return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -64,8 +58,8 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT, Optional CM, CodeGenOpt::Level OL, bool JIT) : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, - getEffectiveRelocModel(RM), getEffectiveCodeModel(CM), - OL), + getEffectiveRelocModel(RM), + getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index dcab9bfb019a6898a64c5190050a655527e728bd..2eb1f0fc8bd94d6ce6b2e7849fde5e6c9278e85c 100644 --- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -313,8 +313,6 @@ public: bool iss30_2Imm() const { return true; } bool iss29_3Imm() const { return true; } bool iss27_2Imm() const { return CheckImmRange(27, 2, true, true, false); } - bool iss10_0Imm() const { return CheckImmRange(10, 0, true, false, false); } - bool iss10_6Imm() const { return CheckImmRange(10, 6, true, false, false); } bool iss9_0Imm() const { return CheckImmRange(9, 0, true, false, false); } bool iss8_0Imm() const { return CheckImmRange(8, 0, true, false, false); } bool iss8_0Imm64() const { return CheckImmRange(8, 0, true, true, false); } diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index a409fff0d342266524e44046a69a97d1a0d3a2d6..428b42eba30da1a4e8e730645e1fb60f5445d67d 100644 --- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -117,6 +117,10 @@ DecodeGeneralDoubleLow8RegsRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeHvxVQRRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); @@ -145,62 +149,7 @@ static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder); static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const void *Decoder); - -static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, - const void *Decoder) { - signedDecoder<4>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, - const void *Decoder) { - signedDecoder<14>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, - const void *Decoder) { - signedDecoder<8>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, - const void *Decoder) { - signedDecoder<7>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, - const void *Decoder) { - signedDecoder<12>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, - const void *Decoder) { - signedDecoder<3>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, - const void *Decoder) { - signedDecoder<13>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, - const void *Decoder) { - signedDecoder<6>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, - const void *Decoder) { - signedDecoder<9>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, - const void *Decoder) { - signedDecoder<5>(MI, tmp, Decoder); - return MCDisassembler::Success; -} -static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t, - const void *Decoder) { - signedDecoder<6>(MI, tmp, Decoder); - return MCDisassembler::Success; -} +#include "HexagonDepDecoders.h" #include "HexagonGenDisassemblerTables.inc" static MCDisassembler *createHexagonDisassembler(const Target &T, @@ -663,6 +612,18 @@ static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo, return (DecodeRegisterClass(Inst, RegNo >> 1, HvxWRDecoderTable)); } +LLVM_ATTRIBUTE_UNUSED // Suppress warning temporarily. +static DecodeStatus DecodeHvxVQRRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg HvxVQRDecoderTable[] = { + Hexagon::VQ0, Hexagon::VQ1, Hexagon::VQ2, Hexagon::VQ3, + Hexagon::VQ4, Hexagon::VQ5, Hexagon::VQ6, Hexagon::VQ7}; + + return DecodeRegisterClass(Inst, RegNo >> 2, HvxVQRDecoderTable); +} + static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td index 8853dd6d550943db24d4bac73132eaa4eeb362b7..868353e188329acdcfcebec441cc6c0b2e72065a 100644 --- a/lib/Target/Hexagon/Hexagon.td +++ b/lib/Target/Hexagon/Hexagon.td @@ -25,6 +25,9 @@ include "llvm/Target/Target.td" include "HexagonDepArch.td" // Hexagon ISA Extensions +def ExtensionZReg: SubtargetFeature<"zreg", "UseZRegOps", "true", + "Hexagon ZReg extension instructions">; + def ExtensionHVX: SubtargetFeature<"hvx", "HexagonHVXVersion", "Hexagon::ArchEnum::V60", "Hexagon HVX instructions">; def ExtensionHVXV60: SubtargetFeature<"hvxv60", "HexagonHVXVersion", @@ -32,10 +35,14 @@ def ExtensionHVXV60: SubtargetFeature<"hvxv60", "HexagonHVXVersion", [ExtensionHVX]>; def ExtensionHVXV62: SubtargetFeature<"hvxv62", "HexagonHVXVersion", "Hexagon::ArchEnum::V62", "Hexagon HVX instructions", - [ExtensionHVX,ExtensionHVXV60]>; + [ExtensionHVX, ExtensionHVXV60]>; def ExtensionHVXV65: SubtargetFeature<"hvxv65", "HexagonHVXVersion", "Hexagon::ArchEnum::V65", "Hexagon HVX instructions", - [ExtensionHVX,ExtensionHVXV60, ExtensionHVXV62]>; + [ExtensionHVX, ExtensionHVXV60, ExtensionHVXV62]>; +def ExtensionHVXV66: SubtargetFeature<"hvxv66", "HexagonHVXVersion", + "Hexagon::ArchEnum::V66", "Hexagon HVX instructions", + [ExtensionHVX, ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, + ExtensionZReg]>; def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps", "true", "Hexagon HVX 64B instructions", [ExtensionHVX]>; @@ -60,6 +67,9 @@ def FeatureDuplex: SubtargetFeature<"duplex", "EnableDuplex", "true", "Enable generation of duplex instruction">; def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19", "true", "Reserve register R19">; +def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim", + "NoreturnStackElim", "true", + "Eliminate stack allocation in a noreturn function when possible">; //===----------------------------------------------------------------------===// // Hexagon Instruction Predicate Definitions. @@ -78,6 +88,10 @@ def UseHVXV62 : Predicate<"HST->useHVXOps()">, AssemblerPredicate<"ExtensionHVXV62">; def UseHVXV65 : Predicate<"HST->useHVXOps()">, AssemblerPredicate<"ExtensionHVXV65">; +def UseHVXV66 : Predicate<"HST->useHVXOps()">, + AssemblerPredicate<"ExtensionHVXV66">; +def UseZReg : Predicate<"HST->useZRegOps()">, + AssemblerPredicate<"ExtensionZReg">; def Hvx64: HwMode<"+hvx-length64b">; def Hvx128: HwMode<"+hvx-length128b">; @@ -309,8 +323,6 @@ include "HexagonPatternsHVX.td" include "HexagonPatternsV65.td" include "HexagonDepMappings.td" include "HexagonIntrinsics.td" -include "HexagonMapAsm2IntrinV62.gen.td" -include "HexagonMapAsm2IntrinV65.gen.td" def HexagonInstrInfo : InstrInfo; @@ -346,6 +358,10 @@ def : Proc<"hexagonv65", HexagonModelV65, [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; +def : Proc<"hexagonv66", HexagonModelV66, + [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, + FeatureDuplex, FeatureMemNoShuf, FeatureMemops, FeatureNVJ, + FeatureNVS, FeaturePackets, FeatureSmallData]>; //===----------------------------------------------------------------------===// // Declare the target which we are implementing diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp index 94aacbed6af697bd19fa93931eedd58d96fc0b0a..92b6da871a4c7826a72d84f9e74912ef9c3c2719 100644 --- a/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -93,11 +93,12 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { const TargetRegisterClass &RC = *MRI.getRegClass(Reg); unsigned ID = RC.getID(); uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub)); - auto &HRI = static_cast(TRI); + const auto &HRI = static_cast(TRI); bool IsSubLo = (Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo)); switch (ID) { case Hexagon::DoubleRegsRegClassID: case Hexagon::HvxWRRegClassID: + case Hexagon::HvxVQRRegClassID: return IsSubLo ? BT::BitMask(0, RW-1) : BT::BitMask(RW, 2*RW-1); default: @@ -114,9 +115,13 @@ uint16_t HexagonEvaluator::getPhysRegBitWidth(unsigned Reg) const { assert(TargetRegisterInfo::isPhysicalRegister(Reg)); using namespace Hexagon; - for (auto &RC : {HvxVRRegClass, HvxWRRegClass, HvxQRRegClass}) - if (RC.contains(Reg)) - return TRI.getRegSizeInBits(RC); + const auto &HST = MF.getSubtarget(); + if (HST.useHVXOps()) { + for (auto &RC : {HvxVRRegClass, HvxWRRegClass, HvxQRRegClass, + HvxVQRRegClass}) + if (RC.contains(Reg)) + return TRI.getRegSizeInBits(RC); + } // Default treatment for other physical registers. if (const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg)) return TRI.getRegSizeInBits(*RC); @@ -142,6 +147,8 @@ const TargetRegisterClass &HexagonEvaluator::composeWithSubRegIndex( return Hexagon::IntRegsRegClass; case Hexagon::HvxWRRegClassID: return Hexagon::HvxVRRegClass; + case Hexagon::HvxVQRRegClassID: + return Hexagon::HvxWRRegClass; default: break; } diff --git a/lib/Target/Hexagon/HexagonDepArch.h b/lib/Target/Hexagon/HexagonDepArch.h index 1bcf40220619f0e2ece0da2e36122c01139937c5..dff2b2f471d0f1bb83ac53517d0960980a3bb869 100644 --- a/lib/Target/Hexagon/HexagonDepArch.h +++ b/lib/Target/Hexagon/HexagonDepArch.h @@ -1,4 +1,4 @@ -//===- HexagonDepArch.h ---------------------------------------------------===// +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -10,12 +10,11 @@ //===----------------------------------------------------------------------===// - #ifndef HEXAGON_DEP_ARCH_H #define HEXAGON_DEP_ARCH_H namespace llvm { namespace Hexagon { -enum class ArchEnum { NoArch,Generic,V5,V55,V60,V62,V65 }; +enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66 }; } // namespace Hexagon } // namespace llvm; #endif // HEXAGON_DEP_ARCH_H diff --git a/lib/Target/Hexagon/HexagonDepArch.td b/lib/Target/Hexagon/HexagonDepArch.td index ce7956926101b92fe369ddf0115e99bfb4e4e6b4..f1aadae555c8604fed185b243e4d089cf1979d90 100644 --- a/lib/Target/Hexagon/HexagonDepArch.td +++ b/lib/Target/Hexagon/HexagonDepArch.td @@ -1,4 +1,4 @@ -//===- HexagonDepArch.td --------------------------------------------------===// +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -9,7 +9,8 @@ // Automatically generated file, please consult code owner before editing. //===----------------------------------------------------------------------===// - +def ArchV66: SubtargetFeature<"v66", "HexagonArchVersion", "Hexagon::ArchEnum::V66", "Enable Hexagon V66 architecture">; +def HasV66 : Predicate<"HST->hasV66Ops()">, AssemblerPredicate<"ArchV66">; def ArchV65: SubtargetFeature<"v65", "HexagonArchVersion", "Hexagon::ArchEnum::V65", "Enable Hexagon V65 architecture">; def HasV65 : Predicate<"HST->hasV65Ops()">, AssemblerPredicate<"ArchV65">; def ArchV62: SubtargetFeature<"v62", "HexagonArchVersion", "Hexagon::ArchEnum::V62", "Enable Hexagon V62 architecture">; @@ -19,3 +20,4 @@ def HasV60 : Predicate<"HST->hasV60Ops()">, AssemblerPredicate<"ArchV60">; def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "Hexagon::ArchEnum::V55", "Enable Hexagon V55 architecture">; def HasV55 : Predicate<"HST->hasV55Ops()">, AssemblerPredicate<"ArchV55">; def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "Hexagon::ArchEnum::V5", "Enable Hexagon V5 architecture">; +def HasV5 : Predicate<"HST->hasV5Ops()">, AssemblerPredicate<"ArchV5">; diff --git a/lib/Target/Hexagon/HexagonDepDecoders.h b/lib/Target/Hexagon/HexagonDepDecoders.h new file mode 100644 index 0000000000000000000000000000000000000000..9f78412f45d24f4b656cdc67ca4633bb453b013d --- /dev/null +++ b/lib/Target/Hexagon/HexagonDepDecoders.h @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Automatically generated file, please consult code owner before editing. +//===----------------------------------------------------------------------===// + +// clang-format off + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif + +static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<4>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<14>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<8>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<7>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<12>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<3>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<13>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<6>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<9>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<5>(MI, tmp, Decoder); + return MCDisassembler::Success; +} +static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t, const void *Decoder) { + signedDecoder<6>(MI, tmp, Decoder); + return MCDisassembler::Success; +} + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +// clang-format on diff --git a/lib/Target/Hexagon/HexagonDepIICHVX.td b/lib/Target/Hexagon/HexagonDepIICHVX.td index b27cdae81a283a52b0f284d9f868dda8674bf532..9e3dea9f3e9b00c7e4318a084fc8e9f91444f6bb 100644 --- a/lib/Target/Hexagon/HexagonDepIICHVX.td +++ b/lib/Target/Hexagon/HexagonDepIICHVX.td @@ -1,4 +1,4 @@ -//===- HexagonDepIICHVX.td ------------------------------------------------===// +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -9,1849 +9,2549 @@ // Automatically generated file, please consult code owner before editing. //===----------------------------------------------------------------------===// - -def tc_0317c6ca : InstrItinClass; -def tc_1b93bdc6 : InstrItinClass; -def tc_2171ebae : InstrItinClass; -def tc_28978789 : InstrItinClass; -def tc_29841470 : InstrItinClass; -def tc_316c637c : InstrItinClass; -def tc_354299ad : InstrItinClass; -def tc_35e92f8e : InstrItinClass; -def tc_38208312 : InstrItinClass; -def tc_4105d6b5 : InstrItinClass; -def tc_41f4b64e : InstrItinClass; -def tc_41f99e1c : InstrItinClass; -def tc_45453b98 : InstrItinClass; -def tc_4e2a5159 : InstrItinClass; -def tc_4f190ba3 : InstrItinClass; -def tc_4fd8566e : InstrItinClass; -def tc_51cd3aab : InstrItinClass; -def tc_5a9fc4ec : InstrItinClass; -def tc_5c03dc63 : InstrItinClass; -def tc_5c120602 : InstrItinClass; -def tc_5cbf490b : InstrItinClass; -def tc_63e3d94c : InstrItinClass; -def tc_644584f8 : InstrItinClass; -def tc_66bb62ea : InstrItinClass; -def tc_69b6dd20 : InstrItinClass; -def tc_6b78cf13 : InstrItinClass; -def tc_6fd9ad30 : InstrItinClass; -def tc_71337255 : InstrItinClass; -def tc_72ad7b54 : InstrItinClass; -def tc_7474003e : InstrItinClass; -def tc_77a4c701 : InstrItinClass; -def tc_7c3f55c4 : InstrItinClass; -def tc_7e9f581b : InstrItinClass; -def tc_7fa82b08 : InstrItinClass; -def tc_7fa8b40f : InstrItinClass; -def tc_85d237e3 : InstrItinClass; -def tc_8a6eb39a : InstrItinClass; -def tc_8b6a873f : InstrItinClass; -def tc_908a4c8c : InstrItinClass; -def tc_9311da3f : InstrItinClass; -def tc_94f43c04 : InstrItinClass; -def tc_9777e6bf : InstrItinClass; -def tc_97c165b9 : InstrItinClass; -def tc_98733e9d : InstrItinClass; -def tc_99093773 : InstrItinClass; -def tc_9b9642a1 : InstrItinClass; -def tc_9c267309 : InstrItinClass; -def tc_a3127e12 : InstrItinClass; -def tc_a4c9df3b : InstrItinClass; -def tc_a807365d : InstrItinClass; -def tc_aedb9f9e : InstrItinClass; -def tc_b06ab583 : InstrItinClass; -def tc_b712833a : InstrItinClass; -def tc_b77635b4 : InstrItinClass; -def tc_bbaf280e : InstrItinClass; -def tc_bf142ae2 : InstrItinClass; -def tc_bfe309d5 : InstrItinClass; -def tc_c00bf9c9 : InstrItinClass; -def tc_c4b515c5 : InstrItinClass; -def tc_cbf6d1dc : InstrItinClass; -def tc_cedf314b : InstrItinClass; -def tc_d2cb81ea : InstrItinClass; -def tc_d5090f3e : InstrItinClass; -def tc_d642eff3 : InstrItinClass; -def tc_d725e5b0 : InstrItinClass; -def tc_d7bea0ec : InstrItinClass; -def tc_d98f4d63 : InstrItinClass; -def tc_da979fb3 : InstrItinClass; -def tc_db5b9e2f : InstrItinClass; -def tc_df54ad52 : InstrItinClass; -def tc_e172d86a : InstrItinClass; -def tc_e231aa4f : InstrItinClass; -def tc_e3748cdf : InstrItinClass; -def tc_e5053c8f : InstrItinClass; -def tc_e6299d16 : InstrItinClass; -def tc_eb669007 : InstrItinClass; -def tc_ec58f88a : InstrItinClass; -def tc_eda67dcd : InstrItinClass; -def tc_ee927c0e : InstrItinClass; -def tc_f3fc3f83 : InstrItinClass; -def tc_fa99dc24 : InstrItinClass; +def tc_04da405a : InstrItinClass; +def tc_05058f6f : InstrItinClass; +def tc_05ac6f98 : InstrItinClass; +def tc_05ca8cfd : InstrItinClass; +def tc_08a4f1b6 : InstrItinClass; +def tc_0b04c6c7 : InstrItinClass; +def tc_0ec46cf9 : InstrItinClass; +def tc_131f1c81 : InstrItinClass; +def tc_1381a97c : InstrItinClass; +def tc_15fdf750 : InstrItinClass; +def tc_16ff9ef8 : InstrItinClass; +def tc_191381c1 : InstrItinClass; +def tc_1ad8a370 : InstrItinClass; +def tc_1ba8a0cd : InstrItinClass; +def tc_20a4bbec : InstrItinClass; +def tc_257f6f7c : InstrItinClass; +def tc_26a377fe : InstrItinClass; +def tc_2c745bb8 : InstrItinClass; +def tc_2d4051cd : InstrItinClass; +def tc_2e8f5f6e : InstrItinClass; +def tc_309dbb4f : InstrItinClass; +def tc_3904b926 : InstrItinClass; +def tc_3aacf4a8 : InstrItinClass; +def tc_3ad719fb : InstrItinClass; +def tc_3c56e5ce : InstrItinClass; +def tc_3ce09744 : InstrItinClass; +def tc_3e2aaafc : InstrItinClass; +def tc_447d9895 : InstrItinClass; +def tc_453fe68d : InstrItinClass; +def tc_46d6c3e0 : InstrItinClass; +def tc_51d0ecc3 : InstrItinClass; +def tc_52447ecc : InstrItinClass; +def tc_540c3da3 : InstrItinClass; +def tc_54a0dc47 : InstrItinClass; +def tc_561aaa58 : InstrItinClass; +def tc_56c4f9fe : InstrItinClass; +def tc_56e64202 : InstrItinClass; +def tc_58d21193 : InstrItinClass; +def tc_5bf8afbb : InstrItinClass; +def tc_61bf7c03 : InstrItinClass; +def tc_649072c2 : InstrItinClass; +def tc_660769f1 : InstrItinClass; +def tc_663c80a7 : InstrItinClass; +def tc_6942b6e0 : InstrItinClass; +def tc_6e7fa133 : InstrItinClass; +def tc_71646d06 : InstrItinClass; +def tc_7177e272 : InstrItinClass; +def tc_718b5c53 : InstrItinClass; +def tc_7273323b : InstrItinClass; +def tc_7417e785 : InstrItinClass; +def tc_767c4e9d : InstrItinClass; +def tc_7e6a3e89 : InstrItinClass; +def tc_8772086c : InstrItinClass; +def tc_87adc037 : InstrItinClass; +def tc_8e420e4d : InstrItinClass; +def tc_90bcc1db : InstrItinClass; +def tc_933f2b39 : InstrItinClass; +def tc_946013d8 : InstrItinClass; +def tc_9d1dc972 : InstrItinClass; +def tc_9f363d21 : InstrItinClass; +def tc_a02a10a8 : InstrItinClass; +def tc_a0dbea28 : InstrItinClass; +def tc_a7e6707d : InstrItinClass; +def tc_ab23f776 : InstrItinClass; +def tc_abe8c3b2 : InstrItinClass; +def tc_ac4046bc : InstrItinClass; +def tc_af25efd9 : InstrItinClass; +def tc_b091f1c6 : InstrItinClass; +def tc_b28e51aa : InstrItinClass; +def tc_b4416217 : InstrItinClass; +def tc_b9db8205 : InstrItinClass; +def tc_c0749f3c : InstrItinClass; +def tc_c127de3a : InstrItinClass; +def tc_c4edf264 : InstrItinClass; +def tc_c5dba46e : InstrItinClass; +def tc_c7039829 : InstrItinClass; +def tc_cd94bfe0 : InstrItinClass; +def tc_d8287c14 : InstrItinClass; +def tc_db5555f3 : InstrItinClass; +def tc_dd5b0695 : InstrItinClass; +def tc_df80eeb0 : InstrItinClass; +def tc_e2d2e9e5 : InstrItinClass; +def tc_e35c1e93 : InstrItinClass; +def tc_e3f68a46 : InstrItinClass; +def tc_e675c45a : InstrItinClass; +def tc_e699ae41 : InstrItinClass; +def tc_e8797b98 : InstrItinClass; +def tc_e99d4c2e : InstrItinClass; +def tc_f1de44ef : InstrItinClass; +def tc_f21e8abb : InstrItinClass; +def tc_fd7610da : InstrItinClass; class DepHVXItinV55 { list DepHVXItinV55_list = [ - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 5], + [HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [1, 2, 5], + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], [Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], - [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [3, 2], - [HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], + [HVX_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [1, 2, 5], - [Hex_FWD, Hex_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [SLOT1], 0>, - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_XLANE]>], [9, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 2], - [HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_ALL]>], [], + []>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], - [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], + [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [SLOT1], 0>, - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], + [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - - InstrItinData , InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], + InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], - [HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], + [HVX_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , + InstrItinData , + InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ST]>], [1, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], - [HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], - [HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], + InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5], + [HVX_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_LD]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], - [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [SLOT1], 0>, - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], + [Hex_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], - [HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], - [Hex_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLANE]>], [9, 2], + [HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], - [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], + InstrStage<1, [CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], + InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, - InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], + [HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], - [HVX_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ALL]>], [3, 2], + [HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [2], - [Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_ALL]>], [3], - [HVX_FWD]>, + InstrStage<1, [CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5], - [HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 5], - [HVX_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_ZW]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ST]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [], - []>, + InstrStage<1, [CVI_ALL]>], [2], + [Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5], + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], [HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], - [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, - InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], + [HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]> - ]; -} + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, -class DepHVXItinV60 { - list DepHVXItinV60_list = [ - InstrItinData , + InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], + InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [1, 2, 5], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], - [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [3, 2], - [HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ZW]>], [2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [1, 2, 5], - [Hex_FWD, Hex_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [SLOT1], 0>, - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_XLANE]>], [9, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 2], - [HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_ALL]>], [3], + [HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ZW]>], [1, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], - [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [SLOT1], 0>, - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [3, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]> + ]; +} - InstrItinData DepHVXItinV60_list = [ + InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_XLSHF]>], [9, 5], + [HVX_FWD, HVX_FWD]>, - InstrItinData , + InstrItinData , InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_LD], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], - [HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], - [HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], + [HVX_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], - [HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ALL]>], [], + []>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], + [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , - InstrStage<1, [SLOT1], 0>, - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ST]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], - [HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], - [Hex_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], - [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], + [HVX_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], + InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ST]>], [1, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, - InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , InstrStage<1, [CVI_ST], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], - [HVX_FWD, Hex_FWD, HVX_FWD]>, - - InstrItinData , InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_ALL]>], [2], - [Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_SHIFT]>], [9, 5], + [HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], + [Hex_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], + InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [3], - [HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5], - [HVX_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 2], + [HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 5], - [HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], - [Hex_FWD, Hex_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_ALL]>], [], - []>, + InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5], - [HVX_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], + [HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, - InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ALL]>], [3, 2], + [HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 5], + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]> - ]; -} + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, -class DepHVXItinV62 { - list DepHVXItinV62_list = [ - InstrItinData , InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [1, 2, 5], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], - [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [3, 2], - [HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [1, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [1, 2, 5], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ZW]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 2], - [HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], - [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ST]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [SLOT1], 0>, + InstrItinData , InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [3, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ALL]>], [2], + [Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], [HVX_FWD, HVX_FWD]>, - InstrItinData , + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], [HVX_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], - [HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_ZW]>], [2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], - [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], + InstrStage<1, [CVI_ALL]>], [3], [HVX_FWD]>, - InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ZW]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLANE]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], - [Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]> + ]; +} - InstrItinData DepHVXItinV62_list = [ + InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLSHF]>], [9, 5], + [HVX_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], - [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], + [HVX_FWD, HVX_FWD]>, - InstrItinData , + InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], + InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ALL]>], [], + []>, + + InstrItinData , InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, - InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], + [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], - [HVX_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], + [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [2], - [Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ST]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], + InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [3], - [HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], + [HVX_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5], - [HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ST]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData , - InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 5], - [HVX_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_LD], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5], + [HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], + [Hex_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [], - []>, + InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5], - [HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], - [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 2], + [HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, - InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]> - ]; -} + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, -class DepHVXItinV65 { - list DepHVXItinV65_list = [ - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [1, 2, 5], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], - [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [3, 2], - [HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], + [HVX_FWD]>, - InstrItinData , - InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [1, 2, 5], - [Hex_FWD, Hex_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [SLOT1], 0>, - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_XLANE]>], [9, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 2], + InstrStage<1, [CVI_ALL]>], [3, 2], [HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , InstrStage<1, [CVI_ST], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [SLOT1], 0>, - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_ZW]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [3, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ST]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_LD], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], - [HVX_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ALL]>], [2], + [Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], [HVX_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], - [HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ZW]>], [2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL]>], [3], + [HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD]>], [9, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_ZW]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]> + ]; +} + +class DepHVXItinV65 { + list DepHVXItinV65_list = [ + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , InstrStage<1, [CVI_MPY01]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL]>], [], + []>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], + [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], + [HVX_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData , InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], + InstrStage<1, [CVI_XLANE]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], + [Hex_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], + [HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL]>], [3, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ZW]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL]>], [2], + [Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ZW]>], [2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL]>], [3], + [HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ZW]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]> + ]; +} + +class DepHVXItinV66 { + list DepHVXItinV66_list = [ + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL]>], [], + []>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], + [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], + [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], + [HVX_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_SHIFT]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], + [Hex_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], - [HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], + InstrItinData , + InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], - [Hex_FWD, HVX_FWD, Hex_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLANE]>], [9, 2], + [HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], - [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], + InstrStage<1, [CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], + InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, - InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], + [HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], - [HVX_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ALL]>], [3, 2], + [HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [2], - [Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], - [Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], + InstrItinData , + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [3], - [HVX_FWD]>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_SHIFT]>], [9, 5], - [HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ZW]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , InstrStage<1, [SLOT1], 0>, - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLSHF]>], [9, 5], - [HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, - InstrItinData , + InstrStage<1, [CVI_ALL]>], [2], + [Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], + InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], - [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - - InstrItinData , - InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], - [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], + [HVX_FWD, Hex_FWD]>, - InstrItinData , InstrStage<1, [CVI_ST], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], [Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_ALL]>], [], - []>, + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5], - [HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_LD], 0>, - InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], - [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrStage<1, [CVI_ZW]>], [2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, - InstrItinData , + InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_ST], 0>, - InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], - [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY01]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ALL]>], [3], + [HVX_FWD]>, - InstrItinData , - InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, - InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], - [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData , - InstrStage<1, [CVI_XLANE]>], [9, 5, 5], - [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData , + InstrStage<1, [CVI_ZW]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, - InstrItinData , InstrStage<1, [CVI_MPY01]>], [9, 5, 2], - [HVX_FWD, HVX_FWD, Hex_FWD]> + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData , + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData , + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]> ]; } diff --git a/lib/Target/Hexagon/HexagonDepIICScalar.td b/lib/Target/Hexagon/HexagonDepIICScalar.td index 931504b56ccbf2b3b302bd3ba4075c5733cc2d9f..9da25952fb1c96e0312b71ab21139a5c96b19b7d 100644 --- a/lib/Target/Hexagon/HexagonDepIICScalar.td +++ b/lib/Target/Hexagon/HexagonDepIICScalar.td @@ -1,4 +1,4 @@ -//===- HexagonDepIICScalar.td ---------------------------------------------===// +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -9,3087 +9,3789 @@ // Automatically generated file, please consult code owner before editing. //===----------------------------------------------------------------------===// - -def tc_00afc57e : InstrItinClass; -def tc_00e7c26e : InstrItinClass; -def tc_03220ffa : InstrItinClass; -def tc_038a1342 : InstrItinClass; -def tc_04c9decc : InstrItinClass; -def tc_05b6c987 : InstrItinClass; -def tc_0cd51c76 : InstrItinClass; -def tc_0dc560de : InstrItinClass; -def tc_0fc1ae07 : InstrItinClass; -def tc_10b97e27 : InstrItinClass; -def tc_1372bca1 : InstrItinClass; -def tc_14cd4cfa : InstrItinClass; -def tc_15411484 : InstrItinClass; -def tc_16d0d8d5 : InstrItinClass; -def tc_181af5d0 : InstrItinClass; -def tc_1853ea6d : InstrItinClass; -def tc_1b82a277 : InstrItinClass; -def tc_1b9c9ee5 : InstrItinClass; -def tc_1d5a38a8 : InstrItinClass; -def tc_1e856f58 : InstrItinClass; -def tc_234a11a5 : InstrItinClass; -def tc_238d91d2 : InstrItinClass; -def tc_29175780 : InstrItinClass; -def tc_2a160009 : InstrItinClass; -def tc_2b2f4060 : InstrItinClass; -def tc_2b6f77c6 : InstrItinClass; -def tc_2f185f5c : InstrItinClass; -def tc_2fc0c436 : InstrItinClass; -def tc_351fed2d : InstrItinClass; -def tc_3669266a : InstrItinClass; -def tc_367f7f3d : InstrItinClass; -def tc_36c68ad1 : InstrItinClass; -def tc_395dc00f : InstrItinClass; -def tc_3bc2c5d3 : InstrItinClass; -def tc_3cb8ea06 : InstrItinClass; -def tc_3d04548d : InstrItinClass; -def tc_3da80ba5 : InstrItinClass; -def tc_3e07fb90 : InstrItinClass; -def tc_41d5298e : InstrItinClass; -def tc_4403ca65 : InstrItinClass; -def tc_44126683 : InstrItinClass; -def tc_452f85af : InstrItinClass; -def tc_481e5e5c : InstrItinClass; -def tc_49eb22c8 : InstrItinClass; -def tc_4ca572d4 : InstrItinClass; -def tc_4d9914c9 : InstrItinClass; -def tc_4d99bca9 : InstrItinClass; -def tc_4f7cd700 : InstrItinClass; -def tc_513bef45 : InstrItinClass; -def tc_51b866be : InstrItinClass; -def tc_523fcf30 : InstrItinClass; -def tc_5274e61a : InstrItinClass; -def tc_52d7bbea : InstrItinClass; -def tc_53bc8a6a : InstrItinClass; -def tc_53bdb2f6 : InstrItinClass; -def tc_540fdfbc : InstrItinClass; -def tc_55050d58 : InstrItinClass; -def tc_57288781 : InstrItinClass; -def tc_594ab548 : InstrItinClass; -def tc_59a01ead : InstrItinClass; -def tc_5acef64a : InstrItinClass; -def tc_5ba5997d : InstrItinClass; -def tc_5eb851fc : InstrItinClass; -def tc_5f6847a1 : InstrItinClass; -def tc_60571023 : InstrItinClass; -def tc_609d2efe : InstrItinClass; -def tc_63fe3df7 : InstrItinClass; -def tc_66888ded : InstrItinClass; -def tc_6792d5ff : InstrItinClass; -def tc_681a2300 : InstrItinClass; -def tc_68cb12ce : InstrItinClass; -def tc_6aa5711a : InstrItinClass; -def tc_6ac37025 : InstrItinClass; -def tc_6ebb4a12 : InstrItinClass; -def tc_6efc556e : InstrItinClass; -def tc_6fa4db47 : InstrItinClass; -def tc_73043bf4 : InstrItinClass; -def tc_746baa8e : InstrItinClass; -def tc_74e47fd9 : InstrItinClass; -def tc_7934b9df : InstrItinClass; -def tc_7a830544 : InstrItinClass; -def tc_7f881c76 : InstrItinClass; -def tc_84df2cd3 : InstrItinClass; -def tc_855b0b61 : InstrItinClass; -def tc_87735c3b : InstrItinClass; -def tc_897d1a9d : InstrItinClass; -def tc_8b15472a : InstrItinClass; -def tc_8fd5f294 : InstrItinClass; -def tc_8fe6b782 : InstrItinClass; -def tc_90f3e30c : InstrItinClass; -def tc_976ddc4f : InstrItinClass; -def tc_97743097 : InstrItinClass; -def tc_994333cd : InstrItinClass; -def tc_999d32db : InstrItinClass; -def tc_99be14ca : InstrItinClass; -def tc_9c00ce8d : InstrItinClass; -def tc_9c98e8af : InstrItinClass; -def tc_9d5941c7 : InstrItinClass; -def tc_9ef61e5c : InstrItinClass; -def tc_9faf76ae : InstrItinClass; -def tc_9fdb5406 : InstrItinClass; -def tc_a21dc435 : InstrItinClass; -def tc_a27582fa : InstrItinClass; -def tc_a46f0df5 : InstrItinClass; -def tc_a788683e : InstrItinClass; -def tc_a8acdac0 : InstrItinClass; -def tc_a904d137 : InstrItinClass; -def tc_adb14c66 : InstrItinClass; -def tc_b13761ae : InstrItinClass; -def tc_b166348b : InstrItinClass; -def tc_b44c6e2a : InstrItinClass; -def tc_b77c481f : InstrItinClass; -def tc_b7dd427e : InstrItinClass; -def tc_b9488031 : InstrItinClass; -def tc_b9c0b731 : InstrItinClass; -def tc_b9c4623f : InstrItinClass; -def tc_bad2bcaf : InstrItinClass; -def tc_bcc96cee : InstrItinClass; -def tc_bde7aaf4 : InstrItinClass; -def tc_be706f30 : InstrItinClass; -def tc_c2f7d806 : InstrItinClass; -def tc_c5e2426d : InstrItinClass; -def tc_c6aa82f7 : InstrItinClass; -def tc_c6ce9b3f : InstrItinClass; -def tc_c6ebf8dd : InstrItinClass; -def tc_c74f796f : InstrItinClass; -def tc_c82dc1ff : InstrItinClass; -def tc_caaebcba : InstrItinClass; -def tc_cd7374a0 : InstrItinClass; -def tc_cde8b071 : InstrItinClass; -def tc_cf47a43f : InstrItinClass; -def tc_cf59f215 : InstrItinClass; -def tc_d088982c : InstrItinClass; -def tc_d1090e34 : InstrItinClass; -def tc_d24b2d85 : InstrItinClass; -def tc_d580173f : InstrItinClass; -def tc_d6bf0472 : InstrItinClass; -def tc_d9709180 : InstrItinClass; -def tc_d9f95eef : InstrItinClass; -def tc_daa058fa : InstrItinClass; -def tc_dbdffe3d : InstrItinClass; -def tc_e0739b8c : InstrItinClass; -def tc_e1e99bfa : InstrItinClass; -def tc_e216a5db : InstrItinClass; -def tc_e421e012 : InstrItinClass; -def tc_e7624c08 : InstrItinClass; -def tc_e7d02c66 : InstrItinClass; -def tc_e913dc32 : InstrItinClass; -def tc_e9c822f7 : InstrItinClass; -def tc_e9fae2d6 : InstrItinClass; -def tc_ef52ed71 : InstrItinClass; -def tc_ef84f62f : InstrItinClass; -def tc_f2704b9a : InstrItinClass; -def tc_f3eaa14b : InstrItinClass; -def tc_f47d212f : InstrItinClass; -def tc_f49e76f4 : InstrItinClass; -def tc_f7dd9c9f : InstrItinClass; -def tc_f86c328a : InstrItinClass; -def tc_f8eeed7a : InstrItinClass; -def tc_fcab4871 : InstrItinClass; -def tc_ff9ee76e : InstrItinClass; - -class DepScalarItinV4 { - list DepScalarItinV4_list = [ - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]> ]; -} +def tc_002cb246 : InstrItinClass; +def tc_0371abea : InstrItinClass; +def tc_05c070ec : InstrItinClass; +def tc_05d3a09b : InstrItinClass; +def tc_0663f615 : InstrItinClass; +def tc_096199d3 : InstrItinClass; +def tc_0a705168 : InstrItinClass; +def tc_0ae0825c : InstrItinClass; +def tc_0b2be201 : InstrItinClass; +def tc_0d8f5752 : InstrItinClass; +def tc_13bfbcf9 : InstrItinClass; +def tc_14b272fa : InstrItinClass; +def tc_14b5c689 : InstrItinClass; +def tc_15aa71c5 : InstrItinClass; +def tc_174516e8 : InstrItinClass; +def tc_17e0d2cd : InstrItinClass; +def tc_1a2fd869 : InstrItinClass; +def tc_1ad90acd : InstrItinClass; +def tc_1ae57e39 : InstrItinClass; +def tc_1b6f7cec : InstrItinClass; +def tc_1c4528a2 : InstrItinClass; +def tc_1c80410a : InstrItinClass; +def tc_1d81e60e : InstrItinClass; +def tc_1fc97744 : InstrItinClass; +def tc_20cdee80 : InstrItinClass; +def tc_2332b92e : InstrItinClass; +def tc_24b66c99 : InstrItinClass; +def tc_25a78932 : InstrItinClass; +def tc_2b8da4c2 : InstrItinClass; +def tc_2eabeebe : InstrItinClass; +def tc_2f7c551d : InstrItinClass; +def tc_2ff964b4 : InstrItinClass; +def tc_30b9bb4a : InstrItinClass; +def tc_32779c6f : InstrItinClass; +def tc_36153880 : InstrItinClass; +def tc_362c6592 : InstrItinClass; +def tc_3962fa26 : InstrItinClass; +def tc_39dfefe8 : InstrItinClass; +def tc_3a867367 : InstrItinClass; +def tc_3b470976 : InstrItinClass; +def tc_3b5b7ef9 : InstrItinClass; +def tc_3bd75825 : InstrItinClass; +def tc_3c76b0ff : InstrItinClass; +def tc_3d495a39 : InstrItinClass; +def tc_40116ca8 : InstrItinClass; +def tc_434c8e1e : InstrItinClass; +def tc_4414d8b1 : InstrItinClass; +def tc_44d3da28 : InstrItinClass; +def tc_4560740b : InstrItinClass; +def tc_4837eefb : InstrItinClass; +def tc_49a8207d : InstrItinClass; +def tc_4ae7b58b : InstrItinClass; +def tc_4b68bce4 : InstrItinClass; +def tc_4c5ba658 : InstrItinClass; +def tc_4d5fa3a1 : InstrItinClass; +def tc_53559e35 : InstrItinClass; +def tc_56336eb0 : InstrItinClass; +def tc_56f114f4 : InstrItinClass; +def tc_57890846 : InstrItinClass; +def tc_5a2711e5 : InstrItinClass; +def tc_5abb5e3f : InstrItinClass; +def tc_5aee39f7 : InstrItinClass; +def tc_5b54b33f : InstrItinClass; +def tc_5b7c0967 : InstrItinClass; +def tc_5bf126a6 : InstrItinClass; +def tc_5d7f5414 : InstrItinClass; +def tc_5ef37dc4 : InstrItinClass; +def tc_6132ba3d : InstrItinClass; +def tc_61830035 : InstrItinClass; +def tc_640086b5 : InstrItinClass; +def tc_643b4717 : InstrItinClass; +def tc_67435e81 : InstrItinClass; +def tc_675e4897 : InstrItinClass; +def tc_679309b8 : InstrItinClass; +def tc_6b25e783 : InstrItinClass; +def tc_703e822c : InstrItinClass; +def tc_7186d325 : InstrItinClass; +def tc_7646c131 : InstrItinClass; +def tc_76851da1 : InstrItinClass; +def tc_779080bf : InstrItinClass; +def tc_784490da : InstrItinClass; +def tc_785f65a7 : InstrItinClass; +def tc_7a91e76a : InstrItinClass; +def tc_838b34ea : InstrItinClass; +def tc_85c9c08f : InstrItinClass; +def tc_85d5d03f : InstrItinClass; +def tc_862b3e70 : InstrItinClass; +def tc_88b4f13d : InstrItinClass; +def tc_89e94ad3 : InstrItinClass; +def tc_8b121f4a : InstrItinClass; +def tc_8b3e402a : InstrItinClass; +def tc_8c945be0 : InstrItinClass; +def tc_8c99de45 : InstrItinClass; +def tc_8d9d0154 : InstrItinClass; +def tc_8fb7ab1b : InstrItinClass; +def tc_9461ff31 : InstrItinClass; +def tc_946df596 : InstrItinClass; +def tc_9ad9998f : InstrItinClass; +def tc_9bfd761f : InstrItinClass; +def tc_9c3ecd83 : InstrItinClass; +def tc_9ca930f7 : InstrItinClass; +def tc_9da59d12 : InstrItinClass; +def tc_9debc299 : InstrItinClass; +def tc_9e313203 : InstrItinClass; +def tc_9fc3dae0 : InstrItinClass; +def tc_a1123dda : InstrItinClass; +def tc_a1c00888 : InstrItinClass; +def tc_a58fd5cc : InstrItinClass; +def tc_a5d4aeec : InstrItinClass; +def tc_a6b1eca9 : InstrItinClass; +def tc_a813cf9a : InstrItinClass; +def tc_a9d88b22 : InstrItinClass; +def tc_ae53734a : InstrItinClass; +def tc_b31c2e97 : InstrItinClass; +def tc_b343892a : InstrItinClass; +def tc_b43e7930 : InstrItinClass; +def tc_b4407292 : InstrItinClass; +def tc_b44ecf75 : InstrItinClass; +def tc_b4b5c03a : InstrItinClass; +def tc_b51dc29a : InstrItinClass; +def tc_b83e6d73 : InstrItinClass; +def tc_b857bf4e : InstrItinClass; +def tc_b8bffe55 : InstrItinClass; +def tc_b90a29b1 : InstrItinClass; +def tc_b9272d6c : InstrItinClass; +def tc_b9e09e03 : InstrItinClass; +def tc_bab0eed9 : InstrItinClass; +def tc_bafaade3 : InstrItinClass; +def tc_bcf98408 : InstrItinClass; +def tc_bd8382d1 : InstrItinClass; +def tc_bdceeac1 : InstrItinClass; +def tc_be9602ff : InstrItinClass; +def tc_bf061958 : InstrItinClass; +def tc_bfec0f01 : InstrItinClass; +def tc_c4db48cb : InstrItinClass; +def tc_c4f596e3 : InstrItinClass; +def tc_c79a189f : InstrItinClass; +def tc_c8ce0b5c : InstrItinClass; +def tc_cd374165 : InstrItinClass; +def tc_cf8126ae : InstrItinClass; +def tc_cfd8378a : InstrItinClass; +def tc_d08ee0f4 : InstrItinClass; +def tc_d1aa9eaa : InstrItinClass; +def tc_d2e63d61 : InstrItinClass; +def tc_d5b7b0c1 : InstrItinClass; +def tc_d5c0729a : InstrItinClass; +def tc_d63f638c : InstrItinClass; +def tc_d65dbf51 : InstrItinClass; +def tc_d773585a : InstrItinClass; +def tc_d9d43ecb : InstrItinClass; +def tc_da4a37ed : InstrItinClass; +def tc_da97ee82 : InstrItinClass; +def tc_db2bce9c : InstrItinClass; +def tc_de4df740 : InstrItinClass; +def tc_de554571 : InstrItinClass; +def tc_df3319ed : InstrItinClass; +def tc_e06f432a : InstrItinClass; +def tc_e4a7f9f0 : InstrItinClass; +def tc_e4b3cb20 : InstrItinClass; +def tc_e78647bd : InstrItinClass; +def tc_e86aa961 : InstrItinClass; +def tc_e93a3d71 : InstrItinClass; +def tc_e95795ec : InstrItinClass; +def tc_e9f3243f : InstrItinClass; +def tc_f429765c : InstrItinClass; +def tc_f675fee8 : InstrItinClass; +def tc_f8e23f0b : InstrItinClass; +def tc_f9058dd7 : InstrItinClass; +def tc_fc3999b4 : InstrItinClass; +def tc_fcc3ddf9 : InstrItinClass; +def tc_fe211424 : InstrItinClass; class DepScalarItinV5 { list DepScalarItinV5_list = [ - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]>, - InstrItinData ]> ]; + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]>, + InstrItinData ]> ]; } class DepScalarItinV55 { list DepScalarItinV55_list = [ - InstrItinData ], [4, 2], + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [3, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2, 2], + InstrItinData ], [2, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [2, 1], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 2], + InstrItinData ], [1, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3], + InstrItinData ], [1], [Hex_FWD]>, - InstrItinData ], [3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2, 2], + InstrItinData ], [4, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], + InstrItinData ], [2, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2, 2], + InstrItinData ], [3, 3, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [], + InstrItinData ], [], []>, - InstrItinData ], [3, 3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [3, 1], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [], - []>, - - InstrItinData ], [], - []>, - - InstrItinData ], [4, 3, 1], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [1, 2], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], + InstrItinData ], [1, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, - - InstrItinData ], [3, 1, 1, 2, 2], + InstrItinData ], [3, 2, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 3, 1, 2, 2], + InstrItinData ], [4, 3, 1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2], + InstrItinData ], [4, 2, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [3, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [5, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, - - InstrItinData ], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 4, 1], + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [3, 1, 2], + InstrItinData ], [2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 4, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2, 1, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], + InstrItinData ], [], []>, - InstrItinData ], [3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 3], + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [4, 3, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2, 2], + InstrItinData ], [4, 3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 3, 2], + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [4, 2, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [2, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 3, 1, 2], + InstrItinData ], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [3, 3, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], + InstrItinData ], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2, 2, 3], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2], + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, - - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [1, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2, 1], + InstrItinData ], [2, 3], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1], + [Hex_FWD]>, - InstrItinData ], [3, 2, 2, 2], + InstrItinData ], [4, 2, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 1, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 1, 1], + InstrItinData ], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [1], + [Hex_FWD]>, - InstrItinData ], [4, 1], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 4, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 3, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 2, 2], + InstrItinData ], [3, 2, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1], + InstrItinData ], [2, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], + InstrItinData ], [3, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2], [Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [1, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3], + [Hex_FWD]>, - InstrItinData ], [2, 1, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [], + InstrItinData ], [], []>, - InstrItinData ], [1, 1, 2, 3], + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [2, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 3, 2, 1, 2], + InstrItinData ], [4, 4, 1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [4, 3, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 1, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3], + [Hex_FWD]>, - InstrItinData ], [4, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2], + InstrItinData ], [4, 1, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, - - InstrItinData ], [4, 4, 1, 1, 1], + InstrItinData ], [4, 2, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [1, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 1], + InstrItinData ], [3, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [1], + [Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [4, 4, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3], - [Hex_FWD]>, + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 1], + InstrItinData ], [4, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3], - [Hex_FWD]>, + InstrItinData ], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [4, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 4, 2, 2], + InstrItinData ], [4, 2, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2, 2], + InstrItinData ], [3, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 3], - [Hex_FWD, Hex_FWD]> - ]; -} + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, -class DepScalarItinV60 { - list DepScalarItinV60_list = [ - InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [3, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 2, 1, 1, 2], + InstrItinData ], [4, 4, 1, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], + InstrItinData ], [3, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1], + [Hex_FWD]>, - InstrItinData ], [4, 2, 1, 2, 2], + InstrItinData ], [3, 1, 2, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2], + InstrItinData ], [1], [Hex_FWD]>, - InstrItinData ], [2, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, + InstrItinData ], [1, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 2], + InstrItinData ], [3, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1], + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 2], + InstrItinData ], [3, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3], + [Hex_FWD]>, + + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 1], + [Hex_FWD, Hex_FWD]> + ]; +} + +class DepScalarItinV60 { + list DepScalarItinV60_list = [ + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [], + []>, + + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [], + []>, + + InstrItinData ], [5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [5, 5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [], + []>, + + InstrItinData ], [2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [], + []>, + + InstrItinData ], [5, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [5, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [3, 3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 3], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3], + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [], + []>, + + InstrItinData ], [3], + [Hex_FWD]>, + + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [], + []>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [], + []>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3], + [Hex_FWD]>, + + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [5, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [], + []>, + + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [5, 5, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 4, 1, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [3, 1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [3, 3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3], + [Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2], [Hex_FWD]>, - InstrItinData ], [1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 1], + [Hex_FWD, Hex_FWD]> + ]; +} + +class DepScalarItinV62 { + list DepScalarItinV62_list = [ + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1], + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 2, 2], + InstrItinData ], [1, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 1, 2], + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1], + InstrItinData ], [2, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, + InstrItinData ], [3, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], + InstrItinData ], [], []>, - InstrItinData ], [], - []>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [1, 2], + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [1, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1, 2, 2], + InstrItinData ], [3, 2, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 3, 1, 2, 2], + InstrItinData ], [4, 3, 1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [3, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [5, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, - - InstrItinData ], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [5, 5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 5, 1], + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 4, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2, 1, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], + InstrItinData ], [], []>, - InstrItinData ], [3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [5, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 3], + InstrItinData ], [5, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [4, 3, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2, 2], + InstrItinData ], [4, 3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [5, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 3, 2], + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 2], + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [2, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 1, 2], + InstrItinData ], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [3, 3, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 3, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 1, 1], + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, + InstrItinData ], [1, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1], + InstrItinData ], [2, 3], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2, 3], + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2], + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 2], + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 3, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2, 2], + InstrItinData ], [3, 2, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], + InstrItinData ], [3, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [2, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3], + [Hex_FWD]>, - InstrItinData ], [4, 1], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [5, 5, 1, 1], + InstrItinData ], [2, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 2, 2], + InstrItinData ], [3, 4, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1], + InstrItinData ], [1, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, + InstrItinData ], [3, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3], + [Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 1, 2, 3], + InstrItinData ], [5, 2, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 3, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 3, 1, 1, 2], + InstrItinData ], [4, 2, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 1], + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [3, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [2, 3, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2], + InstrItinData ], [4, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [3, 1, 1], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 2], + InstrItinData ], [3, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], + InstrItinData ], [1], [Hex_FWD]>, - InstrItinData ], [4, 4, 1, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [5, 5, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 1, 2, 2], + InstrItinData ], [3, 1, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], + InstrItinData ], [4, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 2, 1, 1], + InstrItinData ], [4, 2, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 1, 2, 2], + InstrItinData ], [4, 1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 1, 2, 3], + InstrItinData ], [4, 2, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 1], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [3, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1, 2, 3], + InstrItinData ], [4, 4, 2, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3], - [Hex_FWD]>, + InstrItinData ], [3, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1], + [Hex_FWD]>, - InstrItinData ], [4, 2, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3], + InstrItinData ], [1], [Hex_FWD]>, - InstrItinData ], [2, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 4, 2, 2], + InstrItinData ], [4, 2, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [5, 1], + InstrItinData ], [1, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [1, 2, 3], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [3, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 3], - [Hex_FWD, Hex_FWD]> - ]; -} + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, -class DepScalarItinV62 { - list DepScalarItinV62_list = [ - InstrItinData ], [4, 2], + InstrItinData ], [2, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], + InstrItinData ], [3], [Hex_FWD]>, - InstrItinData ], [4, 3, 2, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 2, 1, 1, 2], + InstrItinData ], [4, 2, 1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 2, 2], + InstrItinData ], [4, 3, 3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2, 2], + InstrItinData ], [4, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, + InstrItinData ], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], [Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, - - InstrItinData ], [4, 2, 1, 2], + InstrItinData ], [1, 2, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 1], + [Hex_FWD, Hex_FWD]> + ]; +} - InstrItinData ], [4, 1, 2], +class DepScalarItinV65 { + list DepScalarItinV65_list = [ + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3], - [Hex_FWD]>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 2, 2], + InstrItinData ], [1, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [4, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 1, 2], + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1], + InstrItinData ], [2, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, + InstrItinData ], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], + InstrItinData ], [], []>, - InstrItinData ], [], - []>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [1, 2], + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1, 2, 2], + InstrItinData ], [3, 2, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 3, 1, 2, 2], + InstrItinData ], [4, 3, 1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2], + InstrItinData ], [3, 2, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [3, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [5, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, - - InstrItinData ], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [5, 5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 5, 1], + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 4, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2, 1, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], + InstrItinData ], [], []>, - InstrItinData ], [3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [5, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 3], + InstrItinData ], [5, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [4, 3, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2, 2], + InstrItinData ], [4, 3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [5, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 3, 2], + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 2], + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [2, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 1, 2], + InstrItinData ], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [4, 3, 1, 2, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 3, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 1, 1], + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, + InstrItinData ], [1, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1], + InstrItinData ], [2, 3], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2, 3], + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [3, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2], + InstrItinData ], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 2], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 3, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [4, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4], + [Hex_FWD]>, - InstrItinData ], [2, 1], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], + InstrItinData ], [], + []>, + + InstrItinData ], [2], [Hex_FWD]>, - InstrItinData ], [1, 2, 2], + InstrItinData ], [2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 5, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [3, 1, 2, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 1, 2], + InstrItinData ], [3, 4, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [1, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 2, 2], + InstrItinData ], [3, 3, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [3, 2], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [3], + [Hex_FWD]>, + + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2, 3], + InstrItinData ], [5, 2, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [4, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [1, 1, 2, 3], + InstrItinData ], [3, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [2, 3, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 2], + InstrItinData ], [3, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1], + [Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [5, 5, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1], + InstrItinData ], [4, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 2], + InstrItinData ], [4, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2], + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 4, 2, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 1, 2, 2], + InstrItinData ], [4, 2, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 2, 2], + InstrItinData ], [3, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 3, 1, 2, 3], + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 4, 2, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 2, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [1, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 1, 2, 3], + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 3, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 1], + InstrItinData ], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3], [Hex_FWD]>, - InstrItinData ], [3, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3], - [Hex_FWD]>, + InstrItinData ], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 4, 3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 2], + InstrItinData ], [4, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 2], + InstrItinData ], [4, 4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2, 2], + InstrItinData ], [2], + [Hex_FWD]>, + + InstrItinData ], [1, 2, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, - - InstrItinData ], [2, 3], + InstrItinData ], [1, 1], [Hex_FWD, Hex_FWD]> ]; } -class DepScalarItinV65 { - list DepScalarItinV65_list = [ - InstrItinData ], [4, 2], +class DepScalarItinV66 { + list DepScalarItinV66_list = [ + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 3], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 2, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2, 2], + InstrItinData ], [2, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], + InstrItinData ], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], + InstrItinData ], [1], [Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 2], + InstrItinData ], [4, 2, 2, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [2, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 2], + InstrItinData ], [4, 3, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3], - [Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2], + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [3, 2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 3, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [3, 2, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [4, 3, 1, 2], + InstrItinData ], [5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [5, 5, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, + InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [], []>, - InstrItinData ], [2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [], []>, - InstrItinData ], [3, 3, 1], + InstrItinData ], [5, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [5, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, + InstrItinData ], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [1], + [Hex_FWD]>, - InstrItinData ], [3, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, + InstrItinData ], [2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1, 2, 2], + InstrItinData ], [3, 1, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2], + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 1, 2, 2], + InstrItinData ], [5, 2, 1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2, 2], + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], + InstrItinData ], [2], [Hex_FWD]>, - InstrItinData ], [3, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [2, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [5, 5, 1], + InstrItinData ], [4, 3, 1, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], + InstrItinData ], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 1], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 4, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [2, 3], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 3], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2, 1, 2, 3], + InstrItinData ], [1, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 2], + InstrItinData ], [3, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1], + [Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 3, 2], + InstrItinData ], [1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [3, 2, 2], + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 4, 3, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [3, 2, 2], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, + InstrItinData ], [4, 3, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [1, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [4, 3, 1, 2, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4], + [Hex_FWD]>, - InstrItinData ], [2, 1], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [4, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2], + InstrItinData ], [2, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2, 2], + InstrItinData ], [4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [], + []>, - InstrItinData ], [4, 1, 1], + InstrItinData ], [3, 3, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [1, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], + InstrItinData ], [3, 3, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], + InstrItinData ], [3], [Hex_FWD]>, - InstrItinData ], [2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [5, 5, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [5, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 1, 2], + InstrItinData ], [4, 2, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [3, 1, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2], - [Hex_FWD]>, - - InstrItinData ], [3, 2], + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [1, 2, 2, 3], + InstrItinData ], [3, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [], + InstrItinData ], [], []>, - InstrItinData ], [1, 1, 2, 3], + InstrItinData ], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2], + InstrItinData ], [1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [1], + [Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [5, 5, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 2, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 1], + InstrItinData ], [4, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [4, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2], - [Hex_FWD, Hex_FWD]>, - - InstrItinData ], [2, 2], + InstrItinData ], [4, 1], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 2], + InstrItinData ], [4, 2, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1], + InstrItinData ], [1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2, 2], + InstrItinData ], [4, 1, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1], - [Hex_FWD]>, - - InstrItinData ], [4, 4, 2, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [3, 2, 1, 2, 2], + InstrItinData ], [4, 2, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], + InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 2, 2], + InstrItinData ], [3, 1, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1], + InstrItinData ], [3, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 3, 1, 2, 3], + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [4, 4, 2, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 2, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [1, 1, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 1, 2, 3], + InstrItinData ], [1], + [Hex_FWD]>, + + InstrItinData ], [4, 3, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [1, 1], + InstrItinData ], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2], + InstrItinData ], [3, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2], + InstrItinData ], [3, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 2], + [Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData ], [3], [Hex_FWD]>, - InstrItinData ], [3, 1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [4, 2, 1, 1], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4], - [Hex_FWD]>, + InstrItinData ], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [2, 2], + InstrItinData ], [2, 2], [Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 1, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [3, 1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [5, 1], - [Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 4, 3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 1, 1, 2], + InstrItinData ], [4, 2, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [4, 2, 2], - [Hex_FWD, Hex_FWD, Hex_FWD]>, - - InstrItinData ], [1, 2, 3], - [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData ], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 1, 2, 2], + InstrItinData ], [2, 1, 2, 3], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [3, 2, 2, 2], + InstrItinData ], [4, 4, 2, 2], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, - InstrItinData ], [], - []>, + InstrItinData ], [2], + [Hex_FWD]>, - InstrItinData ], [2, 3], + InstrItinData ], [1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData ], [1, 1], [Hex_FWD, Hex_FWD]> ]; } diff --git a/lib/Target/Hexagon/HexagonDepITypes.h b/lib/Target/Hexagon/HexagonDepITypes.h index 7e06ccede6e7af4f86ffedfbcef4f8215c06910e..81e3971e21d20ae17391759b0a9af940bca4087e 100644 --- a/lib/Target/Hexagon/HexagonDepITypes.h +++ b/lib/Target/Hexagon/HexagonDepITypes.h @@ -1,4 +1,4 @@ -//===- HexagonDepITypes.h -------------------------------------------------===// +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -9,7 +9,6 @@ // Automatically generated file, please consult code owner before editing. //===----------------------------------------------------------------------===// - namespace llvm { namespace HexagonII { enum Type { @@ -45,6 +44,7 @@ enum Type { TypeCVI_VX = 29, TypeCVI_VX_DV = 30, TypeCVI_VX_LATE = 31, + TypeCVI_ZW = 32, TypeDUPLEX = 33, TypeENDLOOP = 34, TypeEXTENDER = 35, @@ -59,7 +59,7 @@ enum Type { TypeS_2op = 44, TypeS_3op = 45, TypeV2LDST = 48, - TypeV4LDST = 49 + TypeV4LDST = 49, }; } } diff --git a/lib/Target/Hexagon/HexagonDepITypes.td b/lib/Target/Hexagon/HexagonDepITypes.td index 0a385bf938fed1c9f6625107c04d5f1d47244f9f..f694062a5232f11e3ae560b34ea0ccb995dcc370 100644 --- a/lib/Target/Hexagon/HexagonDepITypes.td +++ b/lib/Target/Hexagon/HexagonDepITypes.td @@ -1,4 +1,4 @@ -//===- HexagonDepITypes.td ------------------------------------------------===// +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -9,8 +9,7 @@ // Automatically generated file, please consult code owner before editing. //===----------------------------------------------------------------------===// - -class IType t> { bits<6> Value = t; } +class IType t> { bits<7> Value = t; } def TypeALU32_2op : IType<0>; def TypeALU32_3op : IType<1>; def TypeALU32_ADDI : IType<2>; @@ -43,6 +42,7 @@ def TypeCVI_VS_VX : IType<28>; def TypeCVI_VX : IType<29>; def TypeCVI_VX_DV : IType<30>; def TypeCVI_VX_LATE : IType<31>; +def TypeCVI_ZW : IType<32>; def TypeDUPLEX : IType<33>; def TypeENDLOOP : IType<34>; def TypeEXTENDER : IType<35>; diff --git a/lib/Target/Hexagon/HexagonDepInstrFormats.td b/lib/Target/Hexagon/HexagonDepInstrFormats.td index 9f98da3a1deedc8c3e6f8d29815871451e35f9cb..ffe212ef9d97bc4bef363596558b1202cd8edbc6 100644 --- a/lib/Target/Hexagon/HexagonDepInstrFormats.td +++ b/lib/Target/Hexagon/HexagonDepInstrFormats.td @@ -1,4 +1,4 @@ -//===- HexagonDepInstrFormats.td ------------------------------------------===// +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -9,7 +9,6 @@ // Automatically generated file, please consult code owner before editing. //===----------------------------------------------------------------------===// - class Enc_890909 : OpcodeHexagon { bits <5> Rs32; let Inst{20-16} = Rs32{4-0}; @@ -61,14 +60,6 @@ class Enc_27b757 : OpcodeHexagon { bits <5> Vs32; let Inst{4-0} = Vs32{4-0}; } -class Enc_8d04c3 : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_1de724 : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -87,12 +78,6 @@ class Enc_0e41fa : OpcodeHexagon { bits <5> Vd32; let Inst{4-0} = Vd32{4-0}; } -class Enc_2a736a : OpcodeHexagon { - bits <5> Vuu32; - let Inst{20-16} = Vuu32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} class Enc_3d6d37 : OpcodeHexagon { bits <2> Qs4; let Inst{6-5} = Qs4{1-0}; @@ -121,14 +106,6 @@ class Enc_802dc0 : OpcodeHexagon { bits <2> Qv4; let Inst{23-22} = Qv4{1-0}; } -class Enc_6a4549 : OpcodeHexagon { - bits <5> Vu32; - let Inst{12-8} = Vu32{4-0}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_6b197f : OpcodeHexagon { bits <4> Ii; let Inst{8-5} = Ii{3-0}; @@ -137,22 +114,6 @@ class Enc_6b197f : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_1f3376 : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <5> Vxx32; - let Inst{7-3} = Vxx32{4-0}; -} -class Enc_1f5d8f : OpcodeHexagon { - bits <1> Mu2; - let Inst{13-13} = Mu2{0-0}; - bits <5> Ryy32; - let Inst{4-0} = Ryy32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_51436c : OpcodeHexagon { bits <16> Ii; let Inst{23-22} = Ii{15-14}; @@ -249,6 +210,14 @@ class Enc_d7dc10 : OpcodeHexagon { bits <2> Pd4; let Inst{1-0} = Pd4{1-0}; } +class Enc_6baed4 : OpcodeHexagon { + bits <3> Ii; + let Inst{10-8} = Ii{2-0}; + bits <2> Pv4; + let Inst{12-11} = Pv4{1-0}; + bits <5> Rx32; + let Inst{20-16} = Rx32{4-0}; +} class Enc_736575 : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -291,14 +260,6 @@ class Enc_509701 : OpcodeHexagon { bits <5> Rdd32; let Inst{4-0} = Rdd32{4-0}; } -class Enc_c84567 : OpcodeHexagon { - bits <5> Vuu32; - let Inst{20-16} = Vuu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} class Enc_830e5d : OpcodeHexagon { bits <8> Ii; let Inst{12-5} = Ii{7-0}; @@ -310,12 +271,6 @@ class Enc_830e5d : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } -class Enc_ae0040 : OpcodeHexagon { - bits <5> Rs32; - let Inst{20-16} = Rs32{4-0}; - bits <6> Sd64; - let Inst{5-0} = Sd64{5-0}; -} class Enc_79b8c8 : OpcodeHexagon { bits <6> Ii; let Inst{6-3} = Ii{5-2}; @@ -336,16 +291,6 @@ class Enc_58a8bf : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_e8ddd5 : OpcodeHexagon { - bits <16> Ii; - let Inst{21-21} = Ii{15-15}; - let Inst{13-8} = Ii{14-9}; - let Inst{2-0} = Ii{8-6}; - bits <5> Vss32; - let Inst{7-3} = Vss32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_041d7b : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -369,14 +314,6 @@ class Enc_f44229 : OpcodeHexagon { bits <3> Nt8; let Inst{10-8} = Nt8{2-0}; } -class Enc_fc563d : OpcodeHexagon { - bits <5> Vuu32; - let Inst{20-16} = Vuu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_aad80c : OpcodeHexagon { bits <5> Vuu32; let Inst{12-8} = Vuu32{4-0}; @@ -434,6 +371,14 @@ class Enc_ee5ed0 : OpcodeHexagon { bits <2> n1; let Inst{9-8} = n1{1-0}; } +class Enc_bddee3 : OpcodeHexagon { + bits <5> Vu32; + let Inst{12-8} = Vu32{4-0}; + bits <5> Vyyyy32; + let Inst{4-0} = Vyyyy32{4-0}; + bits <3> Rx8; + let Inst{18-16} = Rx8{2-0}; +} class Enc_935d9b : OpcodeHexagon { bits <5> Ii; let Inst{6-3} = Ii{4-1}; @@ -573,6 +518,14 @@ class Enc_27fd0e : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } +class Enc_d7bc34 : OpcodeHexagon { + bits <5> Vu32; + let Inst{12-8} = Vu32{4-0}; + bits <3> Rt8; + let Inst{18-16} = Rt8{2-0}; + bits <5> Vyyyy32; + let Inst{4-0} = Vyyyy32{4-0}; +} class Enc_93af4c : OpcodeHexagon { bits <7> Ii; let Inst{10-4} = Ii{6-0}; @@ -620,12 +573,6 @@ class Enc_14640c : OpcodeHexagon { let Inst{24-22} = n1{3-1}; let Inst{13-13} = n1{0-0}; } -class Enc_2516bf : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_31db33 : OpcodeHexagon { bits <2> Qt4; let Inst{6-5} = Qt4{1-0}; @@ -656,24 +603,6 @@ class Enc_784502 : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_9a9d62 : OpcodeHexagon { - bits <1> Mu2; - let Inst{13-13} = Mu2{0-0}; - bits <5> Rt32; - let Inst{12-8} = Rt32{4-0}; - bits <5> Vs32; - let Inst{7-3} = Vs32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} -class Enc_3a81ac : OpcodeHexagon { - bits <1> Mu2; - let Inst{13-13} = Mu2{0-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_6413b6 : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -703,13 +632,13 @@ class Enc_84bff1 : OpcodeHexagon { bits <5> Rdd32; let Inst{4-0} = Rdd32{4-0}; } -class Enc_74aef2 : OpcodeHexagon { +class Enc_f4413a : OpcodeHexagon { bits <4> Ii; let Inst{8-5} = Ii{3-0}; - bits <1> Mu2; - let Inst{13-13} = Mu2{0-0}; - bits <5> Ryy32; - let Inst{4-0} = Ryy32{4-0}; + bits <2> Pt4; + let Inst{10-9} = Pt4{1-0}; + bits <5> Rd32; + let Inst{4-0} = Rd32{4-0}; bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } @@ -753,16 +682,6 @@ class Enc_e39bb2 : OpcodeHexagon { bits <4> Rd16; let Inst{3-0} = Rd16{3-0}; } -class Enc_7db2f8 : OpcodeHexagon { - bits <5> Vu32; - let Inst{13-9} = Vu32{4-0}; - bits <5> Vv32; - let Inst{8-4} = Vv32{4-0}; - bits <4> Vdd16; - let Inst{3-0} = Vdd16{3-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_1b64fb : OpcodeHexagon { bits <16> Ii; let Inst{26-25} = Ii{15-14}; @@ -772,6 +691,16 @@ class Enc_1b64fb : OpcodeHexagon { bits <5> Rt32; let Inst{12-8} = Rt32{4-0}; } +class Enc_c1d806 : OpcodeHexagon { + bits <5> Vu32; + let Inst{12-8} = Vu32{4-0}; + bits <5> Vv32; + let Inst{20-16} = Vv32{4-0}; + bits <5> Vd32; + let Inst{4-0} = Vd32{4-0}; + bits <2> Qe4; + let Inst{6-5} = Qe4{1-0}; +} class Enc_c6220b : OpcodeHexagon { bits <2> Ii; let Inst{13-13} = Ii{1-1}; @@ -841,10 +770,6 @@ class Enc_fcf7a7 : OpcodeHexagon { bits <2> Pd4; let Inst{1-0} = Pd4{1-0}; } -class Enc_2c3281 : OpcodeHexagon { - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} class Enc_55355c : OpcodeHexagon { bits <2> Ii; let Inst{13-13} = Ii{1-1}; @@ -877,6 +802,16 @@ class Enc_6185fe : OpcodeHexagon { bits <5> Rdd32; let Inst{4-0} = Rdd32{4-0}; } +class Enc_74aef2 : OpcodeHexagon { + bits <4> Ii; + let Inst{8-5} = Ii{3-0}; + bits <1> Mu2; + let Inst{13-13} = Mu2{0-0}; + bits <5> Ryy32; + let Inst{4-0} = Ryy32{4-0}; + bits <5> Rx32; + let Inst{20-16} = Rx32{4-0}; +} class Enc_cd4705 : OpcodeHexagon { bits <3> Ii; let Inst{7-5} = Ii{2-0}; @@ -920,10 +855,6 @@ class Enc_fef969 : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } -class Enc_b2ffce : OpcodeHexagon { - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_63eaeb : OpcodeHexagon { bits <2> Ii; let Inst{1-0} = Ii{1-0}; @@ -948,12 +879,6 @@ class Enc_372c9d : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_9e9047 : OpcodeHexagon { - bits <2> Pt4; - let Inst{9-8} = Pt4{1-0}; - bits <5> Rs32; - let Inst{20-16} = Rs32{4-0}; -} class Enc_4dff07 : OpcodeHexagon { bits <2> Qv4; let Inst{12-11} = Qv4{1-0}; @@ -1000,16 +925,6 @@ class Enc_b388cf : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } -class Enc_880793 : OpcodeHexagon { - bits <3> Qt8; - let Inst{2-0} = Qt8{2-0}; - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} class Enc_ad1c74 : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -1086,14 +1001,6 @@ class Enc_88d4d9 : OpcodeHexagon { bits <5> Rs32; let Inst{20-16} = Rs32{4-0}; } -class Enc_c0cdde : OpcodeHexagon { - bits <9> Ii; - let Inst{13-5} = Ii{8-0}; - bits <5> Rs32; - let Inst{20-16} = Rs32{4-0}; - bits <2> Pd4; - let Inst{1-0} = Pd4{1-0}; -} class Enc_226535 : OpcodeHexagon { bits <8> Ii; let Inst{12-7} = Ii{7-2}; @@ -1102,14 +1009,6 @@ class Enc_226535 : OpcodeHexagon { bits <5> Rt32; let Inst{4-0} = Rt32{4-0}; } -class Enc_96f0fd : OpcodeHexagon { - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vx32; - let Inst{7-3} = Vx32{4-0}; - bits <3> Qdd8; - let Inst{2-0} = Qdd8{2-0}; -} class Enc_31aa6a : OpcodeHexagon { bits <5> Ii; let Inst{6-3} = Ii{4-1}; @@ -1120,12 +1019,6 @@ class Enc_31aa6a : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_932b58 : OpcodeHexagon { - bits <5> Vu32; - let Inst{12-8} = Vu32{4-0}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; -} class Enc_397f23 : OpcodeHexagon { bits <8> Ii; let Inst{13-13} = Ii{7-7}; @@ -1192,14 +1085,6 @@ class Enc_01d3d0 : OpcodeHexagon { bits <5> Vdd32; let Inst{4-0} = Vdd32{4-0}; } -class Enc_3126d7 : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} class Enc_b0e9d8 : OpcodeHexagon { bits <10> Ii; let Inst{21-21} = Ii{9-9}; @@ -1209,6 +1094,14 @@ class Enc_b0e9d8 : OpcodeHexagon { bits <5> Rx32; let Inst{4-0} = Rx32{4-0}; } +class Enc_1bd127 : OpcodeHexagon { + bits <5> Vu32; + let Inst{12-8} = Vu32{4-0}; + bits <3> Rt8; + let Inst{18-16} = Rt8{2-0}; + bits <5> Vdddd32; + let Inst{4-0} = Vdddd32{4-0}; +} class Enc_3694bd : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -1276,12 +1169,6 @@ class Enc_88c16c : OpcodeHexagon { bits <5> Rxx32; let Inst{4-0} = Rxx32{4-0}; } -class Enc_e7408c : OpcodeHexagon { - bits <6> Sss64; - let Inst{21-16} = Sss64{5-0}; - bits <5> Rdd32; - let Inst{4-0} = Rdd32{4-0}; -} class Enc_770858 : OpcodeHexagon { bits <2> Ps4; let Inst{6-5} = Ps4{1-0}; @@ -1323,15 +1210,14 @@ class Enc_412ff0 : OpcodeHexagon { bits <5> Rxx32; let Inst{12-8} = Rxx32{4-0}; } -class Enc_8e9fbd : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <3> Rt8; - let Inst{2-0} = Rt8{2-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; - bits <5> Vy32; - let Inst{12-8} = Vy32{4-0}; +class Enc_ef601b : OpcodeHexagon { + bits <4> Ii; + let Inst{13-13} = Ii{3-3}; + let Inst{10-8} = Ii{2-0}; + bits <2> Pv4; + let Inst{12-11} = Pv4{1-0}; + bits <5> Rt32; + let Inst{20-16} = Rt32{4-0}; } class Enc_c9a18e : OpcodeHexagon { bits <11> Ii; @@ -1356,19 +1242,6 @@ class Enc_e6abcf : OpcodeHexagon { bits <5> Rtt32; let Inst{12-8} = Rtt32{4-0}; } -class Enc_6339d5 : OpcodeHexagon { - bits <2> Ii; - let Inst{13-13} = Ii{1-1}; - let Inst{7-7} = Ii{0-0}; - bits <2> Pv4; - let Inst{6-5} = Pv4{1-0}; - bits <5> Rs32; - let Inst{20-16} = Rs32{4-0}; - bits <5> Ru32; - let Inst{12-8} = Ru32{4-0}; - bits <5> Rt32; - let Inst{4-0} = Rt32{4-0}; -} class Enc_d6990d : OpcodeHexagon { bits <5> Vuu32; let Inst{12-8} = Vuu32{4-0}; @@ -1377,16 +1250,6 @@ class Enc_d6990d : OpcodeHexagon { bits <5> Vxx32; let Inst{4-0} = Vxx32{4-0}; } -class Enc_6c4697 : OpcodeHexagon { - bits <1> Mu2; - let Inst{13-13} = Mu2{0-0}; - bits <5> Rt32; - let Inst{12-8} = Rt32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_6c9440 : OpcodeHexagon { bits <10> Ii; let Inst{21-21} = Ii{9-9}; @@ -1445,15 +1308,13 @@ class Enc_9d1247 : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_f4413a : OpcodeHexagon { - bits <4> Ii; - let Inst{8-5} = Ii{3-0}; - bits <2> Pt4; - let Inst{10-9} = Pt4{1-0}; - bits <5> Rd32; - let Inst{4-0} = Rd32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; +class Enc_7b7ba8 : OpcodeHexagon { + bits <2> Qu4; + let Inst{9-8} = Qu4{1-0}; + bits <5> Rt32; + let Inst{20-16} = Rt32{4-0}; + bits <5> Vd32; + let Inst{4-0} = Vd32{4-0}; } class Enc_f7430e : OpcodeHexagon { bits <4> Ii; @@ -1531,12 +1392,6 @@ class Enc_a803e0 : OpcodeHexagon { bits <5> Rs32; let Inst{20-16} = Rs32{4-0}; } -class Enc_fde0e3 : OpcodeHexagon { - bits <5> Rtt32; - let Inst{20-16} = Rtt32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_45364e : OpcodeHexagon { bits <5> Vu32; let Inst{12-8} = Vu32{4-0}; @@ -1557,12 +1412,6 @@ class Enc_b909d2 : OpcodeHexagon { let Inst{13-13} = n1{1-1}; let Inst{8-8} = n1{0-0}; } -class Enc_790d6e : OpcodeHexagon { - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_e6c957 : OpcodeHexagon { bits <10> Ii; let Inst{21-21} = Ii{9-9}; @@ -1570,15 +1419,6 @@ class Enc_e6c957 : OpcodeHexagon { bits <5> Rdd32; let Inst{4-0} = Rdd32{4-0}; } -class Enc_fa3ba4 : OpcodeHexagon { - bits <14> Ii; - let Inst{26-25} = Ii{13-12}; - let Inst{13-5} = Ii{11-3}; - bits <5> Rs32; - let Inst{20-16} = Rs32{4-0}; - bits <5> Rdd32; - let Inst{4-0} = Rdd32{4-0}; -} class Enc_0d8870 : OpcodeHexagon { bits <12> Ii; let Inst{26-25} = Ii{11-10}; @@ -1623,14 +1463,6 @@ class Enc_0ed752 : OpcodeHexagon { bits <5> Cdd32; let Inst{4-0} = Cdd32{4-0}; } -class Enc_908985 : OpcodeHexagon { - bits <1> Mu2; - let Inst{13-13} = Mu2{0-0}; - bits <5> Vss32; - let Inst{7-3} = Vss32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_143445 : OpcodeHexagon { bits <13> Ii; let Inst{26-25} = Ii{12-11}; @@ -1658,16 +1490,6 @@ class Enc_3e3989 : OpcodeHexagon { let Inst{25-22} = n1{4-1}; let Inst{8-8} = n1{0-0}; } -class Enc_12dd8f : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <3> Rt8; - let Inst{2-0} = Rt8{2-0}; - bits <5> Vx32; - let Inst{7-3} = Vx32{4-0}; -} class Enc_152467 : OpcodeHexagon { bits <5> Ii; let Inst{8-5} = Ii{4-1}; @@ -1676,30 +1498,31 @@ class Enc_152467 : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_6b1bc4 : OpcodeHexagon { - bits <5> Vuu32; - let Inst{20-16} = Vuu32{4-0}; - bits <3> Qt8; - let Inst{10-8} = Qt8{2-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} -class Enc_daea09 : OpcodeHexagon { - bits <17> Ii; - let Inst{23-22} = Ii{16-15}; - let Inst{20-16} = Ii{14-10}; - let Inst{13-13} = Ii{9-9}; - let Inst{7-1} = Ii{8-2}; +class Enc_9ac432 : OpcodeHexagon { + bits <2> Ps4; + let Inst{17-16} = Ps4{1-0}; + bits <2> Pt4; + let Inst{9-8} = Pt4{1-0}; bits <2> Pu4; - let Inst{9-8} = Pu4{1-0}; + let Inst{7-6} = Pu4{1-0}; + bits <2> Pd4; + let Inst{1-0} = Pd4{1-0}; } -class Enc_f37377 : OpcodeHexagon { - bits <8> Ii; - let Inst{12-7} = Ii{7-2}; - bits <8> II; - let Inst{13-13} = II{7-7}; - let Inst{6-0} = II{6-0}; - bits <5> Rs32; +class Enc_a90628 : OpcodeHexagon { + bits <2> Qv4; + let Inst{23-22} = Qv4{1-0}; + bits <5> Vu32; + let Inst{12-8} = Vu32{4-0}; + bits <5> Vx32; + let Inst{4-0} = Vx32{4-0}; +} +class Enc_f37377 : OpcodeHexagon { + bits <8> Ii; + let Inst{12-7} = Ii{7-2}; + bits <8> II; + let Inst{13-13} = II{7-7}; + let Inst{6-0} = II{6-0}; + bits <5> Rs32; let Inst{20-16} = Rs32{4-0}; } class Enc_a198f6 : OpcodeHexagon { @@ -1712,12 +1535,6 @@ class Enc_a198f6 : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } -class Enc_a265b7 : OpcodeHexagon { - bits <5> Vuu32; - let Inst{20-16} = Vuu32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_4e4a80 : OpcodeHexagon { bits <2> Qs4; let Inst{6-5} = Qs4{1-0}; @@ -1728,16 +1545,6 @@ class Enc_4e4a80 : OpcodeHexagon { bits <5> Vvv32; let Inst{4-0} = Vvv32{4-0}; } -class Enc_8d5d98 : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <3> Rt8; - let Inst{2-0} = Rt8{2-0}; - bits <5> Vxx32; - let Inst{7-3} = Vxx32{4-0}; -} class Enc_3dac0b : OpcodeHexagon { bits <2> Qt4; let Inst{6-5} = Qt4{1-0}; @@ -1780,16 +1587,6 @@ class Enc_2df31d : OpcodeHexagon { bits <4> Rd16; let Inst{3-0} = Rd16{3-0}; } -class Enc_b0e553 : OpcodeHexagon { - bits <16> Ii; - let Inst{21-21} = Ii{15-15}; - let Inst{13-8} = Ii{14-9}; - let Inst{2-0} = Ii{8-6}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_25bef0 : OpcodeHexagon { bits <16> Ii; let Inst{26-25} = Ii{15-14}; @@ -1905,10 +1702,14 @@ class Enc_bd1cbc : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_d0fe02 : OpcodeHexagon { - bits <5> Rxx32; - let Inst{20-16} = Rxx32{4-0}; - bits <0> sgp10; +class Enc_c85e2a : OpcodeHexagon { + bits <5> Ii; + let Inst{12-8} = Ii{4-0}; + bits <5> II; + let Inst{22-21} = II{4-3}; + let Inst{7-5} = II{2-0}; + bits <5> Rd32; + let Inst{4-0} = Rd32{4-0}; } class Enc_a30110 : OpcodeHexagon { bits <5> Vu32; @@ -1920,24 +1721,12 @@ class Enc_a30110 : OpcodeHexagon { bits <5> Vd32; let Inst{4-0} = Vd32{4-0}; } -class Enc_f3f408 : OpcodeHexagon { - bits <4> Ii; - let Inst{13-13} = Ii{3-3}; - let Inst{10-8} = Ii{2-0}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vd32; - let Inst{4-0} = Vd32{4-0}; -} -class Enc_ce4c54 : OpcodeHexagon { - bits <16> Ii; - let Inst{21-21} = Ii{15-15}; - let Inst{13-8} = Ii{14-9}; - let Inst{2-0} = Ii{8-6}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; +class Enc_33f8ba : OpcodeHexagon { + bits <8> Ii; + let Inst{12-8} = Ii{7-3}; + let Inst{4-2} = Ii{2-0}; + bits <5> Rx32; + let Inst{20-16} = Rx32{4-0}; } class Enc_690862 : OpcodeHexagon { bits <13> Ii; @@ -1949,20 +1738,6 @@ class Enc_690862 : OpcodeHexagon { bits <3> Nt8; let Inst{10-8} = Nt8{2-0}; } -class Enc_e570b0 : OpcodeHexagon { - bits <5> Rtt32; - let Inst{20-16} = Rtt32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} -class Enc_3c46e8 : OpcodeHexagon { - bits <5> Vuu32; - let Inst{12-8} = Vuu32{4-0}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} class Enc_2a3787 : OpcodeHexagon { bits <13> Ii; let Inst{26-25} = Ii{12-11}; @@ -2010,22 +1785,6 @@ class Enc_729ff7 : OpcodeHexagon { bits <5> Rdd32; let Inst{4-0} = Rdd32{4-0}; } -class Enc_5883d0 : OpcodeHexagon { - bits <16> Ii; - let Inst{21-21} = Ii{15-15}; - let Inst{13-8} = Ii{14-9}; - let Inst{2-0} = Ii{8-6}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} -class Enc_ff0e49 : OpcodeHexagon { - bits <5> Rss32; - let Inst{20-16} = Rss32{4-0}; - bits <6> Sdd64; - let Inst{5-0} = Sdd64{5-0}; -} class Enc_217147 : OpcodeHexagon { bits <2> Qv4; let Inst{23-22} = Qv4{1-0}; @@ -2060,14 +1819,6 @@ class Enc_541f26 : OpcodeHexagon { bits <5> Rt32; let Inst{12-8} = Rt32{4-0}; } -class Enc_9aae4a : OpcodeHexagon { - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vx32; - let Inst{7-3} = Vx32{4-0}; - bits <3> Qd8; - let Inst{2-0} = Qd8{2-0}; -} class Enc_724154 : OpcodeHexagon { bits <6> II; let Inst{5-0} = II{5-0}; @@ -2114,16 +1865,6 @@ class Enc_b84c4c : OpcodeHexagon { bits <5> Rdd32; let Inst{4-0} = Rdd32{4-0}; } -class Enc_9ac432 : OpcodeHexagon { - bits <2> Ps4; - let Inst{17-16} = Ps4{1-0}; - bits <2> Pt4; - let Inst{9-8} = Pt4{1-0}; - bits <2> Pu4; - let Inst{7-6} = Pu4{1-0}; - bits <2> Pd4; - let Inst{1-0} = Pd4{1-0}; -} class Enc_8203bb : OpcodeHexagon { bits <6> Ii; let Inst{12-7} = Ii{5-0}; @@ -2228,12 +1969,6 @@ class Enc_96ce4f : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_2bbae6 : OpcodeHexagon { - bits <6> Ss64; - let Inst{21-16} = Ss64{5-0}; - bits <5> Rd32; - let Inst{4-0} = Rd32{4-0}; -} class Enc_143a3c : OpcodeHexagon { bits <6> Ii; let Inst{13-8} = Ii{5-0}; @@ -2281,13 +2016,14 @@ class Enc_de0214 : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } -class Enc_a90628 : OpcodeHexagon { - bits <2> Qv4; - let Inst{23-22} = Qv4{1-0}; - bits <5> Vu32; - let Inst{12-8} = Vu32{4-0}; - bits <5> Vx32; - let Inst{4-0} = Vx32{4-0}; +class Enc_daea09 : OpcodeHexagon { + bits <17> Ii; + let Inst{23-22} = Ii{16-15}; + let Inst{20-16} = Ii{14-10}; + let Inst{13-13} = Ii{9-9}; + let Inst{7-1} = Ii{8-2}; + bits <2> Pu4; + let Inst{9-8} = Pu4{1-0}; } class Enc_fda92c : OpcodeHexagon { bits <17> Ii; @@ -2365,26 +2101,6 @@ class Enc_b43b67 : OpcodeHexagon { bits <2> Qx4; let Inst{6-5} = Qx4{1-0}; } -class Enc_1cd70f : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <3> Rt8; - let Inst{2-0} = Rt8{2-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} -class Enc_3a527f : OpcodeHexagon { - bits <16> Ii; - let Inst{21-21} = Ii{15-15}; - let Inst{13-8} = Ii{14-9}; - let Inst{2-0} = Ii{8-6}; - bits <5> Vs32; - let Inst{7-3} = Vs32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_4aca3a : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -2403,12 +2119,6 @@ class Enc_b38ffc : OpcodeHexagon { bits <4> Rt16; let Inst{3-0} = Rt16{3-0}; } -class Enc_5c3a80 : OpcodeHexagon { - bits <3> Qt8; - let Inst{10-8} = Qt8{2-0}; - bits <3> Qd8; - let Inst{5-3} = Qd8{2-0}; -} class Enc_cda00a : OpcodeHexagon { bits <12> Ii; let Inst{19-16} = Ii{11-8}; @@ -2426,24 +2136,6 @@ class Enc_2fbf3c : OpcodeHexagon { bits <4> Rd16; let Inst{3-0} = Rd16{3-0}; } -class Enc_a4ae28 : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <3> Qd8; - let Inst{5-3} = Qd8{2-0}; -} -class Enc_dd5f9f : OpcodeHexagon { - bits <3> Qtt8; - let Inst{2-0} = Qtt8{2-0}; - bits <5> Vuu32; - let Inst{20-16} = Vuu32{4-0}; - bits <5> Vvv32; - let Inst{12-8} = Vvv32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} class Enc_70b24b : OpcodeHexagon { bits <6> Ii; let Inst{8-5} = Ii{5-2}; @@ -2490,16 +2182,6 @@ class Enc_08d755 : OpcodeHexagon { bits <2> Pd4; let Inst{1-0} = Pd4{1-0}; } -class Enc_a7ca29 : OpcodeHexagon { - bits <3> Qt8; - let Inst{2-0} = Qt8{2-0}; - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_1178da : OpcodeHexagon { bits <3> Ii; let Inst{7-5} = Ii{2-0}; @@ -2518,14 +2200,6 @@ class Enc_8dbe85 : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_17a474 : OpcodeHexagon { - bits <1> Mu2; - let Inst{13-13} = Mu2{0-0}; - bits <5> Vs32; - let Inst{7-3} = Vs32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_5a18b3 : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -2586,14 +2260,6 @@ class Enc_12b6e9 : OpcodeHexagon { bits <5> Rdd32; let Inst{4-0} = Rdd32{4-0}; } -class Enc_9a895f : OpcodeHexagon { - bits <1> Mu2; - let Inst{13-13} = Mu2{0-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_6f70ca : OpcodeHexagon { bits <8> Ii; let Inst{8-4} = Ii{7-3}; @@ -2605,12 +2271,7 @@ class Enc_7222b7 : OpcodeHexagon { let Inst{1-0} = Qd4{1-0}; } class Enc_e3b0c4 : OpcodeHexagon { -} -class Enc_d7e8ba : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; + } class Enc_a255dc : OpcodeHexagon { bits <3> Ii; @@ -2628,16 +2289,6 @@ class Enc_cb785b : OpcodeHexagon { bits <5> Vdd32; let Inst{4-0} = Vdd32{4-0}; } -class Enc_5b76ab : OpcodeHexagon { - bits <10> Ii; - let Inst{21-21} = Ii{9-9}; - let Inst{13-8} = Ii{8-3}; - let Inst{2-0} = Ii{2-0}; - bits <5> Vs32; - let Inst{7-3} = Vs32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_cb4b4e : OpcodeHexagon { bits <2> Pu4; let Inst{6-5} = Pu4{1-0}; @@ -2648,23 +2299,13 @@ class Enc_cb4b4e : OpcodeHexagon { bits <5> Rdd32; let Inst{4-0} = Rdd32{4-0}; } -class Enc_fbacc2 : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <3> Rt8; - let Inst{2-0} = Rt8{2-0}; - bits <5> Vxx32; - let Inst{7-3} = Vxx32{4-0}; - bits <5> Vy32; - let Inst{12-8} = Vy32{4-0}; -} -class Enc_2ad23d : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <5> Vx32; - let Inst{7-3} = Vx32{4-0}; +class Enc_1f5d8f : OpcodeHexagon { + bits <1> Mu2; + let Inst{13-13} = Mu2{0-0}; + bits <5> Ryy32; + let Inst{4-0} = Ryy32{4-0}; + bits <5> Rx32; + let Inst{20-16} = Rx32{4-0}; } class Enc_9cdba7 : OpcodeHexagon { bits <8> Ii; @@ -2683,10 +2324,6 @@ class Enc_5cd7e9 : OpcodeHexagon { bits <5> Ryy32; let Inst{4-0} = Ryy32{4-0}; } -class Enc_e7c9de : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; -} class Enc_454a26 : OpcodeHexagon { bits <2> Pt4; let Inst{9-8} = Pt4{1-0}; @@ -2786,14 +2423,6 @@ class Enc_d2c7f1 : OpcodeHexagon { bits <2> Pe4; let Inst{6-5} = Pe4{1-0}; } -class Enc_dcfcbb : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vvv32; - let Inst{12-8} = Vvv32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_3680c2 : OpcodeHexagon { bits <7> Ii; let Inst{11-5} = Ii{6-0}; @@ -2822,31 +2451,13 @@ class Enc_e957fb : OpcodeHexagon { bits <5> Rt32; let Inst{12-8} = Rt32{4-0}; } -class Enc_2146c1 : OpcodeHexagon { - bits <5> Vuu32; - let Inst{20-16} = Vuu32{4-0}; - bits <5> Vvv32; - let Inst{12-8} = Vvv32{4-0}; - bits <3> Qss8; - let Inst{2-0} = Qss8{2-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} -class Enc_a662ae : OpcodeHexagon { - bits <5> Vuu32; - let Inst{20-16} = Vuu32{4-0}; - bits <5> Vvv32; - let Inst{12-8} = Vvv32{4-0}; - bits <3> Rt8; - let Inst{2-0} = Rt8{2-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} -class Enc_8f7cc3 : OpcodeHexagon { - bits <3> Qtt8; - let Inst{10-8} = Qtt8{2-0}; - bits <3> Qdd8; - let Inst{5-3} = Qdd8{2-0}; +class Enc_c0cdde : OpcodeHexagon { + bits <9> Ii; + let Inst{13-5} = Ii{8-0}; + bits <5> Rs32; + let Inst{20-16} = Rs32{4-0}; + bits <2> Pd4; + let Inst{1-0} = Pd4{1-0}; } class Enc_c9e3bc : OpcodeHexagon { bits <4> Ii; @@ -2886,33 +2497,18 @@ class Enc_6f83e7 : OpcodeHexagon { bits <5> Vd32; let Inst{4-0} = Vd32{4-0}; } -class Enc_46f33d : OpcodeHexagon { - bits <5> Rss32; - let Inst{20-16} = Rss32{4-0}; - bits <5> Rt32; - let Inst{12-8} = Rt32{4-0}; -} -class Enc_c1652e : OpcodeHexagon { - bits <5> Vu32; - let Inst{12-8} = Vu32{4-0}; +class Enc_6339d5 : OpcodeHexagon { + bits <2> Ii; + let Inst{13-13} = Ii{1-1}; + let Inst{7-7} = Ii{0-0}; + bits <2> Pv4; + let Inst{6-5} = Pv4{1-0}; + bits <5> Rs32; + let Inst{20-16} = Rs32{4-0}; + bits <5> Ru32; + let Inst{12-8} = Ru32{4-0}; bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <3> Qd8; - let Inst{5-3} = Qd8{2-0}; -} -class Enc_b5b643 : OpcodeHexagon { - bits <5> Rtt32; - let Inst{20-16} = Rtt32{4-0}; - bits <5> Vx32; - let Inst{7-3} = Vx32{4-0}; -} -class Enc_85daf5 : OpcodeHexagon { - bits <5> Vu32; - let Inst{12-8} = Vu32{4-0}; - bits <5> Rtt32; - let Inst{20-16} = Rtt32{4-0}; - bits <5> Vx32; - let Inst{7-3} = Vx32{4-0}; + let Inst{4-0} = Rt32{4-0}; } class Enc_d483b9 : OpcodeHexagon { bits <1> Ii; @@ -2952,13 +2548,14 @@ class Enc_6c9ee0 : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_72a92d : OpcodeHexagon { - bits <5> Vuu32; - let Inst{12-8} = Vuu32{4-0}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vxx32; - let Inst{7-3} = Vxx32{4-0}; +class Enc_fa3ba4 : OpcodeHexagon { + bits <14> Ii; + let Inst{26-25} = Ii{13-12}; + let Inst{13-5} = Ii{11-3}; + bits <5> Rs32; + let Inst{20-16} = Rs32{4-0}; + bits <5> Rdd32; + let Inst{4-0} = Rdd32{4-0}; } class Enc_44661f : OpcodeHexagon { bits <1> Mu2; @@ -3006,14 +2603,6 @@ class Enc_da664b : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } -class Enc_7b7ba8 : OpcodeHexagon { - bits <2> Qu4; - let Inst{9-8} = Qu4{1-0}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vd32; - let Inst{4-0} = Vd32{4-0}; -} class Enc_47ee5e : OpcodeHexagon { bits <2> Ii; let Inst{13-13} = Ii{1-1}; @@ -3116,14 +2705,6 @@ class Enc_8e583a : OpcodeHexagon { let Inst{25-23} = n1{3-1}; let Inst{13-13} = n1{0-0}; } -class Enc_334c2b : OpcodeHexagon { - bits <5> Vuu32; - let Inst{12-8} = Vuu32{4-0}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_b886fd : OpcodeHexagon { bits <5> Ii; let Inst{6-3} = Ii{4-1}; @@ -3177,36 +2758,12 @@ class Enc_8dbdfe : OpcodeHexagon { bits <3> Nt8; let Inst{10-8} = Nt8{2-0}; } -class Enc_7dc746 : OpcodeHexagon { - bits <3> Quu8; - let Inst{10-8} = Quu8{2-0}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <3> Qdd8; - let Inst{5-3} = Qdd8{2-0}; -} class Enc_90cd8b : OpcodeHexagon { bits <5> Rss32; let Inst{20-16} = Rss32{4-0}; bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } -class Enc_b8513b : OpcodeHexagon { - bits <5> Vuu32; - let Inst{20-16} = Vuu32{4-0}; - bits <5> Vvv32; - let Inst{12-8} = Vvv32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} -class Enc_b3bac4 : OpcodeHexagon { - bits <5> Vu32; - let Inst{12-8} = Vu32{4-0}; - bits <5> Rtt32; - let Inst{20-16} = Rtt32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; -} class Enc_bd0b33 : OpcodeHexagon { bits <10> Ii; let Inst{21-21} = Ii{9-9}; @@ -3216,16 +2773,6 @@ class Enc_bd0b33 : OpcodeHexagon { bits <2> Pd4; let Inst{1-0} = Pd4{1-0}; } -class Enc_843e80 : OpcodeHexagon { - bits <5> Vu32; - let Inst{12-8} = Vu32{4-0}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vd32; - let Inst{7-3} = Vd32{4-0}; - bits <3> Qxx8; - let Inst{2-0} = Qxx8{2-0}; -} class Enc_8b8927 : OpcodeHexagon { bits <5> Rt32; let Inst{20-16} = Rt32{4-0}; @@ -3359,6 +2906,16 @@ class Enc_e07374 : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } +class Enc_e0820b : OpcodeHexagon { + bits <5> Vu32; + let Inst{12-8} = Vu32{4-0}; + bits <5> Vv32; + let Inst{20-16} = Vv32{4-0}; + bits <2> Qs4; + let Inst{6-5} = Qs4{1-0}; + bits <5> Vd32; + let Inst{4-0} = Vd32{4-0}; +} class Enc_323f2d : OpcodeHexagon { bits <6> II; let Inst{11-8} = II{5-2}; @@ -3381,16 +2938,6 @@ class Enc_1a9974 : OpcodeHexagon { bits <5> Rtt32; let Inst{4-0} = Rtt32{4-0}; } -class Enc_9ce456 : OpcodeHexagon { - bits <10> Ii; - let Inst{21-21} = Ii{9-9}; - let Inst{13-8} = Ii{8-3}; - let Inst{2-0} = Ii{2-0}; - bits <5> Vss32; - let Inst{7-3} = Vss32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_5de85f : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -3416,14 +2963,6 @@ class Enc_0b51ce : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_b5e54d : OpcodeHexagon { - bits <5> Vu32; - let Inst{12-8} = Vu32{4-0}; - bits <5> Rs32; - let Inst{20-16} = Rs32{4-0}; - bits <5> Rdd32; - let Inst{4-0} = Rdd32{4-0}; -} class Enc_b4e6cf : OpcodeHexagon { bits <10> Ii; let Inst{21-21} = Ii{9-9}; @@ -3479,16 +3018,6 @@ class Enc_645d54 : OpcodeHexagon { bits <5> Rdd32; let Inst{4-0} = Rdd32{4-0}; } -class Enc_b5d5a7 : OpcodeHexagon { - bits <16> Ii; - let Inst{21-21} = Ii{15-15}; - let Inst{13-8} = Ii{14-9}; - let Inst{2-0} = Ii{8-6}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vs32; - let Inst{7-3} = Vs32{4-0}; -} class Enc_667b39 : OpcodeHexagon { bits <5> Css32; let Inst{20-16} = Css32{4-0}; @@ -3511,6 +3040,14 @@ class Enc_163a3c : OpcodeHexagon { bits <5> Rt32; let Inst{4-0} = Rt32{4-0}; } +class Enc_a75aa6 : OpcodeHexagon { + bits <5> Rs32; + let Inst{20-16} = Rs32{4-0}; + bits <5> Rt32; + let Inst{12-8} = Rt32{4-0}; + bits <1> Mu2; + let Inst{13-13} = Mu2{0-0}; +} class Enc_b087ac : OpcodeHexagon { bits <5> Vu32; let Inst{12-8} = Vu32{4-0}; @@ -3519,6 +3056,14 @@ class Enc_b087ac : OpcodeHexagon { bits <5> Vd32; let Inst{4-0} = Vd32{4-0}; } +class Enc_691712 : OpcodeHexagon { + bits <2> Pv4; + let Inst{12-11} = Pv4{1-0}; + bits <1> Mu2; + let Inst{13-13} = Mu2{0-0}; + bits <5> Rx32; + let Inst{20-16} = Rx32{4-0}; +} class Enc_b1e1fb : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -3546,16 +3091,6 @@ class Enc_b8c967 : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } -class Enc_f106e0 : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{8-4} = Vv32{4-0}; - bits <5> Vt32; - let Inst{13-9} = Vt32{4-0}; - bits <4> Vdd16; - let Inst{3-0} = Vdd16{3-0}; -} class Enc_fb6577 : OpcodeHexagon { bits <2> Pu4; let Inst{9-8} = Pu4{1-0}; @@ -3564,20 +3099,6 @@ class Enc_fb6577 : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } -class Enc_37c406 : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vv32; - let Inst{12-8} = Vv32{4-0}; - bits <3> Rt8; - let Inst{2-0} = Rt8{2-0}; - bits <4> Vdd16; - let Inst{7-4} = Vdd16{3-0}; -} -class Enc_403871 : OpcodeHexagon { - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} class Enc_2bae10 : OpcodeHexagon { bits <4> Ii; let Inst{10-8} = Ii{3-1}; @@ -3586,22 +3107,6 @@ class Enc_2bae10 : OpcodeHexagon { bits <4> Rd16; let Inst{3-0} = Rd16{3-0}; } -class Enc_f3adb6 : OpcodeHexagon { - bits <16> Ii; - let Inst{21-21} = Ii{15-15}; - let Inst{13-8} = Ii{14-9}; - let Inst{2-0} = Ii{8-6}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; -} -class Enc_aac08c : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <5> Vx32; - let Inst{7-3} = Vx32{4-0}; -} class Enc_c4dc92 : OpcodeHexagon { bits <2> Qv4; let Inst{23-22} = Qv4{1-0}; @@ -3743,12 +3248,14 @@ class Enc_134437 : OpcodeHexagon { bits <2> Qd4; let Inst{1-0} = Qd4{1-0}; } -class Enc_33f8ba : OpcodeHexagon { - bits <8> Ii; - let Inst{12-8} = Ii{7-3}; - let Inst{4-2} = Ii{2-0}; - bits <5> Rx32; - let Inst{20-16} = Rx32{4-0}; +class Enc_f3f408 : OpcodeHexagon { + bits <4> Ii; + let Inst{13-13} = Ii{3-3}; + let Inst{10-8} = Ii{2-0}; + bits <5> Rt32; + let Inst{20-16} = Rt32{4-0}; + bits <5> Vd32; + let Inst{4-0} = Vd32{4-0}; } class Enc_97d666 : OpcodeHexagon { bits <4> Rs16; @@ -3766,16 +3273,6 @@ class Enc_f82eaf : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } -class Enc_57e245 : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <3> Rt8; - let Inst{2-0} = Rt8{2-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; - bits <5> Vy32; - let Inst{12-8} = Vy32{4-0}; -} class Enc_69d63b : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -3842,24 +3339,6 @@ class Enc_7eaeb6 : OpcodeHexagon { bits <5> Rx32; let Inst{20-16} = Rx32{4-0}; } -class Enc_274a4c : OpcodeHexagon { - bits <5> Vu32; - let Inst{20-16} = Vu32{4-0}; - bits <3> Rt8; - let Inst{2-0} = Rt8{2-0}; - bits <5> Vx32; - let Inst{7-3} = Vx32{4-0}; - bits <5> Vy32; - let Inst{12-8} = Vy32{4-0}; -} -class Enc_aceeef : OpcodeHexagon { - bits <5> Vu32; - let Inst{12-8} = Vu32{4-0}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} class Enc_f55a0c : OpcodeHexagon { bits <6> Ii; let Inst{11-8} = Ii{5-2}; @@ -3898,16 +3377,6 @@ class Enc_7b523d : OpcodeHexagon { bits <5> Vxx32; let Inst{4-0} = Vxx32{4-0}; } -class Enc_c39a8b : OpcodeHexagon { - bits <16> Ii; - let Inst{21-21} = Ii{15-15}; - let Inst{13-8} = Ii{14-9}; - let Inst{2-0} = Ii{8-6}; - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vss32; - let Inst{7-3} = Vss32{4-0}; -} class Enc_47ef61 : OpcodeHexagon { bits <3> Ii; let Inst{7-5} = Ii{2-0}; @@ -4006,6 +3475,14 @@ class Enc_a6ce9c : OpcodeHexagon { bits <4> Rs16; let Inst{7-4} = Rs16{3-0}; } +class Enc_3b7631 : OpcodeHexagon { + bits <5> Vu32; + let Inst{12-8} = Vu32{4-0}; + bits <5> Vdddd32; + let Inst{4-0} = Vdddd32{4-0}; + bits <3> Rx8; + let Inst{18-16} = Rx8{2-0}; +} class Enc_eca7c8 : OpcodeHexagon { bits <2> Ii; let Inst{13-13} = Ii{1-1}; @@ -4017,16 +3494,6 @@ class Enc_eca7c8 : OpcodeHexagon { bits <5> Rt32; let Inst{4-0} = Rt32{4-0}; } -class Enc_598f6c : OpcodeHexagon { - bits <5> Rtt32; - let Inst{12-8} = Rtt32{4-0}; -} -class Enc_41dcc3 : OpcodeHexagon { - bits <5> Rt32; - let Inst{20-16} = Rt32{4-0}; - bits <5> Vdd32; - let Inst{7-3} = Vdd32{4-0}; -} class Enc_4b39e4 : OpcodeHexagon { bits <3> Ii; let Inst{7-5} = Ii{2-0}; diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td index 0b5efda933da3f27d9b589f9daa2dda60dd52543..3ef1c49eb7ee0f4be2c186e03d59b9f9cbae9a26 100644 --- a/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -1,4 +1,4 @@ -//===- HexagonDepInstrInfo.td ---------------------------------------------===// +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -9,12 +9,11 @@ // Automatically generated file, please consult code owner before editing. //===----------------------------------------------------------------------===// - def A2_abs : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = abs($Rs32)", -tc_c2f7d806, TypeS_2op>, Enc_5e2823 { +tc_cf8126ae, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10001100100; let hasNewValue = 1; @@ -25,7 +24,7 @@ def A2_absp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = abs($Rss32)", -tc_c2f7d806, TypeS_2op>, Enc_b9c5fb { +tc_cf8126ae, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10000000100; let prefersSlot3 = 1; @@ -34,7 +33,7 @@ def A2_abssat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = abs($Rs32):sat", -tc_c2f7d806, TypeS_2op>, Enc_5e2823 { +tc_cf8126ae, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000101; let Inst{31-21} = 0b10001100100; let hasNewValue = 1; @@ -46,7 +45,7 @@ def A2_add : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = add($Rs32,$Rt32)", -tc_b9488031, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel { +tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110011000; @@ -62,7 +61,7 @@ def A2_addh_h16_hh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.h,$Rs32.h):<<16", -tc_897d1a9d, TypeALU64>, Enc_bd6011 { +tc_679309b8, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101010; @@ -74,7 +73,7 @@ def A2_addh_h16_hl : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.h,$Rs32.l):<<16", -tc_897d1a9d, TypeALU64>, Enc_bd6011 { +tc_679309b8, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101010; @@ -86,7 +85,7 @@ def A2_addh_h16_lh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.l,$Rs32.h):<<16", -tc_897d1a9d, TypeALU64>, Enc_bd6011 { +tc_679309b8, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101010; @@ -98,7 +97,7 @@ def A2_addh_h16_ll : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.l,$Rs32.l):<<16", -tc_897d1a9d, TypeALU64>, Enc_bd6011 { +tc_679309b8, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101010; @@ -110,7 +109,7 @@ def A2_addh_h16_sat_hh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.h,$Rs32.h):sat:<<16", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101010; @@ -123,7 +122,7 @@ def A2_addh_h16_sat_hl : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.h,$Rs32.l):sat:<<16", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101010; @@ -136,7 +135,7 @@ def A2_addh_h16_sat_lh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.l,$Rs32.h):sat:<<16", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101010; @@ -149,7 +148,7 @@ def A2_addh_h16_sat_ll : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.l,$Rs32.l):sat:<<16", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101010; @@ -162,7 +161,7 @@ def A2_addh_l16_hl : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.l,$Rs32.h)", -tc_1b9c9ee5, TypeALU64>, Enc_bd6011 { +tc_4414d8b1, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101000; @@ -174,7 +173,7 @@ def A2_addh_l16_ll : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.l,$Rs32.l)", -tc_1b9c9ee5, TypeALU64>, Enc_bd6011 { +tc_4414d8b1, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101000; @@ -186,7 +185,7 @@ def A2_addh_l16_sat_hl : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.l,$Rs32.h):sat", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101000; @@ -199,7 +198,7 @@ def A2_addh_l16_sat_ll : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = add($Rt32.l,$Rs32.l):sat", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101000; @@ -212,7 +211,7 @@ def A2_addi : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = add($Rs32,#$Ii)", -tc_b9488031, TypeALU32_ADDI>, Enc_cb9321, PredNewRel, ImmRegRel { +tc_5a2711e5, TypeALU32_ADDI>, Enc_cb9321, PredNewRel, ImmRegRel { let Inst{31-28} = 0b1011; let hasNewValue = 1; let opNewValue = 0; @@ -231,7 +230,7 @@ def A2_addp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = add($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_946df596, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011000; @@ -242,7 +241,7 @@ def A2_addpsat : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = add($Rss32,$Rtt32):sat", -tc_b44c6e2a, TypeALU64>, Enc_a56825 { +tc_779080bf, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011011; @@ -254,7 +253,7 @@ def A2_addsat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = add($Rs32,$Rt32):sat", -tc_5ba5997d, TypeALU32_3op>, Enc_5ab2be { +tc_61830035, TypeALU32_3op>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110110010; @@ -269,14 +268,14 @@ def A2_addsp : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, DoubleRegs:$Rtt32), "$Rdd32 = add($Rs32,$Rtt32)", -tc_897d1a9d, TypeALU64> { +tc_679309b8, TypeALU64> { let isPseudo = 1; } def A2_addsph : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = add($Rss32,$Rtt32):raw:hi", -tc_897d1a9d, TypeALU64>, Enc_a56825 { +tc_679309b8, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011011; @@ -286,7 +285,7 @@ def A2_addspl : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = add($Rss32,$Rtt32):raw:lo", -tc_897d1a9d, TypeALU64>, Enc_a56825 { +tc_679309b8, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011011; @@ -296,7 +295,7 @@ def A2_and : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = and($Rs32,$Rt32)", -tc_b9488031, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel { +tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110001000; @@ -312,7 +311,7 @@ def A2_andir : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = and($Rs32,#$Ii)", -tc_b9488031, TypeALU32_2op>, Enc_140c83, ImmRegRel { +tc_5a2711e5, TypeALU32_2op>, Enc_140c83, ImmRegRel { let Inst{31-22} = 0b0111011000; let hasNewValue = 1; let opNewValue = 0; @@ -328,7 +327,7 @@ def A2_andp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = and($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_946df596, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011111; @@ -338,7 +337,7 @@ def A2_aslh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = aslh($Rs32)", -tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel { +tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01110000000; let hasNewValue = 1; @@ -350,7 +349,7 @@ def A2_asrh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = asrh($Rs32)", -tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel { +tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01110000001; let hasNewValue = 1; @@ -362,7 +361,7 @@ def A2_combine_hh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = combine($Rt32.h,$Rs32.h)", -tc_b9488031, TypeALU32_3op>, Enc_bd6011 { +tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110011100; @@ -374,7 +373,7 @@ def A2_combine_hl : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = combine($Rt32.h,$Rs32.l)", -tc_b9488031, TypeALU32_3op>, Enc_bd6011 { +tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110011101; @@ -386,7 +385,7 @@ def A2_combine_lh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = combine($Rt32.l,$Rs32.h)", -tc_b9488031, TypeALU32_3op>, Enc_bd6011 { +tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110011110; @@ -398,7 +397,7 @@ def A2_combine_ll : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = combine($Rt32.l,$Rs32.l)", -tc_b9488031, TypeALU32_3op>, Enc_bd6011 { +tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110011111; @@ -410,7 +409,7 @@ def A2_combineii : HInst< (outs DoubleRegs:$Rdd32), (ins s32_0Imm:$Ii, s8_0Imm:$II), "$Rdd32 = combine(#$Ii,#$II)", -tc_b9488031, TypeALU32_2op>, Enc_18c338 { +tc_5a2711e5, TypeALU32_2op>, Enc_18c338 { let Inst{31-23} = 0b011111000; let isReMaterializable = 1; let isAsCheapAsAMove = 1; @@ -425,7 +424,7 @@ def A2_combinew : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = combine($Rs32,$Rt32)", -tc_b9488031, TypeALU32_3op>, Enc_be32a5, PredNewRel { +tc_5a2711e5, TypeALU32_3op>, Enc_be32a5, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110101000; @@ -437,7 +436,7 @@ def A2_max : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = max($Rs32,$Rt32)", -tc_b44c6e2a, TypeALU64>, Enc_5ab2be { +tc_779080bf, TypeALU64>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101110; @@ -449,7 +448,7 @@ def A2_maxp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = max($Rss32,$Rtt32)", -tc_b44c6e2a, TypeALU64>, Enc_a56825 { +tc_779080bf, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011110; @@ -459,7 +458,7 @@ def A2_maxu : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = maxu($Rs32,$Rt32)", -tc_b44c6e2a, TypeALU64>, Enc_5ab2be { +tc_779080bf, TypeALU64>, Enc_5ab2be { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101110; @@ -471,7 +470,7 @@ def A2_maxup : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = maxu($Rss32,$Rtt32)", -tc_b44c6e2a, TypeALU64>, Enc_a56825 { +tc_779080bf, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011110; @@ -481,7 +480,7 @@ def A2_min : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = min($Rt32,$Rs32)", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101101; @@ -493,7 +492,7 @@ def A2_minp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = min($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011101; @@ -503,7 +502,7 @@ def A2_minu : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = minu($Rt32,$Rs32)", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101101; @@ -515,7 +514,7 @@ def A2_minup : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = minu($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011101; @@ -525,7 +524,7 @@ def A2_neg : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = neg($Rs32)", -tc_68cb12ce, TypeALU32_2op> { +tc_57890846, TypeALU32_2op> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -535,7 +534,7 @@ def A2_negp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = neg($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_b9c5fb { +tc_0ae0825c, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000101; let Inst{31-21} = 0b10000000100; } @@ -543,7 +542,7 @@ def A2_negsat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = neg($Rs32):sat", -tc_c2f7d806, TypeS_2op>, Enc_5e2823 { +tc_cf8126ae, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10001100100; let hasNewValue = 1; @@ -555,7 +554,7 @@ def A2_nop : HInst< (outs), (ins), "nop", -tc_6efc556e, TypeALU32_2op>, Enc_e3b0c4 { +tc_2eabeebe, TypeALU32_2op>, Enc_e3b0c4 { let Inst{13-0} = 0b00000000000000; let Inst{31-16} = 0b0111111100000000; } @@ -563,7 +562,7 @@ def A2_not : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = not($Rs32)", -tc_68cb12ce, TypeALU32_2op> { +tc_57890846, TypeALU32_2op> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -573,7 +572,7 @@ def A2_notp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = not($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_b9c5fb { +tc_0ae0825c, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10000000100; } @@ -581,7 +580,7 @@ def A2_or : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = or($Rs32,$Rt32)", -tc_b9488031, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel { +tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110001001; @@ -597,7 +596,7 @@ def A2_orir : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = or($Rs32,#$Ii)", -tc_b9488031, TypeALU32_2op>, Enc_140c83, ImmRegRel { +tc_5a2711e5, TypeALU32_2op>, Enc_140c83, ImmRegRel { let Inst{31-22} = 0b0111011010; let hasNewValue = 1; let opNewValue = 0; @@ -613,7 +612,7 @@ def A2_orp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = or($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_946df596, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011111; @@ -623,7 +622,7 @@ def A2_paddf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pu4) $Rd32 = add($Rs32,$Rt32)", -tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel { +tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111011000; @@ -639,7 +638,7 @@ def A2_paddfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pu4.new) $Rd32 = add($Rs32,$Rt32)", -tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel { +tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111011000; @@ -656,7 +655,7 @@ def A2_paddif : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii), "if (!$Pu4) $Rd32 = add($Rs32,#$Ii)", -tc_d6bf0472, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel { +tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel { let Inst{13-13} = 0b0; let Inst{31-23} = 0b011101001; let isPredicated = 1; @@ -676,7 +675,7 @@ def A2_paddifnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii), "if (!$Pu4.new) $Rd32 = add($Rs32,#$Ii)", -tc_2b2f4060, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel { +tc_05c070ec, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel { let Inst{13-13} = 0b1; let Inst{31-23} = 0b011101001; let isPredicated = 1; @@ -697,7 +696,7 @@ def A2_paddit : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii), "if ($Pu4) $Rd32 = add($Rs32,#$Ii)", -tc_d6bf0472, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel { +tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel { let Inst{13-13} = 0b0; let Inst{31-23} = 0b011101000; let isPredicated = 1; @@ -716,7 +715,7 @@ def A2_padditnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii), "if ($Pu4.new) $Rd32 = add($Rs32,#$Ii)", -tc_2b2f4060, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel { +tc_05c070ec, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel { let Inst{13-13} = 0b1; let Inst{31-23} = 0b011101000; let isPredicated = 1; @@ -736,7 +735,7 @@ def A2_paddt : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pu4) $Rd32 = add($Rs32,$Rt32)", -tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel { +tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111011000; @@ -751,7 +750,7 @@ def A2_paddtnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pu4.new) $Rd32 = add($Rs32,$Rt32)", -tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel { +tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111011000; @@ -767,7 +766,7 @@ def A2_pandf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pu4) $Rd32 = and($Rs32,$Rt32)", -tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111001000; @@ -781,7 +780,7 @@ def A2_pandfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pu4.new) $Rd32 = and($Rs32,$Rt32)", -tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111001000; @@ -796,7 +795,7 @@ def A2_pandt : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pu4) $Rd32 = and($Rs32,$Rt32)", -tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111001000; @@ -809,7 +808,7 @@ def A2_pandtnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pu4.new) $Rd32 = and($Rs32,$Rt32)", -tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111001000; @@ -823,7 +822,7 @@ def A2_porf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pu4) $Rd32 = or($Rs32,$Rt32)", -tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111001001; @@ -837,7 +836,7 @@ def A2_porfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pu4.new) $Rd32 = or($Rs32,$Rt32)", -tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111001001; @@ -852,7 +851,7 @@ def A2_port : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pu4) $Rd32 = or($Rs32,$Rt32)", -tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111001001; @@ -865,7 +864,7 @@ def A2_portnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pu4.new) $Rd32 = or($Rs32,$Rt32)", -tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111001001; @@ -879,7 +878,7 @@ def A2_psubf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32), "if (!$Pu4) $Rd32 = sub($Rt32,$Rs32)", -tc_d6bf0472, TypeALU32_3op>, Enc_9b0bc1, PredNewRel { +tc_4c5ba658, TypeALU32_3op>, Enc_9b0bc1, PredNewRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111011001; @@ -893,7 +892,7 @@ def A2_psubfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32), "if (!$Pu4.new) $Rd32 = sub($Rt32,$Rs32)", -tc_2b2f4060, TypeALU32_3op>, Enc_9b0bc1, PredNewRel { +tc_05c070ec, TypeALU32_3op>, Enc_9b0bc1, PredNewRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111011001; @@ -908,7 +907,7 @@ def A2_psubt : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32), "if ($Pu4) $Rd32 = sub($Rt32,$Rs32)", -tc_d6bf0472, TypeALU32_3op>, Enc_9b0bc1, PredNewRel { +tc_4c5ba658, TypeALU32_3op>, Enc_9b0bc1, PredNewRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111011001; @@ -921,7 +920,7 @@ def A2_psubtnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32), "if ($Pu4.new) $Rd32 = sub($Rt32,$Rs32)", -tc_2b2f4060, TypeALU32_3op>, Enc_9b0bc1, PredNewRel { +tc_05c070ec, TypeALU32_3op>, Enc_9b0bc1, PredNewRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111011001; @@ -935,7 +934,7 @@ def A2_pxorf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pu4) $Rd32 = xor($Rs32,$Rt32)", -tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111001011; @@ -949,7 +948,7 @@ def A2_pxorfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pu4.new) $Rd32 = xor($Rs32,$Rt32)", -tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111001011; @@ -964,7 +963,7 @@ def A2_pxort : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pu4) $Rd32 = xor($Rs32,$Rt32)", -tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111001011; @@ -977,7 +976,7 @@ def A2_pxortnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pu4.new) $Rd32 = xor($Rs32,$Rt32)", -tc_2b2f4060, TypeALU32_3op>, Enc_ea4c54, PredNewRel { +tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111001011; @@ -991,7 +990,7 @@ def A2_roundsat : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = round($Rss32):sat", -tc_c2f7d806, TypeS_2op>, Enc_90cd8b { +tc_cf8126ae, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10001000110; let hasNewValue = 1; @@ -1003,7 +1002,7 @@ def A2_sat : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = sat($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_90cd8b { +tc_0ae0825c, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10001000110; let hasNewValue = 1; @@ -1014,7 +1013,7 @@ def A2_satb : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = satb($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_5e2823 { +tc_0ae0825c, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000111; let Inst{31-21} = 0b10001100110; let hasNewValue = 1; @@ -1025,7 +1024,7 @@ def A2_sath : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = sath($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_5e2823 { +tc_0ae0825c, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10001100110; let hasNewValue = 1; @@ -1036,7 +1035,7 @@ def A2_satub : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = satub($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_5e2823 { +tc_0ae0825c, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10001100110; let hasNewValue = 1; @@ -1047,7 +1046,7 @@ def A2_satuh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = satuh($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_5e2823 { +tc_0ae0825c, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000101; let Inst{31-21} = 0b10001100110; let hasNewValue = 1; @@ -1058,7 +1057,7 @@ def A2_sub : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32,$Rs32)", -tc_b9488031, TypeALU32_3op>, Enc_bd6011, PredNewRel, ImmRegRel { +tc_5a2711e5, TypeALU32_3op>, Enc_bd6011, PredNewRel, ImmRegRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110011001; @@ -1073,7 +1072,7 @@ def A2_subh_h16_hh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.h,$Rs32.h):<<16", -tc_897d1a9d, TypeALU64>, Enc_bd6011 { +tc_679309b8, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101011; @@ -1085,7 +1084,7 @@ def A2_subh_h16_hl : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.h,$Rs32.l):<<16", -tc_897d1a9d, TypeALU64>, Enc_bd6011 { +tc_679309b8, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101011; @@ -1097,7 +1096,7 @@ def A2_subh_h16_lh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.l,$Rs32.h):<<16", -tc_897d1a9d, TypeALU64>, Enc_bd6011 { +tc_679309b8, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101011; @@ -1109,7 +1108,7 @@ def A2_subh_h16_ll : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.l,$Rs32.l):<<16", -tc_897d1a9d, TypeALU64>, Enc_bd6011 { +tc_679309b8, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101011; @@ -1121,7 +1120,7 @@ def A2_subh_h16_sat_hh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.h,$Rs32.h):sat:<<16", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101011; @@ -1134,7 +1133,7 @@ def A2_subh_h16_sat_hl : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.h,$Rs32.l):sat:<<16", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101011; @@ -1147,7 +1146,7 @@ def A2_subh_h16_sat_lh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.l,$Rs32.h):sat:<<16", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101011; @@ -1160,7 +1159,7 @@ def A2_subh_h16_sat_ll : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.l,$Rs32.l):sat:<<16", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101011; @@ -1173,7 +1172,7 @@ def A2_subh_l16_hl : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.l,$Rs32.h)", -tc_1b9c9ee5, TypeALU64>, Enc_bd6011 { +tc_4414d8b1, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101001; @@ -1185,7 +1184,7 @@ def A2_subh_l16_ll : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.l,$Rs32.l)", -tc_1b9c9ee5, TypeALU64>, Enc_bd6011 { +tc_4414d8b1, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101001; @@ -1197,7 +1196,7 @@ def A2_subh_l16_sat_hl : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.l,$Rs32.h):sat", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101001; @@ -1210,7 +1209,7 @@ def A2_subh_l16_sat_ll : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32.l,$Rs32.l):sat", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101001; @@ -1223,7 +1222,7 @@ def A2_subp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = sub($Rtt32,$Rss32)", -tc_540fdfbc, TypeALU64>, Enc_ea23e4 { +tc_946df596, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011001; @@ -1232,7 +1231,7 @@ def A2_subri : HInst< (outs IntRegs:$Rd32), (ins s32_0Imm:$Ii, IntRegs:$Rs32), "$Rd32 = sub(#$Ii,$Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_140c83, PredNewRel, ImmRegRel { +tc_5a2711e5, TypeALU32_2op>, Enc_140c83, PredNewRel, ImmRegRel { let Inst{31-22} = 0b0111011001; let hasNewValue = 1; let opNewValue = 0; @@ -1248,7 +1247,7 @@ def A2_subsat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32,$Rs32):sat", -tc_5ba5997d, TypeALU32_3op>, Enc_bd6011 { +tc_61830035, TypeALU32_3op>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110110110; @@ -1262,7 +1261,7 @@ def A2_svaddh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = vaddh($Rs32,$Rt32)", -tc_b9488031, TypeALU32_3op>, Enc_5ab2be { +tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110110000; @@ -1275,7 +1274,7 @@ def A2_svaddhs : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = vaddh($Rs32,$Rt32):sat", -tc_5ba5997d, TypeALU32_3op>, Enc_5ab2be { +tc_61830035, TypeALU32_3op>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110110001; @@ -1290,7 +1289,7 @@ def A2_svadduhs : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = vadduh($Rs32,$Rt32):sat", -tc_5ba5997d, TypeALU32_3op>, Enc_5ab2be { +tc_61830035, TypeALU32_3op>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110110011; @@ -1305,12 +1304,13 @@ def A2_svavgh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = vavgh($Rs32,$Rt32)", -tc_b9488031, TypeALU32_3op>, Enc_5ab2be { +tc_1c80410a, TypeALU32_3op>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110111000; let hasNewValue = 1; let opNewValue = 0; +let prefersSlot3 = 1; let InputType = "reg"; let isCommutable = 1; } @@ -1318,12 +1318,13 @@ def A2_svavghs : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = vavgh($Rs32,$Rt32):rnd", -tc_8fe6b782, TypeALU32_3op>, Enc_5ab2be { +tc_d08ee0f4, TypeALU32_3op>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110111001; let hasNewValue = 1; let opNewValue = 0; +let prefersSlot3 = 1; let InputType = "reg"; let isCommutable = 1; } @@ -1331,19 +1332,20 @@ def A2_svnavgh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = vnavgh($Rt32,$Rs32)", -tc_b9488031, TypeALU32_3op>, Enc_bd6011 { +tc_1c80410a, TypeALU32_3op>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110111011; let hasNewValue = 1; let opNewValue = 0; +let prefersSlot3 = 1; let InputType = "reg"; } def A2_svsubh : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = vsubh($Rt32,$Rs32)", -tc_b9488031, TypeALU32_3op>, Enc_bd6011 { +tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110110100; @@ -1355,7 +1357,7 @@ def A2_svsubhs : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = vsubh($Rt32,$Rs32):sat", -tc_5ba5997d, TypeALU32_3op>, Enc_bd6011 { +tc_61830035, TypeALU32_3op>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110110101; @@ -1369,7 +1371,7 @@ def A2_svsubuhs : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = vsubuh($Rt32,$Rs32):sat", -tc_5ba5997d, TypeALU32_3op>, Enc_bd6011 { +tc_61830035, TypeALU32_3op>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110110111; @@ -1383,7 +1385,7 @@ def A2_swiz : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = swiz($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_5e2823 { +tc_0ae0825c, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000111; let Inst{31-21} = 0b10001100100; let hasNewValue = 1; @@ -1393,7 +1395,7 @@ def A2_sxtb : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = sxtb($Rs32)", -tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel { +tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01110000101; let hasNewValue = 1; @@ -1405,7 +1407,7 @@ def A2_sxth : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = sxth($Rs32)", -tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel { +tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01110000111; let hasNewValue = 1; @@ -1417,7 +1419,7 @@ def A2_sxtw : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = sxtw($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_3a3d62 { +tc_0ae0825c, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10000100010; } @@ -1425,7 +1427,7 @@ def A2_tfr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = $Rs32", -tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel { +tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01110000011; let hasNewValue = 1; @@ -1438,7 +1440,7 @@ def A2_tfrcrr : HInst< (outs IntRegs:$Rd32), (ins CtrRegs:$Cs32), "$Rd32 = $Cs32", -tc_29175780, TypeCR>, Enc_0cb018 { +tc_b9272d6c, TypeCR>, Enc_0cb018 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01101010000; let hasNewValue = 1; @@ -1448,7 +1450,7 @@ def A2_tfrf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4) $Rd32 = $Rs32", -tc_d6bf0472, TypeALU32_2op>, PredNewRel, ImmRegRel { +tc_4c5ba658, TypeALU32_2op>, PredNewRel, ImmRegRel { let isPredicated = 1; let isPredicatedFalse = 1; let hasNewValue = 1; @@ -1463,7 +1465,7 @@ def A2_tfrfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4.new) $Rd32 = $Rs32", -tc_2b2f4060, TypeALU32_2op>, PredNewRel, ImmRegRel { +tc_05c070ec, TypeALU32_2op>, PredNewRel, ImmRegRel { let isPredicated = 1; let isPredicatedFalse = 1; let hasNewValue = 1; @@ -1479,7 +1481,7 @@ def A2_tfrih : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, u16_0Imm:$Ii), "$Rx32.h = #$Ii", -tc_b9488031, TypeALU32_2op>, Enc_51436c { +tc_5a2711e5, TypeALU32_2op>, Enc_51436c { let Inst{21-21} = 0b1; let Inst{31-24} = 0b01110010; let hasNewValue = 1; @@ -1490,7 +1492,7 @@ def A2_tfril : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, u16_0Imm:$Ii), "$Rx32.l = #$Ii", -tc_b9488031, TypeALU32_2op>, Enc_51436c { +tc_5a2711e5, TypeALU32_2op>, Enc_51436c { let Inst{21-21} = 0b1; let Inst{31-24} = 0b01110001; let hasNewValue = 1; @@ -1501,7 +1503,7 @@ def A2_tfrp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = $Rss32", -tc_b9488031, TypeALU32_2op>, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, PredNewRel { let BaseOpcode = "A2_tfrp"; let isPredicable = 1; let isPseudo = 1; @@ -1510,7 +1512,7 @@ def A2_tfrpf : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pu4, DoubleRegs:$Rss32), "if (!$Pu4) $Rdd32 = $Rss32", -tc_b9488031, TypeALU32_2op>, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, PredNewRel { let isPredicated = 1; let isPredicatedFalse = 1; let BaseOpcode = "A2_tfrp"; @@ -1520,7 +1522,7 @@ def A2_tfrpfnew : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pu4, DoubleRegs:$Rss32), "if (!$Pu4.new) $Rdd32 = $Rss32", -tc_5f6847a1, TypeALU32_2op>, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, PredNewRel { let isPredicated = 1; let isPredicatedFalse = 1; let isPredicatedNew = 1; @@ -1531,7 +1533,7 @@ def A2_tfrpi : HInst< (outs DoubleRegs:$Rdd32), (ins s8_0Imm:$Ii), "$Rdd32 = #$Ii", -tc_b9488031, TypeALU64> { +tc_5a2711e5, TypeALU64> { let isReMaterializable = 1; let isAsCheapAsAMove = 1; let isMoveImm = 1; @@ -1541,7 +1543,7 @@ def A2_tfrpt : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pu4, DoubleRegs:$Rss32), "if ($Pu4) $Rdd32 = $Rss32", -tc_b9488031, TypeALU32_2op>, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, PredNewRel { let isPredicated = 1; let BaseOpcode = "A2_tfrp"; let isPseudo = 1; @@ -1550,7 +1552,7 @@ def A2_tfrptnew : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pu4, DoubleRegs:$Rss32), "if ($Pu4.new) $Rdd32 = $Rss32", -tc_5f6847a1, TypeALU32_2op>, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, PredNewRel { let isPredicated = 1; let isPredicatedNew = 1; let BaseOpcode = "A2_tfrp"; @@ -1560,7 +1562,7 @@ def A2_tfrrcr : HInst< (outs CtrRegs:$Cd32), (ins IntRegs:$Rs32), "$Cd32 = $Rs32", -tc_a21dc435, TypeCR>, Enc_bd811a { +tc_434c8e1e, TypeCR>, Enc_bd811a { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01100010001; let hasNewValue = 1; @@ -1570,7 +1572,7 @@ def A2_tfrsi : HInst< (outs IntRegs:$Rd32), (ins s32_0Imm:$Ii), "$Rd32 = #$Ii", -tc_68cb12ce, TypeALU32_2op>, Enc_5e87ce, PredNewRel, ImmRegRel { +tc_57890846, TypeALU32_2op>, Enc_5e87ce, PredNewRel, ImmRegRel { let Inst{21-21} = 0b0; let Inst{31-24} = 0b01111000; let hasNewValue = 1; @@ -1592,7 +1594,7 @@ def A2_tfrt : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4) $Rd32 = $Rs32", -tc_d6bf0472, TypeALU32_2op>, PredNewRel, ImmRegRel { +tc_4c5ba658, TypeALU32_2op>, PredNewRel, ImmRegRel { let isPredicated = 1; let hasNewValue = 1; let opNewValue = 0; @@ -1606,7 +1608,7 @@ def A2_tfrtnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4.new) $Rd32 = $Rs32", -tc_2b2f4060, TypeALU32_2op>, PredNewRel, ImmRegRel { +tc_05c070ec, TypeALU32_2op>, PredNewRel, ImmRegRel { let isPredicated = 1; let hasNewValue = 1; let opNewValue = 0; @@ -1621,7 +1623,7 @@ def A2_vabsh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = vabsh($Rss32)", -tc_c2f7d806, TypeS_2op>, Enc_b9c5fb { +tc_cf8126ae, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10000000010; let prefersSlot3 = 1; @@ -1630,7 +1632,7 @@ def A2_vabshsat : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = vabsh($Rss32):sat", -tc_c2f7d806, TypeS_2op>, Enc_b9c5fb { +tc_cf8126ae, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000101; let Inst{31-21} = 0b10000000010; let prefersSlot3 = 1; @@ -1640,7 +1642,7 @@ def A2_vabsw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = vabsw($Rss32)", -tc_c2f7d806, TypeS_2op>, Enc_b9c5fb { +tc_cf8126ae, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10000000010; let prefersSlot3 = 1; @@ -1649,7 +1651,7 @@ def A2_vabswsat : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = vabsw($Rss32):sat", -tc_c2f7d806, TypeS_2op>, Enc_b9c5fb { +tc_cf8126ae, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000111; let Inst{31-21} = 0b10000000010; let prefersSlot3 = 1; @@ -1659,7 +1661,7 @@ def A2_vaddb_map : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vaddb($Rss32,$Rtt32)", -tc_540fdfbc, TypeMAPPING> { +tc_946df596, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -1667,7 +1669,7 @@ def A2_vaddh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vaddh($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_946df596, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011000; @@ -1676,7 +1678,7 @@ def A2_vaddhs : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vaddh($Rss32,$Rtt32):sat", -tc_b44c6e2a, TypeALU64>, Enc_a56825 { +tc_779080bf, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011000; @@ -1687,7 +1689,7 @@ def A2_vaddub : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vaddub($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_946df596, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011000; @@ -1696,7 +1698,7 @@ def A2_vaddubs : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vaddub($Rss32,$Rtt32):sat", -tc_b44c6e2a, TypeALU64>, Enc_a56825 { +tc_779080bf, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011000; @@ -1707,7 +1709,7 @@ def A2_vadduhs : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vadduh($Rss32,$Rtt32):sat", -tc_b44c6e2a, TypeALU64>, Enc_a56825 { +tc_779080bf, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011000; @@ -1718,7 +1720,7 @@ def A2_vaddw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vaddw($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_946df596, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011000; @@ -1727,7 +1729,7 @@ def A2_vaddws : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vaddw($Rss32,$Rtt32):sat", -tc_b44c6e2a, TypeALU64>, Enc_a56825 { +tc_779080bf, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011000; @@ -1738,16 +1740,17 @@ def A2_vavgh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavgh($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_6132ba3d, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011010; +let prefersSlot3 = 1; } def A2_vavghcr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavgh($Rss32,$Rtt32):crnd", -tc_2b6f77c6, TypeALU64>, Enc_a56825 { +tc_002cb246, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011010; @@ -1757,79 +1760,87 @@ def A2_vavghr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavgh($Rss32,$Rtt32):rnd", -tc_dbdffe3d, TypeALU64>, Enc_a56825 { +tc_e4a7f9f0, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011010; +let prefersSlot3 = 1; } def A2_vavgub : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavgub($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_6132ba3d, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011010; +let prefersSlot3 = 1; } def A2_vavgubr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavgub($Rss32,$Rtt32):rnd", -tc_dbdffe3d, TypeALU64>, Enc_a56825 { +tc_e4a7f9f0, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011010; +let prefersSlot3 = 1; } def A2_vavguh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavguh($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_6132ba3d, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011010; +let prefersSlot3 = 1; } def A2_vavguhr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavguh($Rss32,$Rtt32):rnd", -tc_dbdffe3d, TypeALU64>, Enc_a56825 { +tc_e4a7f9f0, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011010; +let prefersSlot3 = 1; } def A2_vavguw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavguw($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_6132ba3d, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011011; +let prefersSlot3 = 1; } def A2_vavguwr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavguw($Rss32,$Rtt32):rnd", -tc_dbdffe3d, TypeALU64>, Enc_a56825 { +tc_e4a7f9f0, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011011; +let prefersSlot3 = 1; } def A2_vavgw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavgw($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_6132ba3d, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011011; +let prefersSlot3 = 1; } def A2_vavgwcr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavgw($Rss32,$Rtt32):crnd", -tc_2b6f77c6, TypeALU64>, Enc_a56825 { +tc_002cb246, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011011; @@ -1839,16 +1850,17 @@ def A2_vavgwr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vavgw($Rss32,$Rtt32):rnd", -tc_dbdffe3d, TypeALU64>, Enc_a56825 { +tc_e4a7f9f0, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011011; +let prefersSlot3 = 1; } def A2_vcmpbeq : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = vcmpb.eq($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b110000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010000; @@ -1857,7 +1869,7 @@ def A2_vcmpbgtu : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = vcmpb.gtu($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b111000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010000; @@ -1866,7 +1878,7 @@ def A2_vcmpheq : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = vcmph.eq($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b011000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010000; @@ -1875,7 +1887,7 @@ def A2_vcmphgt : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = vcmph.gt($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b100000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010000; @@ -1884,7 +1896,7 @@ def A2_vcmphgtu : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = vcmph.gtu($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b101000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010000; @@ -1893,7 +1905,7 @@ def A2_vcmpweq : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = vcmpw.eq($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010000; @@ -1902,7 +1914,7 @@ def A2_vcmpwgt : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = vcmpw.gt($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b001000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010000; @@ -1911,7 +1923,7 @@ def A2_vcmpwgtu : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = vcmpw.gtu($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b010000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010000; @@ -1920,7 +1932,7 @@ def A2_vconj : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = vconj($Rss32):sat", -tc_c2f7d806, TypeS_2op>, Enc_b9c5fb { +tc_cf8126ae, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000111; let Inst{31-21} = 0b10000000100; let prefersSlot3 = 1; @@ -1930,7 +1942,7 @@ def A2_vmaxb : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vmaxb($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011110; @@ -1940,7 +1952,7 @@ def A2_vmaxh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vmaxh($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011110; @@ -1950,7 +1962,7 @@ def A2_vmaxub : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vmaxub($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011110; @@ -1960,7 +1972,7 @@ def A2_vmaxuh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vmaxuh($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011110; @@ -1970,7 +1982,7 @@ def A2_vmaxuw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vmaxuw($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011101; @@ -1980,7 +1992,7 @@ def A2_vmaxw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vmaxw($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011110; @@ -1990,7 +2002,7 @@ def A2_vminb : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vminb($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011110; @@ -2000,7 +2012,7 @@ def A2_vminh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vminh($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011101; @@ -2010,7 +2022,7 @@ def A2_vminub : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vminub($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011101; @@ -2020,7 +2032,7 @@ def A2_vminuh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vminuh($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011101; @@ -2030,7 +2042,7 @@ def A2_vminuw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vminuw($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011101; @@ -2040,7 +2052,7 @@ def A2_vminw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vminw($Rtt32,$Rss32)", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011101; @@ -2050,16 +2062,17 @@ def A2_vnavgh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vnavgh($Rtt32,$Rss32)", -tc_540fdfbc, TypeALU64>, Enc_ea23e4 { +tc_6132ba3d, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011100; +let prefersSlot3 = 1; } def A2_vnavghcr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vnavgh($Rtt32,$Rss32):crnd:sat", -tc_2b6f77c6, TypeALU64>, Enc_ea23e4 { +tc_002cb246, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011100; @@ -2070,7 +2083,7 @@ def A2_vnavghr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vnavgh($Rtt32,$Rss32):rnd:sat", -tc_2b6f77c6, TypeALU64>, Enc_ea23e4 { +tc_002cb246, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011100; @@ -2081,16 +2094,17 @@ def A2_vnavgw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vnavgw($Rtt32,$Rss32)", -tc_540fdfbc, TypeALU64>, Enc_ea23e4 { +tc_6132ba3d, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011100; +let prefersSlot3 = 1; } def A2_vnavgwcr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vnavgw($Rtt32,$Rss32):crnd:sat", -tc_2b6f77c6, TypeALU64>, Enc_ea23e4 { +tc_002cb246, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011100; @@ -2101,7 +2115,7 @@ def A2_vnavgwr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vnavgw($Rtt32,$Rss32):rnd:sat", -tc_2b6f77c6, TypeALU64>, Enc_ea23e4 { +tc_002cb246, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011100; @@ -2112,7 +2126,7 @@ def A2_vraddub : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vraddub($Rss32,$Rtt32)", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000010; @@ -2122,7 +2136,7 @@ def A2_vraddub_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vraddub($Rss32,$Rtt32)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010010; @@ -2133,7 +2147,7 @@ def A2_vrsadub : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrsadub($Rss32,$Rtt32)", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000010; @@ -2143,7 +2157,7 @@ def A2_vrsadub_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrsadub($Rss32,$Rtt32)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010010; @@ -2154,7 +2168,7 @@ def A2_vsubb_map : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vsubb($Rss32,$Rtt32)", -tc_540fdfbc, TypeMAPPING> { +tc_946df596, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -2162,7 +2176,7 @@ def A2_vsubh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vsubh($Rtt32,$Rss32)", -tc_540fdfbc, TypeALU64>, Enc_ea23e4 { +tc_946df596, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011001; @@ -2171,7 +2185,7 @@ def A2_vsubhs : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vsubh($Rtt32,$Rss32):sat", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011001; @@ -2182,7 +2196,7 @@ def A2_vsubub : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vsubub($Rtt32,$Rss32)", -tc_540fdfbc, TypeALU64>, Enc_ea23e4 { +tc_946df596, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011001; @@ -2191,7 +2205,7 @@ def A2_vsububs : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vsubub($Rtt32,$Rss32):sat", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011001; @@ -2202,7 +2216,7 @@ def A2_vsubuhs : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vsubuh($Rtt32,$Rss32):sat", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011001; @@ -2213,7 +2227,7 @@ def A2_vsubw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vsubw($Rtt32,$Rss32)", -tc_540fdfbc, TypeALU64>, Enc_ea23e4 { +tc_946df596, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011001; @@ -2222,7 +2236,7 @@ def A2_vsubws : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vsubw($Rtt32,$Rss32):sat", -tc_b44c6e2a, TypeALU64>, Enc_ea23e4 { +tc_779080bf, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011001; @@ -2233,7 +2247,7 @@ def A2_xor : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = xor($Rs32,$Rt32)", -tc_b9488031, TypeALU32_3op>, Enc_5ab2be, PredNewRel { +tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110001011; @@ -2248,7 +2262,7 @@ def A2_xorp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = xor($Rss32,$Rtt32)", -tc_540fdfbc, TypeALU64>, Enc_a56825 { +tc_946df596, TypeALU64>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011111; @@ -2258,7 +2272,7 @@ def A2_zxtb : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = zxtb($Rs32)", -tc_b9488031, TypeALU32_2op>, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, PredNewRel { let hasNewValue = 1; let opNewValue = 0; let BaseOpcode = "A2_zxtb"; @@ -2270,7 +2284,7 @@ def A2_zxth : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = zxth($Rs32)", -tc_68cb12ce, TypeALU32_2op>, Enc_5e2823, PredNewRel { +tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01110000110; let hasNewValue = 1; @@ -2282,7 +2296,7 @@ def A4_addp_c : HInst< (outs DoubleRegs:$Rdd32, PredRegs:$Px4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in), "$Rdd32 = add($Rss32,$Rtt32,$Px4):carry", -tc_523fcf30, TypeS_3op>, Enc_2b3f60 { +tc_9c3ecd83, TypeS_3op>, Enc_2b3f60 { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000010110; @@ -2293,7 +2307,7 @@ def A4_andn : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = and($Rt32,~$Rs32)", -tc_b9488031, TypeALU32_3op>, Enc_bd6011 { +tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110001100; @@ -2305,7 +2319,7 @@ def A4_andnp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = and($Rtt32,~$Rss32)", -tc_540fdfbc, TypeALU64>, Enc_ea23e4 { +tc_946df596, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011111; @@ -2314,7 +2328,7 @@ def A4_bitsplit : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = bitsplit($Rs32,$Rt32)", -tc_1b9c9ee5, TypeALU64>, Enc_be32a5 { +tc_4414d8b1, TypeALU64>, Enc_be32a5 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010100001; @@ -2324,7 +2338,7 @@ def A4_bitspliti : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rdd32 = bitsplit($Rs32,#$Ii)", -tc_1b9c9ee5, TypeS_2op>, Enc_311abd { +tc_4414d8b1, TypeS_2op>, Enc_311abd { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001000110; @@ -2334,14 +2348,14 @@ def A4_boundscheck : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, DoubleRegs:$Rtt32), "$Pd4 = boundscheck($Rs32,$Rtt32)", -tc_1e856f58, TypeALU64> { +tc_85d5d03f, TypeALU64> { let isPseudo = 1; } def A4_boundscheck_hi : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = boundscheck($Rss32,$Rtt32):raw:hi", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b101000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11010010000; @@ -2350,7 +2364,7 @@ def A4_boundscheck_lo : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = boundscheck($Rss32,$Rtt32):raw:lo", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b100000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11010010000; @@ -2359,7 +2373,7 @@ def A4_cmpbeq : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = cmpb.eq($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b110000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111110; @@ -2372,7 +2386,7 @@ def A4_cmpbeqi : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, u8_0Imm:$Ii), "$Pd4 = cmpb.eq($Rs32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_08d755, ImmRegRel { +tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel { let Inst{4-2} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11011101000; @@ -2385,7 +2399,7 @@ def A4_cmpbgt : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = cmpb.gt($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b010000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111110; @@ -2397,7 +2411,7 @@ def A4_cmpbgti : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, s8_0Imm:$Ii), "$Pd4 = cmpb.gt($Rs32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_08d755, ImmRegRel { +tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel { let Inst{4-2} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11011101001; @@ -2409,7 +2423,7 @@ def A4_cmpbgtu : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = cmpb.gtu($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b111000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111110; @@ -2421,7 +2435,7 @@ def A4_cmpbgtui : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, u32_0Imm:$Ii), "$Pd4 = cmpb.gtu($Rs32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_02553a, ImmRegRel { +tc_643b4717, TypeALU64>, Enc_02553a, ImmRegRel { let Inst{4-2} = 0b000; let Inst{13-12} = 0b00; let Inst{31-21} = 0b11011101010; @@ -2438,7 +2452,7 @@ def A4_cmpheq : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = cmph.eq($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b011000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111110; @@ -2451,7 +2465,7 @@ def A4_cmpheqi : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Pd4 = cmph.eq($Rs32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_08d755, ImmRegRel { +tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel { let Inst{4-2} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11011101000; @@ -2469,7 +2483,7 @@ def A4_cmphgt : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = cmph.gt($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b100000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111110; @@ -2481,7 +2495,7 @@ def A4_cmphgti : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Pd4 = cmph.gt($Rs32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_08d755, ImmRegRel { +tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel { let Inst{4-2} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11011101001; @@ -2498,7 +2512,7 @@ def A4_cmphgtu : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = cmph.gtu($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e, ImmRegRel { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b101000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111110; @@ -2510,7 +2524,7 @@ def A4_cmphgtui : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, u32_0Imm:$Ii), "$Pd4 = cmph.gtu($Rs32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_02553a, ImmRegRel { +tc_643b4717, TypeALU64>, Enc_02553a, ImmRegRel { let Inst{4-2} = 0b010; let Inst{13-12} = 0b00; let Inst{31-21} = 0b11011101010; @@ -2527,7 +2541,7 @@ def A4_combineii : HInst< (outs DoubleRegs:$Rdd32), (ins s8_0Imm:$Ii, u32_0Imm:$II), "$Rdd32 = combine(#$Ii,#$II)", -tc_b9488031, TypeALU32_2op>, Enc_f0cca7 { +tc_5a2711e5, TypeALU32_2op>, Enc_f0cca7 { let Inst{31-21} = 0b01111100100; let isExtendable = 1; let opExtendable = 2; @@ -2539,7 +2553,7 @@ def A4_combineir : HInst< (outs DoubleRegs:$Rdd32), (ins s32_0Imm:$Ii, IntRegs:$Rs32), "$Rdd32 = combine(#$Ii,$Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_9cdba7 { +tc_5a2711e5, TypeALU32_2op>, Enc_9cdba7 { let Inst{13-13} = 0b1; let Inst{31-21} = 0b01110011001; let isExtendable = 1; @@ -2552,7 +2566,7 @@ def A4_combineri : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rdd32 = combine($Rs32,#$Ii)", -tc_b9488031, TypeALU32_2op>, Enc_9cdba7 { +tc_5a2711e5, TypeALU32_2op>, Enc_9cdba7 { let Inst{13-13} = 0b1; let Inst{31-21} = 0b01110011000; let isExtendable = 1; @@ -2565,7 +2579,7 @@ def A4_cround_ri : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = cround($Rs32,#$Ii)", -tc_2b6f77c6, TypeS_2op>, Enc_a05677 { +tc_002cb246, TypeS_2op>, Enc_a05677 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100111; @@ -2577,7 +2591,7 @@ def A4_cround_rr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = cround($Rs32,$Rt32)", -tc_2b6f77c6, TypeS_3op>, Enc_5ab2be { +tc_002cb246, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110110; @@ -2589,14 +2603,14 @@ def A4_ext : HInst< (outs), (ins u26_6Imm:$Ii), "immext(#$Ii)", -tc_452f85af, TypeEXTENDER>, Enc_2b518f { +tc_862b3e70, TypeEXTENDER>, Enc_2b518f { let Inst{31-28} = 0b0000; } def A4_modwrapu : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = modwrap($Rs32,$Rt32)", -tc_b44c6e2a, TypeALU64>, Enc_5ab2be { +tc_779080bf, TypeALU64>, Enc_5ab2be { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011111; @@ -2608,7 +2622,7 @@ def A4_orn : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = or($Rt32,~$Rs32)", -tc_b9488031, TypeALU32_3op>, Enc_bd6011 { +tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110001101; @@ -2620,7 +2634,7 @@ def A4_ornp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = or($Rtt32,~$Rss32)", -tc_540fdfbc, TypeALU64>, Enc_ea23e4 { +tc_946df596, TypeALU64>, Enc_ea23e4 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010011111; @@ -2629,7 +2643,7 @@ def A4_paslhf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4) $Rd32 = aslh($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1010; let Inst{31-21} = 0b01110000000; @@ -2643,7 +2657,7 @@ def A4_paslhfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4.new) $Rd32 = aslh($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1011; let Inst{31-21} = 0b01110000000; @@ -2658,7 +2672,7 @@ def A4_paslht : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4) $Rd32 = aslh($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1000; let Inst{31-21} = 0b01110000000; @@ -2671,7 +2685,7 @@ def A4_paslhtnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4.new) $Rd32 = aslh($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1001; let Inst{31-21} = 0b01110000000; @@ -2685,7 +2699,7 @@ def A4_pasrhf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4) $Rd32 = asrh($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1010; let Inst{31-21} = 0b01110000001; @@ -2699,7 +2713,7 @@ def A4_pasrhfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4.new) $Rd32 = asrh($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1011; let Inst{31-21} = 0b01110000001; @@ -2714,7 +2728,7 @@ def A4_pasrht : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4) $Rd32 = asrh($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1000; let Inst{31-21} = 0b01110000001; @@ -2727,7 +2741,7 @@ def A4_pasrhtnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4.new) $Rd32 = asrh($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1001; let Inst{31-21} = 0b01110000001; @@ -2741,7 +2755,7 @@ def A4_psxtbf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4) $Rd32 = sxtb($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1010; let Inst{31-21} = 0b01110000101; @@ -2755,7 +2769,7 @@ def A4_psxtbfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4.new) $Rd32 = sxtb($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1011; let Inst{31-21} = 0b01110000101; @@ -2770,7 +2784,7 @@ def A4_psxtbt : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4) $Rd32 = sxtb($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1000; let Inst{31-21} = 0b01110000101; @@ -2783,7 +2797,7 @@ def A4_psxtbtnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4.new) $Rd32 = sxtb($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1001; let Inst{31-21} = 0b01110000101; @@ -2797,7 +2811,7 @@ def A4_psxthf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4) $Rd32 = sxth($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1010; let Inst{31-21} = 0b01110000111; @@ -2811,7 +2825,7 @@ def A4_psxthfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4.new) $Rd32 = sxth($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1011; let Inst{31-21} = 0b01110000111; @@ -2826,7 +2840,7 @@ def A4_psxtht : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4) $Rd32 = sxth($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1000; let Inst{31-21} = 0b01110000111; @@ -2839,7 +2853,7 @@ def A4_psxthtnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4.new) $Rd32 = sxth($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1001; let Inst{31-21} = 0b01110000111; @@ -2853,7 +2867,7 @@ def A4_pzxtbf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4) $Rd32 = zxtb($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1010; let Inst{31-21} = 0b01110000100; @@ -2867,7 +2881,7 @@ def A4_pzxtbfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4.new) $Rd32 = zxtb($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1011; let Inst{31-21} = 0b01110000100; @@ -2882,7 +2896,7 @@ def A4_pzxtbt : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4) $Rd32 = zxtb($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1000; let Inst{31-21} = 0b01110000100; @@ -2895,7 +2909,7 @@ def A4_pzxtbtnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4.new) $Rd32 = zxtb($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1001; let Inst{31-21} = 0b01110000100; @@ -2909,7 +2923,7 @@ def A4_pzxthf : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4) $Rd32 = zxth($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1010; let Inst{31-21} = 0b01110000110; @@ -2923,7 +2937,7 @@ def A4_pzxthfnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4.new) $Rd32 = zxth($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1011; let Inst{31-21} = 0b01110000110; @@ -2938,7 +2952,7 @@ def A4_pzxtht : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4) $Rd32 = zxth($Rs32)", -tc_b9488031, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1000; let Inst{31-21} = 0b01110000110; @@ -2951,7 +2965,7 @@ def A4_pzxthtnew : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4.new) $Rd32 = zxth($Rs32)", -tc_5f6847a1, TypeALU32_2op>, Enc_fb6577, PredNewRel { +tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1001; let Inst{31-21} = 0b01110000110; @@ -2965,7 +2979,7 @@ def A4_rcmpeq : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = cmp.eq($Rs32,$Rt32)", -tc_b9488031, TypeALU32_3op>, Enc_5ab2be, ImmRegRel { +tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, ImmRegRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110011010; @@ -2979,7 +2993,7 @@ def A4_rcmpeqi : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = cmp.eq($Rs32,#$Ii)", -tc_b9488031, TypeALU32_2op>, Enc_b8c967, ImmRegRel { +tc_5a2711e5, TypeALU32_2op>, Enc_b8c967, ImmRegRel { let Inst{13-13} = 0b1; let Inst{31-21} = 0b01110011010; let hasNewValue = 1; @@ -2996,7 +3010,7 @@ def A4_rcmpneq : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = !cmp.eq($Rs32,$Rt32)", -tc_b9488031, TypeALU32_3op>, Enc_5ab2be, ImmRegRel { +tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, ImmRegRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110011011; @@ -3010,7 +3024,7 @@ def A4_rcmpneqi : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = !cmp.eq($Rs32,#$Ii)", -tc_b9488031, TypeALU32_2op>, Enc_b8c967, ImmRegRel { +tc_5a2711e5, TypeALU32_2op>, Enc_b8c967, ImmRegRel { let Inst{13-13} = 0b1; let Inst{31-21} = 0b01110011011; let hasNewValue = 1; @@ -3027,7 +3041,7 @@ def A4_round_ri : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = round($Rs32,#$Ii)", -tc_2b6f77c6, TypeS_2op>, Enc_a05677 { +tc_002cb246, TypeS_2op>, Enc_a05677 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100111; @@ -3039,7 +3053,7 @@ def A4_round_ri_sat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = round($Rs32,#$Ii):sat", -tc_2b6f77c6, TypeS_2op>, Enc_a05677 { +tc_002cb246, TypeS_2op>, Enc_a05677 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100111; @@ -3052,7 +3066,7 @@ def A4_round_rr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = round($Rs32,$Rt32)", -tc_2b6f77c6, TypeS_3op>, Enc_5ab2be { +tc_002cb246, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110110; @@ -3064,7 +3078,7 @@ def A4_round_rr_sat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = round($Rs32,$Rt32):sat", -tc_2b6f77c6, TypeS_3op>, Enc_5ab2be { +tc_002cb246, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110110; @@ -3077,7 +3091,7 @@ def A4_subp_c : HInst< (outs DoubleRegs:$Rdd32, PredRegs:$Px4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in), "$Rdd32 = sub($Rss32,$Rtt32,$Px4):carry", -tc_523fcf30, TypeS_3op>, Enc_2b3f60 { +tc_9c3ecd83, TypeS_3op>, Enc_2b3f60 { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000010111; @@ -3088,7 +3102,7 @@ def A4_tfrcpp : HInst< (outs DoubleRegs:$Rdd32), (ins CtrRegs64:$Css32), "$Rdd32 = $Css32", -tc_29175780, TypeCR>, Enc_667b39 { +tc_b9272d6c, TypeCR>, Enc_667b39 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01101000000; } @@ -3096,7 +3110,7 @@ def A4_tfrpcp : HInst< (outs CtrRegs64:$Cdd32), (ins DoubleRegs:$Rss32), "$Cdd32 = $Rss32", -tc_a21dc435, TypeCR>, Enc_0ed752 { +tc_434c8e1e, TypeCR>, Enc_0ed752 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01100011001; } @@ -3104,7 +3118,7 @@ def A4_tlbmatch : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Pd4 = tlbmatch($Rss32,$Rt32)", -tc_04c9decc, TypeALU64>, Enc_03833b { +tc_4837eefb, TypeALU64>, Enc_03833b { let Inst{7-2} = 0b011000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11010010000; @@ -3114,7 +3128,7 @@ def A4_vcmpbeq_any : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = any8(vcmpb.eq($Rss32,$Rtt32))", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11010010000; @@ -3123,7 +3137,7 @@ def A4_vcmpbeqi : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, u8_0Imm:$Ii), "$Pd4 = vcmpb.eq($Rss32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_0d8adb { +tc_643b4717, TypeALU64>, Enc_0d8adb { let Inst{4-2} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11011100000; @@ -3132,7 +3146,7 @@ def A4_vcmpbgt : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = vcmpb.gt($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b010000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11010010000; @@ -3141,7 +3155,7 @@ def A4_vcmpbgti : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, s8_0Imm:$Ii), "$Pd4 = vcmpb.gt($Rss32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_0d8adb { +tc_643b4717, TypeALU64>, Enc_0d8adb { let Inst{4-2} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11011100001; @@ -3150,7 +3164,7 @@ def A4_vcmpbgtui : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, u7_0Imm:$Ii), "$Pd4 = vcmpb.gtu($Rss32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_3680c2 { +tc_643b4717, TypeALU64>, Enc_3680c2 { let Inst{4-2} = 0b000; let Inst{13-12} = 0b00; let Inst{31-21} = 0b11011100010; @@ -3159,7 +3173,7 @@ def A4_vcmpheqi : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, s8_0Imm:$Ii), "$Pd4 = vcmph.eq($Rss32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_0d8adb { +tc_643b4717, TypeALU64>, Enc_0d8adb { let Inst{4-2} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11011100000; @@ -3168,7 +3182,7 @@ def A4_vcmphgti : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, s8_0Imm:$Ii), "$Pd4 = vcmph.gt($Rss32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_0d8adb { +tc_643b4717, TypeALU64>, Enc_0d8adb { let Inst{4-2} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11011100001; @@ -3177,7 +3191,7 @@ def A4_vcmphgtui : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, u7_0Imm:$Ii), "$Pd4 = vcmph.gtu($Rss32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_3680c2 { +tc_643b4717, TypeALU64>, Enc_3680c2 { let Inst{4-2} = 0b010; let Inst{13-12} = 0b00; let Inst{31-21} = 0b11011100010; @@ -3186,7 +3200,7 @@ def A4_vcmpweqi : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, s8_0Imm:$Ii), "$Pd4 = vcmpw.eq($Rss32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_0d8adb { +tc_643b4717, TypeALU64>, Enc_0d8adb { let Inst{4-2} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11011100000; @@ -3195,7 +3209,7 @@ def A4_vcmpwgti : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, s8_0Imm:$Ii), "$Pd4 = vcmpw.gt($Rss32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_0d8adb { +tc_643b4717, TypeALU64>, Enc_0d8adb { let Inst{4-2} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11011100001; @@ -3204,7 +3218,7 @@ def A4_vcmpwgtui : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, u7_0Imm:$Ii), "$Pd4 = vcmpw.gtu($Rss32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_3680c2 { +tc_643b4717, TypeALU64>, Enc_3680c2 { let Inst{4-2} = 0b100; let Inst{13-12} = 0b00; let Inst{31-21} = 0b11011100010; @@ -3213,7 +3227,7 @@ def A4_vrmaxh : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32), "$Rxx32 = vrmaxh($Rss32,$Ru32)", -tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 { +tc_5b54b33f, TypeS_3op>, Enc_412ff0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011001; @@ -3224,7 +3238,7 @@ def A4_vrmaxuh : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32), "$Rxx32 = vrmaxuh($Rss32,$Ru32)", -tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 { +tc_5b54b33f, TypeS_3op>, Enc_412ff0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11001011001; @@ -3235,7 +3249,7 @@ def A4_vrmaxuw : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32), "$Rxx32 = vrmaxuw($Rss32,$Ru32)", -tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 { +tc_5b54b33f, TypeS_3op>, Enc_412ff0 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11001011001; @@ -3246,7 +3260,7 @@ def A4_vrmaxw : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32), "$Rxx32 = vrmaxw($Rss32,$Ru32)", -tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 { +tc_5b54b33f, TypeS_3op>, Enc_412ff0 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011001; @@ -3257,7 +3271,7 @@ def A4_vrminh : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32), "$Rxx32 = vrminh($Rss32,$Ru32)", -tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 { +tc_5b54b33f, TypeS_3op>, Enc_412ff0 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011001; @@ -3268,7 +3282,7 @@ def A4_vrminuh : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32), "$Rxx32 = vrminuh($Rss32,$Ru32)", -tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 { +tc_5b54b33f, TypeS_3op>, Enc_412ff0 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11001011001; @@ -3279,7 +3293,7 @@ def A4_vrminuw : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32), "$Rxx32 = vrminuw($Rss32,$Ru32)", -tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 { +tc_5b54b33f, TypeS_3op>, Enc_412ff0 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11001011001; @@ -3290,7 +3304,7 @@ def A4_vrminw : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32), "$Rxx32 = vrminw($Rss32,$Ru32)", -tc_c6ce9b3f, TypeS_3op>, Enc_412ff0 { +tc_5b54b33f, TypeS_3op>, Enc_412ff0 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011001; @@ -3301,7 +3315,7 @@ def A5_ACS : HInst< (outs DoubleRegs:$Rxx32, PredRegs:$Pe4), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32,$Pe4 = vacsh($Rss32,$Rtt32)", -tc_caaebcba, TypeM>, Enc_831a7d, Requires<[HasV55]> { +tc_d1aa9eaa, TypeM>, Enc_831a7d, Requires<[HasV55]> { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010101; @@ -3314,7 +3328,7 @@ def A5_vaddhubs : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rd32 = vaddhub($Rss32,$Rtt32):sat", -tc_2b6f77c6, TypeS_3op>, Enc_d2216a { +tc_002cb246, TypeS_3op>, Enc_d2216a { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001010; @@ -3327,7 +3341,7 @@ def A6_vcmpbeq_notany : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = !any8(vcmpb.eq($Rss32,$Rtt32))", -tc_55050d58, TypeALU64>, Enc_fcf7a7, Requires<[HasV65]> { +tc_1fc97744, TypeALU64>, Enc_fcf7a7, Requires<[HasV65]> { let Inst{7-2} = 0b001000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11010010000; @@ -3336,7 +3350,7 @@ def A6_vminub_RdP : HInst< (outs DoubleRegs:$Rdd32, PredRegs:$Pe4), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32,$Pe4 = vminub($Rtt32,$Rss32)", -tc_ef84f62f, TypeM>, Enc_d2c7f1, Requires<[HasV62]> { +tc_f9058dd7, TypeM>, Enc_d2c7f1, Requires<[HasV62]> { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010111; @@ -3347,7 +3361,7 @@ def C2_all8 : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4), "$Pd4 = all8($Ps4)", -tc_f2704b9a, TypeCR>, Enc_65d691 { +tc_de554571, TypeCR>, Enc_65d691 { let Inst{13-2} = 0b000000000000; let Inst{31-18} = 0b01101011101000; } @@ -3355,7 +3369,7 @@ def C2_and : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Pt4, PredRegs:$Ps4), "$Pd4 = and($Pt4,$Ps4)", -tc_53bc8a6a, TypeCR>, Enc_454a26 { +tc_640086b5, TypeCR>, Enc_454a26 { let Inst{7-2} = 0b000000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011000000; @@ -3364,7 +3378,7 @@ def C2_andn : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Pt4, PredRegs:$Ps4), "$Pd4 = and($Pt4,!$Ps4)", -tc_53bc8a6a, TypeCR>, Enc_454a26 { +tc_640086b5, TypeCR>, Enc_454a26 { let Inst{7-2} = 0b000000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011011000; @@ -3373,7 +3387,7 @@ def C2_any8 : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4), "$Pd4 = any8($Ps4)", -tc_f2704b9a, TypeCR>, Enc_65d691 { +tc_de554571, TypeCR>, Enc_65d691 { let Inst{13-2} = 0b000000000000; let Inst{31-18} = 0b01101011100000; } @@ -3381,7 +3395,7 @@ def C2_bitsclr : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = bitsclr($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111100; @@ -3390,7 +3404,7 @@ def C2_bitsclri : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, u6_0Imm:$Ii), "$Pd4 = bitsclr($Rs32,#$Ii)", -tc_7a830544, TypeS_2op>, Enc_5d6c34 { +tc_643b4717, TypeS_2op>, Enc_5d6c34 { let Inst{7-2} = 0b000000; let Inst{31-21} = 0b10000101100; } @@ -3398,7 +3412,7 @@ def C2_bitsset : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = bitsset($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111010; @@ -3407,7 +3421,7 @@ def C2_ccombinewf : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pu4) $Rdd32 = combine($Rs32,$Rt32)", -tc_d6bf0472, TypeALU32_3op>, Enc_cb4b4e, PredNewRel { +tc_4c5ba658, TypeALU32_3op>, Enc_cb4b4e, PredNewRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111101000; @@ -3419,7 +3433,7 @@ def C2_ccombinewnewf : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pu4.new) $Rdd32 = combine($Rs32,$Rt32)", -tc_2b2f4060, TypeALU32_3op>, Enc_cb4b4e, PredNewRel { +tc_05c070ec, TypeALU32_3op>, Enc_cb4b4e, PredNewRel { let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111101000; @@ -3432,7 +3446,7 @@ def C2_ccombinewnewt : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pu4.new) $Rdd32 = combine($Rs32,$Rt32)", -tc_2b2f4060, TypeALU32_3op>, Enc_cb4b4e, PredNewRel { +tc_05c070ec, TypeALU32_3op>, Enc_cb4b4e, PredNewRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11111101000; @@ -3444,7 +3458,7 @@ def C2_ccombinewt : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pu4) $Rdd32 = combine($Rs32,$Rt32)", -tc_d6bf0472, TypeALU32_3op>, Enc_cb4b4e, PredNewRel { +tc_4c5ba658, TypeALU32_3op>, Enc_cb4b4e, PredNewRel { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11111101000; @@ -3455,7 +3469,7 @@ def C2_cmoveif : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if (!$Pu4) $Rd32 = #$Ii", -tc_b9488031, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel { +tc_5a2711e5, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel { let Inst{13-13} = 0b0; let Inst{20-20} = 0b0; let Inst{31-23} = 0b011111101; @@ -3477,7 +3491,7 @@ def C2_cmoveit : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if ($Pu4) $Rd32 = #$Ii", -tc_b9488031, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel { +tc_5a2711e5, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel { let Inst{13-13} = 0b0; let Inst{20-20} = 0b0; let Inst{31-23} = 0b011111100; @@ -3498,7 +3512,7 @@ def C2_cmovenewif : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if (!$Pu4.new) $Rd32 = #$Ii", -tc_5f6847a1, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel { +tc_1ae57e39, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel { let Inst{13-13} = 0b1; let Inst{20-20} = 0b0; let Inst{31-23} = 0b011111101; @@ -3521,7 +3535,7 @@ def C2_cmovenewit : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if ($Pu4.new) $Rd32 = #$Ii", -tc_5f6847a1, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel { +tc_1ae57e39, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel { let Inst{13-13} = 0b1; let Inst{20-20} = 0b0; let Inst{31-23} = 0b011111100; @@ -3543,7 +3557,7 @@ def C2_cmpeq : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = cmp.eq($Rs32,$Rt32)", -tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { +tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110010000; @@ -3556,7 +3570,7 @@ def C2_cmpeqi : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Pd4 = cmp.eq($Rs32,#$Ii)", -tc_6ebb4a12, TypeALU32_2op>, Enc_bd0b33, ImmRegRel { +tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel { let Inst{4-2} = 0b000; let Inst{31-22} = 0b0111010100; let CextOpcode = "C2_cmpeq"; @@ -3572,7 +3586,7 @@ def C2_cmpeqp : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = cmp.eq($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010100; @@ -3583,7 +3597,7 @@ def C2_cmpgei : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, s8_0Imm:$Ii), "$Pd4 = cmp.ge($Rs32,#$Ii)", -tc_6ebb4a12, TypeALU32_2op> { +tc_56f114f4, TypeALU32_2op> { let isCompare = 1; let isPseudo = 1; } @@ -3591,7 +3605,7 @@ def C2_cmpgeui : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, u8_0Imm:$Ii), "$Pd4 = cmp.geu($Rs32,#$Ii)", -tc_6ebb4a12, TypeALU32_2op> { +tc_56f114f4, TypeALU32_2op> { let isCompare = 1; let isPseudo = 1; } @@ -3599,7 +3613,7 @@ def C2_cmpgt : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = cmp.gt($Rs32,$Rt32)", -tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { +tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110010010; @@ -3611,7 +3625,7 @@ def C2_cmpgti : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Pd4 = cmp.gt($Rs32,#$Ii)", -tc_6ebb4a12, TypeALU32_2op>, Enc_bd0b33, ImmRegRel { +tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel { let Inst{4-2} = 0b000; let Inst{31-22} = 0b0111010101; let CextOpcode = "C2_cmpgt"; @@ -3627,7 +3641,7 @@ def C2_cmpgtp : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = cmp.gt($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b010000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010100; @@ -3637,7 +3651,7 @@ def C2_cmpgtu : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = cmp.gtu($Rs32,$Rt32)", -tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { +tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110010011; @@ -3649,7 +3663,7 @@ def C2_cmpgtui : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, u32_0Imm:$Ii), "$Pd4 = cmp.gtu($Rs32,#$Ii)", -tc_6ebb4a12, TypeALU32_2op>, Enc_c0cdde, ImmRegRel { +tc_56f114f4, TypeALU32_2op>, Enc_c0cdde, ImmRegRel { let Inst{4-2} = 0b000; let Inst{31-21} = 0b01110101100; let CextOpcode = "C2_cmpgtu"; @@ -3665,7 +3679,7 @@ def C2_cmpgtup : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = cmp.gtu($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b100000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010100; @@ -3675,7 +3689,7 @@ def C2_cmplt : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = cmp.lt($Rs32,$Rt32)", -tc_6ebb4a12, TypeALU32_3op> { +tc_56f114f4, TypeALU32_3op> { let isCompare = 1; let isPseudo = 1; let isCodeGenOnly = 1; @@ -3684,7 +3698,7 @@ def C2_cmpltu : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = cmp.ltu($Rs32,$Rt32)", -tc_6ebb4a12, TypeALU32_3op> { +tc_56f114f4, TypeALU32_3op> { let isCompare = 1; let isPseudo = 1; let isCodeGenOnly = 1; @@ -3693,7 +3707,7 @@ def C2_mask : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4), "$Rdd32 = mask($Pt4)", -tc_cde8b071, TypeS_2op>, Enc_78e566 { +tc_0ae0825c, TypeS_2op>, Enc_78e566 { let Inst{7-5} = 0b000; let Inst{13-10} = 0b0000; let Inst{31-16} = 0b1000011000000000; @@ -3702,7 +3716,7 @@ def C2_mux : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mux($Pu4,$Rs32,$Rt32)", -tc_d6bf0472, TypeALU32_3op>, Enc_ea4c54 { +tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54 { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110100000; @@ -3714,7 +3728,7 @@ def C2_muxii : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii, s8_0Imm:$II), "$Rd32 = mux($Pu4,#$Ii,#$II)", -tc_d6bf0472, TypeALU32_2op>, Enc_830e5d { +tc_4c5ba658, TypeALU32_2op>, Enc_830e5d { let Inst{31-25} = 0b0111101; let hasNewValue = 1; let opNewValue = 0; @@ -3728,7 +3742,7 @@ def C2_muxir : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = mux($Pu4,$Rs32,#$Ii)", -tc_d6bf0472, TypeALU32_2op>, Enc_e38e1f { +tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f { let Inst{13-13} = 0b0; let Inst{31-23} = 0b011100110; let hasNewValue = 1; @@ -3744,7 +3758,7 @@ def C2_muxri : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii, IntRegs:$Rs32), "$Rd32 = mux($Pu4,#$Ii,$Rs32)", -tc_d6bf0472, TypeALU32_2op>, Enc_e38e1f { +tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f { let Inst{13-13} = 0b0; let Inst{31-23} = 0b011100111; let hasNewValue = 1; @@ -3760,7 +3774,7 @@ def C2_not : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4), "$Pd4 = not($Ps4)", -tc_f2704b9a, TypeCR>, Enc_65d691 { +tc_de554571, TypeCR>, Enc_65d691 { let Inst{13-2} = 0b000000000000; let Inst{31-18} = 0b01101011110000; } @@ -3768,7 +3782,7 @@ def C2_or : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Pt4, PredRegs:$Ps4), "$Pd4 = or($Pt4,$Ps4)", -tc_53bc8a6a, TypeCR>, Enc_454a26 { +tc_640086b5, TypeCR>, Enc_454a26 { let Inst{7-2} = 0b000000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011001000; @@ -3777,7 +3791,7 @@ def C2_orn : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Pt4, PredRegs:$Ps4), "$Pd4 = or($Pt4,!$Ps4)", -tc_53bc8a6a, TypeCR>, Enc_454a26 { +tc_640086b5, TypeCR>, Enc_454a26 { let Inst{7-2} = 0b000000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011111000; @@ -3786,7 +3800,7 @@ def C2_pxfer_map : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4), "$Pd4 = $Ps4", -tc_53bc8a6a, TypeMAPPING> { +tc_640086b5, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -3794,7 +3808,7 @@ def C2_tfrpr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Ps4), "$Rd32 = $Ps4", -tc_cde8b071, TypeS_2op>, Enc_f5e933 { +tc_0ae0825c, TypeS_2op>, Enc_f5e933 { let Inst{13-5} = 0b000000000; let Inst{31-18} = 0b10001001010000; let hasNewValue = 1; @@ -3804,7 +3818,7 @@ def C2_tfrrp : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32), "$Pd4 = $Rs32", -tc_351fed2d, TypeS_2op>, Enc_48b75f { +tc_cfd8378a, TypeS_2op>, Enc_48b75f { let Inst{13-2} = 0b000000000000; let Inst{31-21} = 0b10000101010; } @@ -3812,7 +3826,7 @@ def C2_vitpack : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Ps4, PredRegs:$Pt4), "$Rd32 = vitpack($Ps4,$Pt4)", -tc_1b9c9ee5, TypeS_2op>, Enc_527412 { +tc_4414d8b1, TypeS_2op>, Enc_527412 { let Inst{7-5} = 0b000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b10001001000000; @@ -3824,7 +3838,7 @@ def C2_vmux : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pu4, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmux($Pu4,$Rss32,$Rtt32)", -tc_f8eeed7a, TypeALU64>, Enc_329361 { +tc_b4b5c03a, TypeALU64>, Enc_329361 { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010001000; @@ -3833,7 +3847,7 @@ def C2_xor : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4, PredRegs:$Pt4), "$Pd4 = xor($Ps4,$Pt4)", -tc_53bc8a6a, TypeCR>, Enc_284ebb { +tc_640086b5, TypeCR>, Enc_284ebb { let Inst{7-2} = 0b000000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011010000; @@ -3842,7 +3856,7 @@ def C4_addipc : HInst< (outs IntRegs:$Rd32), (ins u32_0Imm:$Ii), "$Rd32 = add(pc,#$Ii)", -tc_b9c4623f, TypeCR>, Enc_607661 { +tc_a813cf9a, TypeCR>, Enc_607661 { let Inst{6-5} = 0b00; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0110101001001001; @@ -3858,7 +3872,7 @@ def C4_and_and : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4), "$Pd4 = and($Ps4,and($Pt4,$Pu4))", -tc_481e5e5c, TypeCR>, Enc_9ac432 { +tc_b31c2e97, TypeCR>, Enc_9ac432 { let Inst{5-2} = 0b0000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011000100; @@ -3867,7 +3881,7 @@ def C4_and_andn : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4), "$Pd4 = and($Ps4,and($Pt4,!$Pu4))", -tc_481e5e5c, TypeCR>, Enc_9ac432 { +tc_b31c2e97, TypeCR>, Enc_9ac432 { let Inst{5-2} = 0b0000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011100100; @@ -3876,7 +3890,7 @@ def C4_and_or : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4), "$Pd4 = and($Ps4,or($Pt4,$Pu4))", -tc_481e5e5c, TypeCR>, Enc_9ac432 { +tc_b31c2e97, TypeCR>, Enc_9ac432 { let Inst{5-2} = 0b0000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011001100; @@ -3885,7 +3899,7 @@ def C4_and_orn : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4), "$Pd4 = and($Ps4,or($Pt4,!$Pu4))", -tc_481e5e5c, TypeCR>, Enc_9ac432 { +tc_b31c2e97, TypeCR>, Enc_9ac432 { let Inst{5-2} = 0b0000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011101100; @@ -3894,7 +3908,7 @@ def C4_cmplte : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = !cmp.gt($Rs32,$Rt32)", -tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { +tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b000100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110010010; @@ -3906,7 +3920,7 @@ def C4_cmpltei : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Pd4 = !cmp.gt($Rs32,#$Ii)", -tc_6ebb4a12, TypeALU32_2op>, Enc_bd0b33, ImmRegRel { +tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel { let Inst{4-2} = 0b100; let Inst{31-22} = 0b0111010101; let CextOpcode = "C4_cmplte"; @@ -3922,7 +3936,7 @@ def C4_cmplteu : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = !cmp.gtu($Rs32,$Rt32)", -tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { +tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b000100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110010011; @@ -3934,7 +3948,7 @@ def C4_cmplteui : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, u32_0Imm:$Ii), "$Pd4 = !cmp.gtu($Rs32,#$Ii)", -tc_6ebb4a12, TypeALU32_2op>, Enc_c0cdde, ImmRegRel { +tc_56f114f4, TypeALU32_2op>, Enc_c0cdde, ImmRegRel { let Inst{4-2} = 0b100; let Inst{31-21} = 0b01110101100; let CextOpcode = "C4_cmplteu"; @@ -3950,7 +3964,7 @@ def C4_cmpneq : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = !cmp.eq($Rs32,$Rt32)", -tc_c6aa82f7, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { +tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel { let Inst{7-2} = 0b000100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110010000; @@ -3963,7 +3977,7 @@ def C4_cmpneqi : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Pd4 = !cmp.eq($Rs32,#$Ii)", -tc_6ebb4a12, TypeALU32_2op>, Enc_bd0b33, ImmRegRel { +tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel { let Inst{4-2} = 0b100; let Inst{31-22} = 0b0111010100; let CextOpcode = "C4_cmpneq"; @@ -3979,7 +3993,7 @@ def C4_fastcorner9 : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4, PredRegs:$Pt4), "$Pd4 = fastcorner9($Ps4,$Pt4)", -tc_53bc8a6a, TypeCR>, Enc_284ebb { +tc_640086b5, TypeCR>, Enc_284ebb { let Inst{7-2} = 0b100100; let Inst{13-10} = 0b1000; let Inst{31-18} = 0b01101011000000; @@ -3988,7 +4002,7 @@ def C4_fastcorner9_not : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4, PredRegs:$Pt4), "$Pd4 = !fastcorner9($Ps4,$Pt4)", -tc_53bc8a6a, TypeCR>, Enc_284ebb { +tc_640086b5, TypeCR>, Enc_284ebb { let Inst{7-2} = 0b100100; let Inst{13-10} = 0b1000; let Inst{31-18} = 0b01101011000100; @@ -3997,7 +4011,7 @@ def C4_nbitsclr : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = !bitsclr($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111101; @@ -4006,7 +4020,7 @@ def C4_nbitsclri : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, u6_0Imm:$Ii), "$Pd4 = !bitsclr($Rs32,#$Ii)", -tc_7a830544, TypeS_2op>, Enc_5d6c34 { +tc_643b4717, TypeS_2op>, Enc_5d6c34 { let Inst{7-2} = 0b000000; let Inst{31-21} = 0b10000101101; } @@ -4014,7 +4028,7 @@ def C4_nbitsset : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = !bitsset($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111011; @@ -4023,7 +4037,7 @@ def C4_or_and : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4), "$Pd4 = or($Ps4,and($Pt4,$Pu4))", -tc_481e5e5c, TypeCR>, Enc_9ac432 { +tc_b31c2e97, TypeCR>, Enc_9ac432 { let Inst{5-2} = 0b0000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011010100; @@ -4032,7 +4046,7 @@ def C4_or_andn : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4), "$Pd4 = or($Ps4,and($Pt4,!$Pu4))", -tc_481e5e5c, TypeCR>, Enc_9ac432 { +tc_b31c2e97, TypeCR>, Enc_9ac432 { let Inst{5-2} = 0b0000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011110100; @@ -4041,7 +4055,7 @@ def C4_or_or : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4), "$Pd4 = or($Ps4,or($Pt4,$Pu4))", -tc_481e5e5c, TypeCR>, Enc_9ac432 { +tc_b31c2e97, TypeCR>, Enc_9ac432 { let Inst{5-2} = 0b0000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011011100; @@ -4050,7 +4064,7 @@ def C4_or_orn : HInst< (outs PredRegs:$Pd4), (ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4), "$Pd4 = or($Ps4,or($Pt4,!$Pu4))", -tc_481e5e5c, TypeCR>, Enc_9ac432 { +tc_b31c2e97, TypeCR>, Enc_9ac432 { let Inst{5-2} = 0b0000; let Inst{13-10} = 0b0000; let Inst{31-18} = 0b01101011111100; @@ -4059,7 +4073,7 @@ def F2_conv_d2df : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = convert_d2df($Rss32)", -tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb { +tc_3a867367, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000011; let Inst{31-21} = 0b10000000111; let isFP = 1; @@ -4069,7 +4083,7 @@ def F2_conv_d2sf : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = convert_d2sf($Rss32)", -tc_f3eaa14b, TypeS_2op>, Enc_90cd8b { +tc_3a867367, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10001000010; let hasNewValue = 1; @@ -4081,7 +4095,7 @@ def F2_conv_df2d : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = convert_df2d($Rss32)", -tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb { +tc_3a867367, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10000000111; let isFP = 1; @@ -4091,7 +4105,7 @@ def F2_conv_df2d_chop : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = convert_df2d($Rss32):chop", -tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb { +tc_3a867367, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10000000111; let isFP = 1; @@ -4101,7 +4115,7 @@ def F2_conv_df2sf : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = convert_df2sf($Rss32)", -tc_f3eaa14b, TypeS_2op>, Enc_90cd8b { +tc_3a867367, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10001000000; let hasNewValue = 1; @@ -4113,7 +4127,7 @@ def F2_conv_df2ud : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = convert_df2ud($Rss32)", -tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb { +tc_3a867367, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10000000111; let isFP = 1; @@ -4123,7 +4137,7 @@ def F2_conv_df2ud_chop : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = convert_df2ud($Rss32):chop", -tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb { +tc_3a867367, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000111; let Inst{31-21} = 0b10000000111; let isFP = 1; @@ -4133,7 +4147,7 @@ def F2_conv_df2uw : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = convert_df2uw($Rss32)", -tc_f3eaa14b, TypeS_2op>, Enc_90cd8b { +tc_3a867367, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10001000011; let hasNewValue = 1; @@ -4145,7 +4159,7 @@ def F2_conv_df2uw_chop : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = convert_df2uw($Rss32):chop", -tc_f3eaa14b, TypeS_2op>, Enc_90cd8b { +tc_3a867367, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10001000101; let hasNewValue = 1; @@ -4157,7 +4171,7 @@ def F2_conv_df2w : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = convert_df2w($Rss32)", -tc_f3eaa14b, TypeS_2op>, Enc_90cd8b { +tc_3a867367, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10001000100; let hasNewValue = 1; @@ -4169,7 +4183,7 @@ def F2_conv_df2w_chop : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = convert_df2w($Rss32):chop", -tc_f3eaa14b, TypeS_2op>, Enc_90cd8b { +tc_3a867367, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10001000111; let hasNewValue = 1; @@ -4181,7 +4195,7 @@ def F2_conv_sf2d : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = convert_sf2d($Rs32)", -tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 { +tc_3a867367, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10000100100; let isFP = 1; @@ -4191,7 +4205,7 @@ def F2_conv_sf2d_chop : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = convert_sf2d($Rs32):chop", -tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 { +tc_3a867367, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10000100100; let isFP = 1; @@ -4201,7 +4215,7 @@ def F2_conv_sf2df : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = convert_sf2df($Rs32)", -tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 { +tc_3a867367, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10000100100; let isFP = 1; @@ -4211,7 +4225,7 @@ def F2_conv_sf2ud : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = convert_sf2ud($Rs32)", -tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 { +tc_3a867367, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000011; let Inst{31-21} = 0b10000100100; let isFP = 1; @@ -4221,7 +4235,7 @@ def F2_conv_sf2ud_chop : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = convert_sf2ud($Rs32):chop", -tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 { +tc_3a867367, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000101; let Inst{31-21} = 0b10000100100; let isFP = 1; @@ -4231,7 +4245,7 @@ def F2_conv_sf2uw : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = convert_sf2uw($Rs32)", -tc_f3eaa14b, TypeS_2op>, Enc_5e2823 { +tc_3a867367, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10001011011; let hasNewValue = 1; @@ -4243,7 +4257,7 @@ def F2_conv_sf2uw_chop : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = convert_sf2uw($Rs32):chop", -tc_f3eaa14b, TypeS_2op>, Enc_5e2823 { +tc_3a867367, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10001011011; let hasNewValue = 1; @@ -4255,7 +4269,7 @@ def F2_conv_sf2w : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = convert_sf2w($Rs32)", -tc_f3eaa14b, TypeS_2op>, Enc_5e2823 { +tc_3a867367, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10001011100; let hasNewValue = 1; @@ -4267,7 +4281,7 @@ def F2_conv_sf2w_chop : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = convert_sf2w($Rs32):chop", -tc_f3eaa14b, TypeS_2op>, Enc_5e2823 { +tc_3a867367, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10001011100; let hasNewValue = 1; @@ -4279,7 +4293,7 @@ def F2_conv_ud2df : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = convert_ud2df($Rss32)", -tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb { +tc_3a867367, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000010; let Inst{31-21} = 0b10000000111; let isFP = 1; @@ -4289,7 +4303,7 @@ def F2_conv_ud2sf : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = convert_ud2sf($Rss32)", -tc_f3eaa14b, TypeS_2op>, Enc_90cd8b { +tc_3a867367, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10001000001; let hasNewValue = 1; @@ -4301,7 +4315,7 @@ def F2_conv_uw2df : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = convert_uw2df($Rs32)", -tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 { +tc_3a867367, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b10000100100; let isFP = 1; @@ -4311,7 +4325,7 @@ def F2_conv_uw2sf : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = convert_uw2sf($Rs32)", -tc_f3eaa14b, TypeS_2op>, Enc_5e2823 { +tc_3a867367, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10001011001; let hasNewValue = 1; @@ -4323,7 +4337,7 @@ def F2_conv_w2df : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = convert_w2df($Rs32)", -tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 { +tc_3a867367, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000010; let Inst{31-21} = 0b10000100100; let isFP = 1; @@ -4333,7 +4347,7 @@ def F2_conv_w2sf : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = convert_w2sf($Rs32)", -tc_f3eaa14b, TypeS_2op>, Enc_5e2823 { +tc_3a867367, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10001011010; let hasNewValue = 1; @@ -4341,11 +4355,22 @@ let opNewValue = 0; let isFP = 1; let Uses = [USR]; } +def F2_dfadd : HInst< +(outs DoubleRegs:$Rdd32), +(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), +"$Rdd32 = dfadd($Rss32,$Rtt32)", +tc_2f7c551d, TypeM>, Enc_a56825, Requires<[HasV66]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b11101000000; +let isFP = 1; +let Uses = [USR]; +} def F2_dfclass : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, u5_0Imm:$Ii), "$Pd4 = dfclass($Rss32,#$Ii)", -tc_7a830544, TypeALU64>, Enc_1f19b5 { +tc_643b4717, TypeALU64>, Enc_1f19b5 { let Inst{4-2} = 0b100; let Inst{13-10} = 0b0000; let Inst{31-21} = 0b11011100100; @@ -4356,7 +4381,7 @@ def F2_dfcmpeq : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = dfcmp.eq($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010111; @@ -4368,7 +4393,7 @@ def F2_dfcmpge : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = dfcmp.ge($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b010000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010111; @@ -4380,7 +4405,7 @@ def F2_dfcmpgt : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = dfcmp.gt($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b001000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010111; @@ -4392,7 +4417,7 @@ def F2_dfcmpuo : HInst< (outs PredRegs:$Pd4), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Pd4 = dfcmp.uo($Rss32,$Rtt32)", -tc_1e856f58, TypeALU64>, Enc_fcf7a7 { +tc_85d5d03f, TypeALU64>, Enc_fcf7a7 { let Inst{7-2} = 0b011000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010010111; @@ -4404,7 +4429,7 @@ def F2_dfimm_n : HInst< (outs DoubleRegs:$Rdd32), (ins u10_0Imm:$Ii), "$Rdd32 = dfmake(#$Ii):neg", -tc_234a11a5, TypeALU64>, Enc_e6c957 { +tc_9e313203, TypeALU64>, Enc_e6c957 { let Inst{20-16} = 0b00000; let Inst{31-22} = 0b1101100101; let prefersSlot3 = 1; @@ -4413,16 +4438,27 @@ def F2_dfimm_p : HInst< (outs DoubleRegs:$Rdd32), (ins u10_0Imm:$Ii), "$Rdd32 = dfmake(#$Ii):pos", -tc_234a11a5, TypeALU64>, Enc_e6c957 { +tc_9e313203, TypeALU64>, Enc_e6c957 { let Inst{20-16} = 0b00000; let Inst{31-22} = 0b1101100100; let prefersSlot3 = 1; } +def F2_dfsub : HInst< +(outs DoubleRegs:$Rdd32), +(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), +"$Rdd32 = dfsub($Rss32,$Rtt32)", +tc_2f7c551d, TypeM>, Enc_a56825, Requires<[HasV66]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b11101000100; +let isFP = 1; +let Uses = [USR]; +} def F2_sfadd : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = sfadd($Rs32,$Rt32)", -tc_6792d5ff, TypeM>, Enc_5ab2be { +tc_3b470976, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101011000; @@ -4436,7 +4472,7 @@ def F2_sfclass : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Pd4 = sfclass($Rs32,#$Ii)", -tc_7a830544, TypeS_2op>, Enc_83ee64 { +tc_643b4717, TypeS_2op>, Enc_83ee64 { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10000101111; @@ -4447,7 +4483,7 @@ def F2_sfcmpeq : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = sfcmp.eq($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e { let Inst{7-2} = 0b011000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111111; @@ -4459,7 +4495,7 @@ def F2_sfcmpge : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = sfcmp.ge($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111111; @@ -4471,7 +4507,7 @@ def F2_sfcmpgt : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = sfcmp.gt($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e { let Inst{7-2} = 0b100000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111111; @@ -4483,7 +4519,7 @@ def F2_sfcmpuo : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = sfcmp.uo($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e { let Inst{7-2} = 0b001000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111111; @@ -4495,7 +4531,7 @@ def F2_sffixupd : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = sffixupd($Rs32,$Rt32)", -tc_6792d5ff, TypeM>, Enc_5ab2be { +tc_3b470976, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101011110; @@ -4507,7 +4543,7 @@ def F2_sffixupn : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = sffixupn($Rs32,$Rt32)", -tc_6792d5ff, TypeM>, Enc_5ab2be { +tc_3b470976, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101011110; @@ -4519,7 +4555,7 @@ def F2_sffixupr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = sffixupr($Rs32)", -tc_f3eaa14b, TypeS_2op>, Enc_5e2823 { +tc_3a867367, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10001011101; let hasNewValue = 1; @@ -4530,7 +4566,7 @@ def F2_sffma : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += sfmpy($Rs32,$Rt32)", -tc_d580173f, TypeM>, Enc_2ae154 { +tc_a58fd5cc, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111000; @@ -4544,7 +4580,7 @@ def F2_sffma_lib : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += sfmpy($Rs32,$Rt32):lib", -tc_d580173f, TypeM>, Enc_2ae154 { +tc_a58fd5cc, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111000; @@ -4558,7 +4594,7 @@ def F2_sffma_sc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32, PredRegs:$Pu4), "$Rx32 += sfmpy($Rs32,$Rt32,$Pu4):scale", -tc_038a1342, TypeM>, Enc_437f33 { +tc_4560740b, TypeM>, Enc_437f33 { let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111011; @@ -4572,7 +4608,7 @@ def F2_sffms : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= sfmpy($Rs32,$Rt32)", -tc_d580173f, TypeM>, Enc_2ae154 { +tc_a58fd5cc, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111000; @@ -4586,7 +4622,7 @@ def F2_sffms_lib : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= sfmpy($Rs32,$Rt32):lib", -tc_d580173f, TypeM>, Enc_2ae154 { +tc_a58fd5cc, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111000; @@ -4600,7 +4636,7 @@ def F2_sfimm_n : HInst< (outs IntRegs:$Rd32), (ins u10_0Imm:$Ii), "$Rd32 = sfmake(#$Ii):neg", -tc_234a11a5, TypeALU64>, Enc_6c9440 { +tc_9e313203, TypeALU64>, Enc_6c9440 { let Inst{20-16} = 0b00000; let Inst{31-22} = 0b1101011001; let hasNewValue = 1; @@ -4611,7 +4647,7 @@ def F2_sfimm_p : HInst< (outs IntRegs:$Rd32), (ins u10_0Imm:$Ii), "$Rd32 = sfmake(#$Ii):pos", -tc_234a11a5, TypeALU64>, Enc_6c9440 { +tc_9e313203, TypeALU64>, Enc_6c9440 { let Inst{20-16} = 0b00000; let Inst{31-22} = 0b1101011000; let hasNewValue = 1; @@ -4622,7 +4658,7 @@ def F2_sfinvsqrta : HInst< (outs IntRegs:$Rd32, PredRegs:$Pe4), (ins IntRegs:$Rs32), "$Rd32,$Pe4 = sfinvsqrta($Rs32)", -tc_4d99bca9, TypeS_2op>, Enc_890909 { +tc_b8bffe55, TypeS_2op>, Enc_890909 { let Inst{13-7} = 0b0000000; let Inst{31-21} = 0b10001011111; let hasNewValue = 1; @@ -4634,7 +4670,7 @@ def F2_sfmax : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = sfmax($Rs32,$Rt32)", -tc_976ddc4f, TypeM>, Enc_5ab2be { +tc_88b4f13d, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101011100; @@ -4648,7 +4684,7 @@ def F2_sfmin : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = sfmin($Rs32,$Rt32)", -tc_976ddc4f, TypeM>, Enc_5ab2be { +tc_88b4f13d, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101011100; @@ -4662,7 +4698,7 @@ def F2_sfmpy : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = sfmpy($Rs32,$Rt32)", -tc_6792d5ff, TypeM>, Enc_5ab2be { +tc_3b470976, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101011010; @@ -4676,7 +4712,7 @@ def F2_sfrecipa : HInst< (outs IntRegs:$Rd32, PredRegs:$Pe4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32,$Pe4 = sfrecipa($Rs32,$Rt32)", -tc_9c00ce8d, TypeM>, Enc_a94f3b { +tc_2ff964b4, TypeM>, Enc_a94f3b { let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101011111; @@ -4689,7 +4725,7 @@ def F2_sfsub : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = sfsub($Rs32,$Rt32)", -tc_6792d5ff, TypeM>, Enc_5ab2be { +tc_3b470976, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101011000; @@ -4702,7 +4738,7 @@ def G4_tfrgcpp : HInst< (outs DoubleRegs:$Rdd32), (ins GuestRegs64:$Gss32), "$Rdd32 = $Gss32", -tc_6fa4db47, TypeCR>, Enc_0aa344 { +tc_0d8f5752, TypeCR>, Enc_0aa344 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01101000001; } @@ -4710,7 +4746,7 @@ def G4_tfrgcrr : HInst< (outs IntRegs:$Rd32), (ins GuestRegs:$Gs32), "$Rd32 = $Gs32", -tc_6fa4db47, TypeCR>, Enc_44271f { +tc_0d8f5752, TypeCR>, Enc_44271f { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01101010001; let hasNewValue = 1; @@ -4720,7 +4756,7 @@ def G4_tfrgpcp : HInst< (outs GuestRegs64:$Gdd32), (ins DoubleRegs:$Rss32), "$Gdd32 = $Rss32", -tc_994333cd, TypeCR>, Enc_ed5027 { +tc_bcf98408, TypeCR>, Enc_ed5027 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01100011000; let hasNewValue = 1; @@ -4730,7 +4766,7 @@ def G4_tfrgrcr : HInst< (outs GuestRegs:$Gd32), (ins IntRegs:$Rs32), "$Gd32 = $Rs32", -tc_994333cd, TypeCR>, Enc_621fba { +tc_bcf98408, TypeCR>, Enc_621fba { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b01100010000; let hasNewValue = 1; @@ -4740,7 +4776,7 @@ def J2_call : HInst< (outs), (ins a30_2Imm:$Ii), "call $Ii", -tc_a27582fa, TypeJ>, Enc_81ac1d, PredRel { +tc_4ae7b58b, TypeJ>, Enc_81ac1d, PredRel { let Inst{0-0} = 0b0; let Inst{31-25} = 0b0101101; let isCall = 1; @@ -4762,7 +4798,7 @@ def J2_callf : HInst< (outs), (ins PredRegs:$Pu4, a30_2Imm:$Ii), "if (!$Pu4) call $Ii", -tc_2f185f5c, TypeJ>, Enc_daea09, PredRel { +tc_1d81e60e, TypeJ>, Enc_daea09, PredRel { let Inst{0-0} = 0b0; let Inst{12-10} = 0b000; let Inst{21-21} = 0b1; @@ -4789,7 +4825,7 @@ def J2_callr : HInst< (outs), (ins IntRegs:$Rs32), "callr $Rs32", -tc_15411484, TypeJ>, Enc_ecbcc8 { +tc_3bd75825, TypeJ>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b01010000101; let isCall = 1; @@ -4803,7 +4839,7 @@ def J2_callrf : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4) callr $Rs32", -tc_10b97e27, TypeJ>, Enc_88d4d9 { +tc_1ad90acd, TypeJ>, Enc_88d4d9 { let Inst{7-0} = 0b00000000; let Inst{13-10} = 0b0000; let Inst{31-21} = 0b01010001001; @@ -4821,7 +4857,7 @@ def J2_callrt : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4) callr $Rs32", -tc_10b97e27, TypeJ>, Enc_88d4d9 { +tc_1ad90acd, TypeJ>, Enc_88d4d9 { let Inst{7-0} = 0b00000000; let Inst{13-10} = 0b0000; let Inst{31-21} = 0b01010001000; @@ -4838,7 +4874,7 @@ def J2_callt : HInst< (outs), (ins PredRegs:$Pu4, a30_2Imm:$Ii), "if ($Pu4) call $Ii", -tc_2f185f5c, TypeJ>, Enc_daea09, PredRel { +tc_1d81e60e, TypeJ>, Enc_daea09, PredRel { let Inst{0-0} = 0b0; let Inst{12-10} = 0b000; let Inst{21-21} = 0b0; @@ -4864,7 +4900,7 @@ def J2_endloop0 : HInst< (outs), (ins), "endloop0", -tc_52d7bbea, TypeJ> { +tc_1b6f7cec, TypeJ> { let Uses = [LC0, SA0]; let Defs = [LC0, P3, PC, USR]; let isBranch = 1; @@ -4875,7 +4911,7 @@ def J2_endloop01 : HInst< (outs), (ins), "endloop01", -tc_52d7bbea, TypeJ> { +tc_1b6f7cec, TypeJ> { let Uses = [LC0, LC1, SA0, SA1]; let Defs = [LC0, LC1, P3, PC, USR]; let isPseudo = 1; @@ -4884,7 +4920,7 @@ def J2_endloop1 : HInst< (outs), (ins), "endloop1", -tc_52d7bbea, TypeJ> { +tc_1b6f7cec, TypeJ> { let Uses = [LC1, SA1]; let Defs = [LC1, PC]; let isBranch = 1; @@ -4895,7 +4931,7 @@ def J2_jump : HInst< (outs), (ins b30_2Imm:$Ii), "jump $Ii", -tc_3669266a, TypeJ>, Enc_81ac1d, PredNewRel { +tc_ae53734a, TypeJ>, Enc_81ac1d, PredNewRel { let Inst{0-0} = 0b0; let Inst{31-25} = 0b0101100; let isTerminator = 1; @@ -4917,7 +4953,7 @@ def J2_jumpf : HInst< (outs), (ins PredRegs:$Pu4, b30_2Imm:$Ii), "if (!$Pu4) jump:nt $Ii", -tc_e9fae2d6, TypeJ>, Enc_daea09, PredNewRel { +tc_db2bce9c, TypeJ>, Enc_daea09, PredNewRel { let Inst{0-0} = 0b0; let Inst{12-10} = 0b000; let Inst{21-21} = 0b1; @@ -4943,7 +4979,7 @@ def J2_jumpf_nopred_map : HInst< (outs), (ins PredRegs:$Pu4, b15_2Imm:$Ii), "if (!$Pu4) jump $Ii", -tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60]> { +tc_db2bce9c, TypeMAPPING>, Requires<[HasV60]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -4951,7 +4987,7 @@ def J2_jumpfnew : HInst< (outs), (ins PredRegs:$Pu4, b30_2Imm:$Ii), "if (!$Pu4.new) jump:nt $Ii", -tc_a46f0df5, TypeJ>, Enc_daea09, PredNewRel { +tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel { let Inst{0-0} = 0b0; let Inst{12-10} = 0b010; let Inst{21-21} = 0b1; @@ -4978,7 +5014,7 @@ def J2_jumpfnewpt : HInst< (outs), (ins PredRegs:$Pu4, b30_2Imm:$Ii), "if (!$Pu4.new) jump:t $Ii", -tc_a46f0df5, TypeJ>, Enc_daea09, PredNewRel { +tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel { let Inst{0-0} = 0b0; let Inst{12-10} = 0b110; let Inst{21-21} = 0b1; @@ -5005,7 +5041,7 @@ def J2_jumpfpt : HInst< (outs), (ins PredRegs:$Pu4, b30_2Imm:$Ii), "if (!$Pu4) jump:t $Ii", -tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel { +tc_cd374165, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel { let Inst{0-0} = 0b0; let Inst{12-10} = 0b100; let Inst{21-21} = 0b1; @@ -5031,7 +5067,7 @@ def J2_jumpr : HInst< (outs), (ins IntRegs:$Rs32), "jumpr $Rs32", -tc_9faf76ae, TypeJ>, Enc_ecbcc8, PredNewRel { +tc_d5b7b0c1, TypeJ>, Enc_ecbcc8, PredNewRel { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b01010010100; let isTerminator = 1; @@ -5048,7 +5084,7 @@ def J2_jumprf : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4) jumpr:nt $Rs32", -tc_e0739b8c, TypeJ>, Enc_88d4d9, PredNewRel { +tc_85c9c08f, TypeJ>, Enc_88d4d9, PredNewRel { let Inst{7-0} = 0b00000000; let Inst{13-10} = 0b0000; let Inst{31-21} = 0b01010011011; @@ -5067,7 +5103,7 @@ def J2_jumprf_nopred_map : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4) jumpr $Rs32", -tc_e0739b8c, TypeMAPPING>, Requires<[HasV60]> { +tc_85c9c08f, TypeMAPPING>, Requires<[HasV60]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -5075,7 +5111,7 @@ def J2_jumprfnew : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4.new) jumpr:nt $Rs32", -tc_181af5d0, TypeJ>, Enc_88d4d9, PredNewRel { +tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel { let Inst{7-0} = 0b00000000; let Inst{13-10} = 0b0010; let Inst{31-21} = 0b01010011011; @@ -5095,7 +5131,7 @@ def J2_jumprfnewpt : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4.new) jumpr:t $Rs32", -tc_181af5d0, TypeJ>, Enc_88d4d9, PredNewRel { +tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel { let Inst{7-0} = 0b00000000; let Inst{13-10} = 0b0110; let Inst{31-21} = 0b01010011011; @@ -5115,7 +5151,7 @@ def J2_jumprfpt : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if (!$Pu4) jumpr:t $Rs32", -tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel { +tc_e78647bd, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel { let Inst{7-0} = 0b00000000; let Inst{13-10} = 0b0100; let Inst{31-21} = 0b01010011011; @@ -5134,7 +5170,7 @@ def J2_jumprgtez : HInst< (outs), (ins IntRegs:$Rs32, b13_2Imm:$Ii), "if ($Rs32>=#0) jump:nt $Ii", -tc_73043bf4, TypeCR>, Enc_0fa531 { +tc_d9d43ecb, TypeCR>, Enc_0fa531 { let Inst{0-0} = 0b0; let Inst{12-12} = 0b0; let Inst{31-22} = 0b0110000101; @@ -5152,7 +5188,7 @@ def J2_jumprgtezpt : HInst< (outs), (ins IntRegs:$Rs32, b13_2Imm:$Ii), "if ($Rs32>=#0) jump:t $Ii", -tc_73043bf4, TypeCR>, Enc_0fa531 { +tc_d9d43ecb, TypeCR>, Enc_0fa531 { let Inst{0-0} = 0b0; let Inst{12-12} = 0b1; let Inst{31-22} = 0b0110000101; @@ -5170,7 +5206,7 @@ def J2_jumprltez : HInst< (outs), (ins IntRegs:$Rs32, b13_2Imm:$Ii), "if ($Rs32<=#0) jump:nt $Ii", -tc_73043bf4, TypeCR>, Enc_0fa531 { +tc_d9d43ecb, TypeCR>, Enc_0fa531 { let Inst{0-0} = 0b0; let Inst{12-12} = 0b0; let Inst{31-22} = 0b0110000111; @@ -5188,7 +5224,7 @@ def J2_jumprltezpt : HInst< (outs), (ins IntRegs:$Rs32, b13_2Imm:$Ii), "if ($Rs32<=#0) jump:t $Ii", -tc_73043bf4, TypeCR>, Enc_0fa531 { +tc_d9d43ecb, TypeCR>, Enc_0fa531 { let Inst{0-0} = 0b0; let Inst{12-12} = 0b1; let Inst{31-22} = 0b0110000111; @@ -5206,7 +5242,7 @@ def J2_jumprnz : HInst< (outs), (ins IntRegs:$Rs32, b13_2Imm:$Ii), "if ($Rs32==#0) jump:nt $Ii", -tc_73043bf4, TypeCR>, Enc_0fa531 { +tc_d9d43ecb, TypeCR>, Enc_0fa531 { let Inst{0-0} = 0b0; let Inst{12-12} = 0b0; let Inst{31-22} = 0b0110000110; @@ -5224,7 +5260,7 @@ def J2_jumprnzpt : HInst< (outs), (ins IntRegs:$Rs32, b13_2Imm:$Ii), "if ($Rs32==#0) jump:t $Ii", -tc_73043bf4, TypeCR>, Enc_0fa531 { +tc_d9d43ecb, TypeCR>, Enc_0fa531 { let Inst{0-0} = 0b0; let Inst{12-12} = 0b1; let Inst{31-22} = 0b0110000110; @@ -5242,7 +5278,7 @@ def J2_jumprt : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4) jumpr:nt $Rs32", -tc_e0739b8c, TypeJ>, Enc_88d4d9, PredNewRel { +tc_85c9c08f, TypeJ>, Enc_88d4d9, PredNewRel { let Inst{7-0} = 0b00000000; let Inst{13-10} = 0b0000; let Inst{31-21} = 0b01010011010; @@ -5260,7 +5296,7 @@ def J2_jumprt_nopred_map : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4) jumpr $Rs32", -tc_e0739b8c, TypeMAPPING>, Requires<[HasV60]> { +tc_85c9c08f, TypeMAPPING>, Requires<[HasV60]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -5268,7 +5304,7 @@ def J2_jumprtnew : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4.new) jumpr:nt $Rs32", -tc_181af5d0, TypeJ>, Enc_88d4d9, PredNewRel { +tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel { let Inst{7-0} = 0b00000000; let Inst{13-10} = 0b0010; let Inst{31-21} = 0b01010011010; @@ -5287,7 +5323,7 @@ def J2_jumprtnewpt : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4.new) jumpr:t $Rs32", -tc_181af5d0, TypeJ>, Enc_88d4d9, PredNewRel { +tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel { let Inst{7-0} = 0b00000000; let Inst{13-10} = 0b0110; let Inst{31-21} = 0b01010011010; @@ -5306,7 +5342,7 @@ def J2_jumprtpt : HInst< (outs), (ins PredRegs:$Pu4, IntRegs:$Rs32), "if ($Pu4) jumpr:t $Rs32", -tc_97743097, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel { +tc_e78647bd, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel { let Inst{7-0} = 0b00000000; let Inst{13-10} = 0b0100; let Inst{31-21} = 0b01010011010; @@ -5324,7 +5360,7 @@ def J2_jumprz : HInst< (outs), (ins IntRegs:$Rs32, b13_2Imm:$Ii), "if ($Rs32!=#0) jump:nt $Ii", -tc_73043bf4, TypeCR>, Enc_0fa531 { +tc_d9d43ecb, TypeCR>, Enc_0fa531 { let Inst{0-0} = 0b0; let Inst{12-12} = 0b0; let Inst{31-22} = 0b0110000100; @@ -5342,7 +5378,7 @@ def J2_jumprzpt : HInst< (outs), (ins IntRegs:$Rs32, b13_2Imm:$Ii), "if ($Rs32!=#0) jump:t $Ii", -tc_73043bf4, TypeCR>, Enc_0fa531 { +tc_d9d43ecb, TypeCR>, Enc_0fa531 { let Inst{0-0} = 0b0; let Inst{12-12} = 0b1; let Inst{31-22} = 0b0110000100; @@ -5360,7 +5396,7 @@ def J2_jumpt : HInst< (outs), (ins PredRegs:$Pu4, b30_2Imm:$Ii), "if ($Pu4) jump:nt $Ii", -tc_e9fae2d6, TypeJ>, Enc_daea09, PredNewRel { +tc_db2bce9c, TypeJ>, Enc_daea09, PredNewRel { let Inst{0-0} = 0b0; let Inst{12-10} = 0b000; let Inst{21-21} = 0b0; @@ -5385,7 +5421,7 @@ def J2_jumpt_nopred_map : HInst< (outs), (ins PredRegs:$Pu4, b15_2Imm:$Ii), "if ($Pu4) jump $Ii", -tc_e9fae2d6, TypeMAPPING>, Requires<[HasV60]> { +tc_db2bce9c, TypeMAPPING>, Requires<[HasV60]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -5393,7 +5429,7 @@ def J2_jumptnew : HInst< (outs), (ins PredRegs:$Pu4, b30_2Imm:$Ii), "if ($Pu4.new) jump:nt $Ii", -tc_a46f0df5, TypeJ>, Enc_daea09, PredNewRel { +tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel { let Inst{0-0} = 0b0; let Inst{12-10} = 0b010; let Inst{21-21} = 0b0; @@ -5419,7 +5455,7 @@ def J2_jumptnewpt : HInst< (outs), (ins PredRegs:$Pu4, b30_2Imm:$Ii), "if ($Pu4.new) jump:t $Ii", -tc_a46f0df5, TypeJ>, Enc_daea09, PredNewRel { +tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel { let Inst{0-0} = 0b0; let Inst{12-10} = 0b110; let Inst{21-21} = 0b0; @@ -5445,7 +5481,7 @@ def J2_jumptpt : HInst< (outs), (ins PredRegs:$Pu4, b30_2Imm:$Ii), "if ($Pu4) jump:t $Ii", -tc_e1e99bfa, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel { +tc_cd374165, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel { let Inst{0-0} = 0b0; let Inst{12-10} = 0b100; let Inst{21-21} = 0b0; @@ -5470,7 +5506,7 @@ def J2_loop0i : HInst< (outs), (ins b30_2Imm:$Ii, u10_0Imm:$II), "loop0($Ii,#$II)", -tc_cf59f215, TypeCR>, Enc_4dc228 { +tc_a9d88b22, TypeCR>, Enc_4dc228 { let Inst{2-2} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b01101001000; @@ -5487,7 +5523,7 @@ def J2_loop0r : HInst< (outs), (ins b30_2Imm:$Ii, IntRegs:$Rs32), "loop0($Ii,$Rs32)", -tc_7934b9df, TypeCR>, Enc_864a5a { +tc_df3319ed, TypeCR>, Enc_864a5a { let Inst{2-0} = 0b000; let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; @@ -5505,7 +5541,7 @@ def J2_loop1i : HInst< (outs), (ins b30_2Imm:$Ii, u10_0Imm:$II), "loop1($Ii,#$II)", -tc_cf59f215, TypeCR>, Enc_4dc228 { +tc_a9d88b22, TypeCR>, Enc_4dc228 { let Inst{2-2} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b01101001001; @@ -5522,7 +5558,7 @@ def J2_loop1r : HInst< (outs), (ins b30_2Imm:$Ii, IntRegs:$Rs32), "loop1($Ii,$Rs32)", -tc_7934b9df, TypeCR>, Enc_864a5a { +tc_df3319ed, TypeCR>, Enc_864a5a { let Inst{2-0} = 0b000; let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; @@ -5540,7 +5576,7 @@ def J2_pause : HInst< (outs), (ins u8_0Imm:$Ii), "pause(#$Ii)", -tc_681a2300, TypeJ>, Enc_a51a9a { +tc_8d9d0154, TypeJ>, Enc_a51a9a { let Inst{1-0} = 0b00; let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; @@ -5551,7 +5587,7 @@ def J2_ploop1si : HInst< (outs), (ins b30_2Imm:$Ii, u10_0Imm:$II), "p3 = sp1loop0($Ii,#$II)", -tc_c5e2426d, TypeCR>, Enc_4dc228 { +tc_1c4528a2, TypeCR>, Enc_4dc228 { let Inst{2-2} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b01101001101; @@ -5569,7 +5605,7 @@ def J2_ploop1sr : HInst< (outs), (ins b30_2Imm:$Ii, IntRegs:$Rs32), "p3 = sp1loop0($Ii,$Rs32)", -tc_4f7cd700, TypeCR>, Enc_864a5a { +tc_32779c6f, TypeCR>, Enc_864a5a { let Inst{2-0} = 0b000; let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; @@ -5588,7 +5624,7 @@ def J2_ploop2si : HInst< (outs), (ins b30_2Imm:$Ii, u10_0Imm:$II), "p3 = sp2loop0($Ii,#$II)", -tc_c5e2426d, TypeCR>, Enc_4dc228 { +tc_1c4528a2, TypeCR>, Enc_4dc228 { let Inst{2-2} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b01101001110; @@ -5606,7 +5642,7 @@ def J2_ploop2sr : HInst< (outs), (ins b30_2Imm:$Ii, IntRegs:$Rs32), "p3 = sp2loop0($Ii,$Rs32)", -tc_4f7cd700, TypeCR>, Enc_864a5a { +tc_32779c6f, TypeCR>, Enc_864a5a { let Inst{2-0} = 0b000; let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; @@ -5625,7 +5661,7 @@ def J2_ploop3si : HInst< (outs), (ins b30_2Imm:$Ii, u10_0Imm:$II), "p3 = sp3loop0($Ii,#$II)", -tc_c5e2426d, TypeCR>, Enc_4dc228 { +tc_1c4528a2, TypeCR>, Enc_4dc228 { let Inst{2-2} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b01101001111; @@ -5643,7 +5679,7 @@ def J2_ploop3sr : HInst< (outs), (ins b30_2Imm:$Ii, IntRegs:$Rs32), "p3 = sp3loop0($Ii,$Rs32)", -tc_4f7cd700, TypeCR>, Enc_864a5a { +tc_32779c6f, TypeCR>, Enc_864a5a { let Inst{2-0} = 0b000; let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; @@ -5662,45 +5698,45 @@ def J2_trap0 : HInst< (outs), (ins u8_0Imm:$Ii), "trap0(#$Ii)", -tc_14cd4cfa, TypeJ>, Enc_a51a9a { +tc_fc3999b4, TypeJ>, Enc_a51a9a { let Inst{1-0} = 0b00; let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0101010000000000; -let hasSideEffects = 1; let isSolo = 1; +let hasSideEffects = 1; } def J2_trap1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, u8_0Imm:$Ii), "trap1($Rx32,#$Ii)", -tc_59a01ead, TypeJ>, Enc_33f8ba { +tc_b9e09e03, TypeJ>, Enc_33f8ba { let Inst{1-0} = 0b00; let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b01010100100; let hasNewValue = 1; -let hasSideEffects = 1; let opNewValue = 0; let isSolo = 1; let Uses = [GOSP]; let Defs = [GOSP, PC]; +let hasSideEffects = 1; let Constraints = "$Rx32 = $Rx32in"; } def J2_trap1_noregmap : HInst< (outs), (ins u8_0Imm:$Ii), "trap1(#$Ii)", -tc_59a01ead, TypeMAPPING> { +tc_b9e09e03, TypeMAPPING> { +let hasSideEffects = 1; let isPseudo = 1; let isCodeGenOnly = 1; -let hasSideEffects = 1; } def J4_cmpeq_f_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (!cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -5726,7 +5762,7 @@ def J4_cmpeq_f_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (!cmp.eq($Ns8.new,$Rt32)) jump:t $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -5752,7 +5788,7 @@ def J4_cmpeq_fp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b00; let Inst{31-22} = 0b0001010001; @@ -5778,7 +5814,7 @@ def J4_cmpeq_fp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b10; let Inst{31-22} = 0b0001010001; @@ -5804,7 +5840,7 @@ def J4_cmpeq_fp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b01; let Inst{31-22} = 0b0001010001; @@ -5830,7 +5866,7 @@ def J4_cmpeq_fp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b11; let Inst{31-22} = 0b0001010001; @@ -5856,7 +5892,7 @@ def J4_cmpeq_t_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -5881,7 +5917,7 @@ def J4_cmpeq_t_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (cmp.eq($Ns8.new,$Rt32)) jump:t $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -5906,7 +5942,7 @@ def J4_cmpeq_tp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b00; let Inst{31-22} = 0b0001010000; @@ -5931,7 +5967,7 @@ def J4_cmpeq_tp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b10; let Inst{31-22} = 0b0001010000; @@ -5956,7 +5992,7 @@ def J4_cmpeq_tp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b01; let Inst{31-22} = 0b0001010000; @@ -5981,7 +6017,7 @@ def J4_cmpeq_tp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b11; let Inst{31-22} = 0b0001010000; @@ -6006,7 +6042,7 @@ def J4_cmpeqi_f_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (!cmp.eq($Ns8.new,#$II)) jump:nt $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -6032,7 +6068,7 @@ def J4_cmpeqi_f_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (!cmp.eq($Ns8.new,#$II)) jump:t $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -6058,7 +6094,7 @@ def J4_cmpeqi_fp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001000001; @@ -6084,7 +6120,7 @@ def J4_cmpeqi_fp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001000001; @@ -6110,7 +6146,7 @@ def J4_cmpeqi_fp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001001001; @@ -6136,7 +6172,7 @@ def J4_cmpeqi_fp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001001001; @@ -6162,7 +6198,7 @@ def J4_cmpeqi_t_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (cmp.eq($Ns8.new,#$II)) jump:nt $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -6187,7 +6223,7 @@ def J4_cmpeqi_t_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (cmp.eq($Ns8.new,#$II)) jump:t $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -6212,7 +6248,7 @@ def J4_cmpeqi_tp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001000000; @@ -6237,7 +6273,7 @@ def J4_cmpeqi_tp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001000000; @@ -6262,7 +6298,7 @@ def J4_cmpeqi_tp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001001000; @@ -6287,7 +6323,7 @@ def J4_cmpeqi_tp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001001000; @@ -6312,7 +6348,7 @@ def J4_cmpeqn1_f_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii), "if (!cmp.eq($Ns8.new,#$n1)) jump:nt $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_e90a15, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_e90a15, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000000; let Inst{19-19} = 0b0; @@ -6338,7 +6374,7 @@ def J4_cmpeqn1_f_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii), "if (!cmp.eq($Ns8.new,#$n1)) jump:t $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_5a18b3, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_5a18b3, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100000; let Inst{19-19} = 0b0; @@ -6364,7 +6400,7 @@ def J4_cmpeqn1_fp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_1de724, PredRel { +tc_3d495a39, TypeCJ>, Enc_1de724, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000000; let Inst{31-22} = 0b0001000111; @@ -6390,7 +6426,7 @@ def J4_cmpeqn1_fp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14640c, PredRel { +tc_3d495a39, TypeCJ>, Enc_14640c, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100000; let Inst{31-22} = 0b0001000111; @@ -6416,7 +6452,7 @@ def J4_cmpeqn1_fp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_668704, PredRel { +tc_3d495a39, TypeCJ>, Enc_668704, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000000; let Inst{31-22} = 0b0001001111; @@ -6442,7 +6478,7 @@ def J4_cmpeqn1_fp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_800e04, PredRel { +tc_3d495a39, TypeCJ>, Enc_800e04, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100000; let Inst{31-22} = 0b0001001111; @@ -6468,7 +6504,7 @@ def J4_cmpeqn1_t_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii), "if (cmp.eq($Ns8.new,#$n1)) jump:nt $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_4aca3a, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_4aca3a, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000000; let Inst{19-19} = 0b0; @@ -6493,7 +6529,7 @@ def J4_cmpeqn1_t_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii), "if (cmp.eq($Ns8.new,#$n1)) jump:t $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_f7ea77, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_f7ea77, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100000; let Inst{19-19} = 0b0; @@ -6518,7 +6554,7 @@ def J4_cmpeqn1_tp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_405228, PredRel { +tc_3d495a39, TypeCJ>, Enc_405228, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000000; let Inst{31-22} = 0b0001000110; @@ -6543,7 +6579,7 @@ def J4_cmpeqn1_tp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_3a2484, PredRel { +tc_3d495a39, TypeCJ>, Enc_3a2484, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100000; let Inst{31-22} = 0b0001000110; @@ -6568,7 +6604,7 @@ def J4_cmpeqn1_tp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_736575, PredRel { +tc_3d495a39, TypeCJ>, Enc_736575, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000000; let Inst{31-22} = 0b0001001110; @@ -6593,7 +6629,7 @@ def J4_cmpeqn1_tp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_8e583a, PredRel { +tc_3d495a39, TypeCJ>, Enc_8e583a, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100000; let Inst{31-22} = 0b0001001110; @@ -6618,7 +6654,7 @@ def J4_cmpgt_f_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (!cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -6644,7 +6680,7 @@ def J4_cmpgt_f_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (!cmp.gt($Ns8.new,$Rt32)) jump:t $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -6670,7 +6706,7 @@ def J4_cmpgt_fp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b00; let Inst{31-22} = 0b0001010011; @@ -6696,7 +6732,7 @@ def J4_cmpgt_fp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b10; let Inst{31-22} = 0b0001010011; @@ -6722,7 +6758,7 @@ def J4_cmpgt_fp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b01; let Inst{31-22} = 0b0001010011; @@ -6748,7 +6784,7 @@ def J4_cmpgt_fp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b11; let Inst{31-22} = 0b0001010011; @@ -6774,7 +6810,7 @@ def J4_cmpgt_t_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -6799,7 +6835,7 @@ def J4_cmpgt_t_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (cmp.gt($Ns8.new,$Rt32)) jump:t $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -6824,7 +6860,7 @@ def J4_cmpgt_tp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b00; let Inst{31-22} = 0b0001010010; @@ -6849,7 +6885,7 @@ def J4_cmpgt_tp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b10; let Inst{31-22} = 0b0001010010; @@ -6874,7 +6910,7 @@ def J4_cmpgt_tp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b01; let Inst{31-22} = 0b0001010010; @@ -6899,7 +6935,7 @@ def J4_cmpgt_tp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b11; let Inst{31-22} = 0b0001010010; @@ -6924,7 +6960,7 @@ def J4_cmpgti_f_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (!cmp.gt($Ns8.new,#$II)) jump:nt $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -6950,7 +6986,7 @@ def J4_cmpgti_f_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (!cmp.gt($Ns8.new,#$II)) jump:t $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -6976,7 +7012,7 @@ def J4_cmpgti_fp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001000011; @@ -7002,7 +7038,7 @@ def J4_cmpgti_fp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001000011; @@ -7028,7 +7064,7 @@ def J4_cmpgti_fp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001001011; @@ -7054,7 +7090,7 @@ def J4_cmpgti_fp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001001011; @@ -7080,7 +7116,7 @@ def J4_cmpgti_t_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (cmp.gt($Ns8.new,#$II)) jump:nt $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -7105,7 +7141,7 @@ def J4_cmpgti_t_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (cmp.gt($Ns8.new,#$II)) jump:t $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -7130,7 +7166,7 @@ def J4_cmpgti_tp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001000010; @@ -7155,7 +7191,7 @@ def J4_cmpgti_tp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001000010; @@ -7180,7 +7216,7 @@ def J4_cmpgti_tp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001001010; @@ -7205,7 +7241,7 @@ def J4_cmpgti_tp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001001010; @@ -7230,7 +7266,7 @@ def J4_cmpgtn1_f_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii), "if (!cmp.gt($Ns8.new,#$n1)) jump:nt $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_3694bd, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_3694bd, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000000; let Inst{19-19} = 0b0; @@ -7256,7 +7292,7 @@ def J4_cmpgtn1_f_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii), "if (!cmp.gt($Ns8.new,#$n1)) jump:t $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_a6853f, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_a6853f, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100000; let Inst{19-19} = 0b0; @@ -7282,7 +7318,7 @@ def J4_cmpgtn1_fp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_a42857, PredRel { +tc_3d495a39, TypeCJ>, Enc_a42857, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000001; let Inst{31-22} = 0b0001000111; @@ -7308,7 +7344,7 @@ def J4_cmpgtn1_fp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_f6fe0b, PredRel { +tc_3d495a39, TypeCJ>, Enc_f6fe0b, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100001; let Inst{31-22} = 0b0001000111; @@ -7334,7 +7370,7 @@ def J4_cmpgtn1_fp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_3e3989, PredRel { +tc_3d495a39, TypeCJ>, Enc_3e3989, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000001; let Inst{31-22} = 0b0001001111; @@ -7360,7 +7396,7 @@ def J4_cmpgtn1_fp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_b909d2, PredRel { +tc_3d495a39, TypeCJ>, Enc_b909d2, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100001; let Inst{31-22} = 0b0001001111; @@ -7386,7 +7422,7 @@ def J4_cmpgtn1_t_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii), "if (cmp.gt($Ns8.new,#$n1)) jump:nt $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_f82302, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_f82302, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000000; let Inst{19-19} = 0b0; @@ -7411,7 +7447,7 @@ def J4_cmpgtn1_t_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii), "if (cmp.gt($Ns8.new,#$n1)) jump:t $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_6413b6, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_6413b6, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100000; let Inst{19-19} = 0b0; @@ -7436,7 +7472,7 @@ def J4_cmpgtn1_tp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_b78edd, PredRel { +tc_3d495a39, TypeCJ>, Enc_b78edd, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000001; let Inst{31-22} = 0b0001000110; @@ -7461,7 +7497,7 @@ def J4_cmpgtn1_tp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_041d7b, PredRel { +tc_3d495a39, TypeCJ>, Enc_041d7b, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100001; let Inst{31-22} = 0b0001000110; @@ -7486,7 +7522,7 @@ def J4_cmpgtn1_tp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_b1e1fb, PredRel { +tc_3d495a39, TypeCJ>, Enc_b1e1fb, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000001; let Inst{31-22} = 0b0001001110; @@ -7511,7 +7547,7 @@ def J4_cmpgtn1_tp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii), "p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_178717, PredRel { +tc_3d495a39, TypeCJ>, Enc_178717, PredRel { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100001; let Inst{31-22} = 0b0001001110; @@ -7536,7 +7572,7 @@ def J4_cmpgtu_f_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (!cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -7562,7 +7598,7 @@ def J4_cmpgtu_f_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (!cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -7588,7 +7624,7 @@ def J4_cmpgtu_fp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b00; let Inst{31-22} = 0b0001010101; @@ -7614,7 +7650,7 @@ def J4_cmpgtu_fp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b10; let Inst{31-22} = 0b0001010101; @@ -7640,7 +7676,7 @@ def J4_cmpgtu_fp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b01; let Inst{31-22} = 0b0001010101; @@ -7666,7 +7702,7 @@ def J4_cmpgtu_fp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b11; let Inst{31-22} = 0b0001010101; @@ -7692,7 +7728,7 @@ def J4_cmpgtu_t_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -7717,7 +7753,7 @@ def J4_cmpgtu_t_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii), "if (cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii", -tc_51b866be, TypeNCJ>, Enc_c9a18e, PredRel { +tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -7742,7 +7778,7 @@ def J4_cmpgtu_tp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b00; let Inst{31-22} = 0b0001010100; @@ -7767,7 +7803,7 @@ def J4_cmpgtu_tp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b10; let Inst{31-22} = 0b0001010100; @@ -7792,7 +7828,7 @@ def J4_cmpgtu_tp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:nt $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b01; let Inst{31-22} = 0b0001010100; @@ -7817,7 +7853,7 @@ def J4_cmpgtu_tp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii), "p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:t $Ii", -tc_855b0b61, TypeCJ>, Enc_6a5972, PredRel { +tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel { let Inst{0-0} = 0b0; let Inst{13-12} = 0b11; let Inst{31-22} = 0b0001010100; @@ -7842,7 +7878,7 @@ def J4_cmpgtui_f_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (!cmp.gtu($Ns8.new,#$II)) jump:nt $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -7868,7 +7904,7 @@ def J4_cmpgtui_f_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (!cmp.gtu($Ns8.new,#$II)) jump:t $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -7894,7 +7930,7 @@ def J4_cmpgtui_fp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001000101; @@ -7920,7 +7956,7 @@ def J4_cmpgtui_fp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001000101; @@ -7946,7 +7982,7 @@ def J4_cmpgtui_fp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001001101; @@ -7972,7 +8008,7 @@ def J4_cmpgtui_fp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001001101; @@ -7998,7 +8034,7 @@ def J4_cmpgtui_t_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (cmp.gtu($Ns8.new,#$II)) jump:nt $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -8023,7 +8059,7 @@ def J4_cmpgtui_t_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii), "if (cmp.gtu($Ns8.new,#$II)) jump:t $Ii", -tc_bde7aaf4, TypeNCJ>, Enc_eafd18, PredRel { +tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -8048,7 +8084,7 @@ def J4_cmpgtui_tp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001000100; @@ -8073,7 +8109,7 @@ def J4_cmpgtui_tp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001000100; @@ -8098,7 +8134,7 @@ def J4_cmpgtui_tp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:nt $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{31-22} = 0b0001001100; @@ -8123,7 +8159,7 @@ def J4_cmpgtui_tp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii), "p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:t $Ii", -tc_99be14ca, TypeCJ>, Enc_14d27a, PredRel { +tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{31-22} = 0b0001001100; @@ -8148,7 +8184,7 @@ def J4_cmplt_f_jumpnv_nt : HInst< (outs), (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii), "if (!cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii", -tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel { +tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -8174,7 +8210,7 @@ def J4_cmplt_f_jumpnv_t : HInst< (outs), (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii), "if (!cmp.gt($Rt32,$Ns8.new)) jump:t $Ii", -tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel { +tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -8200,7 +8236,7 @@ def J4_cmplt_t_jumpnv_nt : HInst< (outs), (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii), "if (cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii", -tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel { +tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -8225,7 +8261,7 @@ def J4_cmplt_t_jumpnv_t : HInst< (outs), (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii), "if (cmp.gt($Rt32,$Ns8.new)) jump:t $Ii", -tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel { +tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -8250,7 +8286,7 @@ def J4_cmpltu_f_jumpnv_nt : HInst< (outs), (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii), "if (!cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii", -tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel { +tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -8276,7 +8312,7 @@ def J4_cmpltu_f_jumpnv_t : HInst< (outs), (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii), "if (!cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii", -tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel { +tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -8302,7 +8338,7 @@ def J4_cmpltu_t_jumpnv_nt : HInst< (outs), (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii), "if (cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii", -tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel { +tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b0; let Inst{19-19} = 0b0; @@ -8327,7 +8363,7 @@ def J4_cmpltu_t_jumpnv_t : HInst< (outs), (ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii), "if (cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii", -tc_5eb851fc, TypeNCJ>, Enc_5de85f, PredRel { +tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel { let Inst{0-0} = 0b0; let Inst{13-13} = 0b1; let Inst{19-19} = 0b0; @@ -8352,7 +8388,7 @@ def J4_hintjumpr : HInst< (outs), (ins IntRegs:$Rs32), "hintjr($Rs32)", -tc_9faf76ae, TypeJ>, Enc_ecbcc8 { +tc_d5b7b0c1, TypeJ>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b01010010101; let isTerminator = 1; @@ -8364,7 +8400,7 @@ def J4_jumpseti : HInst< (outs GeneralSubRegs:$Rd16), (ins u6_0Imm:$II, b30_2Imm:$Ii), "$Rd16 = #$II ; jump $Ii", -tc_49eb22c8, TypeCJ>, Enc_9e4c3f { +tc_0663f615, TypeCJ>, Enc_9e4c3f { let Inst{0-0} = 0b0; let Inst{31-22} = 0b0001011000; let hasNewValue = 1; @@ -8384,7 +8420,7 @@ def J4_jumpsetr : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii), "$Rd16 = $Rs16 ; jump $Ii", -tc_49eb22c8, TypeCJ>, Enc_66bce1 { +tc_0663f615, TypeCJ>, Enc_66bce1 { let Inst{0-0} = 0b0; let Inst{13-12} = 0b00; let Inst{31-22} = 0b0001011100; @@ -8405,7 +8441,7 @@ def J4_tstbit0_f_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, b30_2Imm:$Ii), "if (!tstbit($Ns8.new,#0)) jump:nt $Ii", -tc_746baa8e, TypeNCJ>, Enc_69d63b { +tc_8c945be0, TypeNCJ>, Enc_69d63b { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000000; let Inst{19-19} = 0b0; @@ -8430,7 +8466,7 @@ def J4_tstbit0_f_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, b30_2Imm:$Ii), "if (!tstbit($Ns8.new,#0)) jump:t $Ii", -tc_746baa8e, TypeNCJ>, Enc_69d63b { +tc_8c945be0, TypeNCJ>, Enc_69d63b { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100000; let Inst{19-19} = 0b0; @@ -8455,7 +8491,7 @@ def J4_tstbit0_fp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii), "p0 = tstbit($Rs16,#0); if (!p0.new) jump:nt $Ii", -tc_3cb8ea06, TypeCJ>, Enc_ad1c74 { +tc_2332b92e, TypeCJ>, Enc_ad1c74 { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000011; let Inst{31-22} = 0b0001000111; @@ -8480,7 +8516,7 @@ def J4_tstbit0_fp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii), "p0 = tstbit($Rs16,#0); if (!p0.new) jump:t $Ii", -tc_3cb8ea06, TypeCJ>, Enc_ad1c74 { +tc_2332b92e, TypeCJ>, Enc_ad1c74 { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100011; let Inst{31-22} = 0b0001000111; @@ -8505,7 +8541,7 @@ def J4_tstbit0_fp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii), "p1 = tstbit($Rs16,#0); if (!p1.new) jump:nt $Ii", -tc_3cb8ea06, TypeCJ>, Enc_ad1c74 { +tc_2332b92e, TypeCJ>, Enc_ad1c74 { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000011; let Inst{31-22} = 0b0001001111; @@ -8530,7 +8566,7 @@ def J4_tstbit0_fp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii), "p1 = tstbit($Rs16,#0); if (!p1.new) jump:t $Ii", -tc_3cb8ea06, TypeCJ>, Enc_ad1c74 { +tc_2332b92e, TypeCJ>, Enc_ad1c74 { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100011; let Inst{31-22} = 0b0001001111; @@ -8555,7 +8591,7 @@ def J4_tstbit0_t_jumpnv_nt : HInst< (outs), (ins IntRegs:$Ns8, b30_2Imm:$Ii), "if (tstbit($Ns8.new,#0)) jump:nt $Ii", -tc_746baa8e, TypeNCJ>, Enc_69d63b { +tc_8c945be0, TypeNCJ>, Enc_69d63b { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000000; let Inst{19-19} = 0b0; @@ -8579,7 +8615,7 @@ def J4_tstbit0_t_jumpnv_t : HInst< (outs), (ins IntRegs:$Ns8, b30_2Imm:$Ii), "if (tstbit($Ns8.new,#0)) jump:t $Ii", -tc_746baa8e, TypeNCJ>, Enc_69d63b { +tc_8c945be0, TypeNCJ>, Enc_69d63b { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100000; let Inst{19-19} = 0b0; @@ -8603,7 +8639,7 @@ def J4_tstbit0_tp0_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii), "p0 = tstbit($Rs16,#0); if (p0.new) jump:nt $Ii", -tc_3cb8ea06, TypeCJ>, Enc_ad1c74 { +tc_2332b92e, TypeCJ>, Enc_ad1c74 { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000011; let Inst{31-22} = 0b0001000110; @@ -8627,7 +8663,7 @@ def J4_tstbit0_tp0_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii), "p0 = tstbit($Rs16,#0); if (p0.new) jump:t $Ii", -tc_3cb8ea06, TypeCJ>, Enc_ad1c74 { +tc_2332b92e, TypeCJ>, Enc_ad1c74 { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100011; let Inst{31-22} = 0b0001000110; @@ -8651,7 +8687,7 @@ def J4_tstbit0_tp1_jump_nt : HInst< (outs), (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii), "p1 = tstbit($Rs16,#0); if (p1.new) jump:nt $Ii", -tc_3cb8ea06, TypeCJ>, Enc_ad1c74 { +tc_2332b92e, TypeCJ>, Enc_ad1c74 { let Inst{0-0} = 0b0; let Inst{13-8} = 0b000011; let Inst{31-22} = 0b0001001110; @@ -8675,7 +8711,7 @@ def J4_tstbit0_tp1_jump_t : HInst< (outs), (ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii), "p1 = tstbit($Rs16,#0); if (p1.new) jump:t $Ii", -tc_3cb8ea06, TypeCJ>, Enc_ad1c74 { +tc_2332b92e, TypeCJ>, Enc_ad1c74 { let Inst{0-0} = 0b0; let Inst{13-8} = 0b100011; let Inst{31-22} = 0b0001001110; @@ -8699,7 +8735,7 @@ def L2_deallocframe : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = deallocframe($Rs32):raw", -tc_d1090e34, TypeLD>, Enc_3a3d62 { +tc_15aa71c5, TypeLD>, Enc_3a3d62 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10010000000; let accessSize = DoubleWordAccess; @@ -8711,7 +8747,7 @@ def L2_loadalignb_io : HInst< (outs DoubleRegs:$Ryy32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s32_0Imm:$Ii), "$Ryy32 = memb_fifo($Rs32+#$Ii)", -tc_ef52ed71, TypeLD>, Enc_a27588 { +tc_5ef37dc4, TypeLD>, Enc_a27588 { let Inst{24-21} = 0b0100; let Inst{31-27} = 0b10010; let addrMode = BaseImmOffset; @@ -8728,9 +8764,10 @@ def L2_loadalignb_pbr : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Rx32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2), "$Ryy32 = memb_fifo($Rx32++$Mu2:brev)", -tc_bad2bcaf, TypeLD>, Enc_1f5d8f { +tc_3c76b0ff, TypeLD>, Enc_1f5d8f { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011110100; +let addrMode = PostInc; let accessSize = ByteAccess; let mayLoad = 1; let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; @@ -8739,7 +8776,7 @@ def L2_loadalignb_pci : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Rx32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2), "$Ryy32 = memb_fifo($Rx32++#$Ii:circ($Mu2))", -tc_03220ffa, TypeLD>, Enc_74aef2 { +tc_785f65a7, TypeLD>, Enc_74aef2 { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011000100; let addrMode = PostInc; @@ -8752,7 +8789,7 @@ def L2_loadalignb_pcr : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Rx32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2), "$Ryy32 = memb_fifo($Rx32++I:circ($Mu2))", -tc_bad2bcaf, TypeLD>, Enc_1f5d8f { +tc_3c76b0ff, TypeLD>, Enc_1f5d8f { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011000100; let addrMode = PostInc; @@ -8765,7 +8802,7 @@ def L2_loadalignb_pi : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Rx32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii), "$Ryy32 = memb_fifo($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_6b197f { +tc_3c76b0ff, TypeLD>, Enc_6b197f { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011010100; let addrMode = PostInc; @@ -8777,7 +8814,7 @@ def L2_loadalignb_pr : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Rx32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2), "$Ryy32 = memb_fifo($Rx32++$Mu2)", -tc_bad2bcaf, TypeLD>, Enc_1f5d8f { +tc_3c76b0ff, TypeLD>, Enc_1f5d8f { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011100100; let addrMode = PostInc; @@ -8789,7 +8826,7 @@ def L2_loadalignb_zomap : HInst< (outs DoubleRegs:$Ryy32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rs32), "$Ryy32 = memb_fifo($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let Constraints = "$Ryy32 = $Ryy32in"; @@ -8798,7 +8835,7 @@ def L2_loadalignh_io : HInst< (outs DoubleRegs:$Ryy32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s31_1Imm:$Ii), "$Ryy32 = memh_fifo($Rs32+#$Ii)", -tc_ef52ed71, TypeLD>, Enc_5cd7e9 { +tc_5ef37dc4, TypeLD>, Enc_5cd7e9 { let Inst{24-21} = 0b0010; let Inst{31-27} = 0b10010; let addrMode = BaseImmOffset; @@ -8815,9 +8852,10 @@ def L2_loadalignh_pbr : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Rx32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2), "$Ryy32 = memh_fifo($Rx32++$Mu2:brev)", -tc_bad2bcaf, TypeLD>, Enc_1f5d8f { +tc_3c76b0ff, TypeLD>, Enc_1f5d8f { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011110010; +let addrMode = PostInc; let accessSize = HalfWordAccess; let mayLoad = 1; let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; @@ -8826,7 +8864,7 @@ def L2_loadalignh_pci : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Rx32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2), "$Ryy32 = memh_fifo($Rx32++#$Ii:circ($Mu2))", -tc_03220ffa, TypeLD>, Enc_9e2e1c { +tc_785f65a7, TypeLD>, Enc_9e2e1c { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011000010; let addrMode = PostInc; @@ -8839,7 +8877,7 @@ def L2_loadalignh_pcr : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Rx32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2), "$Ryy32 = memh_fifo($Rx32++I:circ($Mu2))", -tc_bad2bcaf, TypeLD>, Enc_1f5d8f { +tc_3c76b0ff, TypeLD>, Enc_1f5d8f { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011000010; let addrMode = PostInc; @@ -8852,7 +8890,7 @@ def L2_loadalignh_pi : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Rx32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii), "$Ryy32 = memh_fifo($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_bd1cbc { +tc_3c76b0ff, TypeLD>, Enc_bd1cbc { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011010010; let addrMode = PostInc; @@ -8864,7 +8902,7 @@ def L2_loadalignh_pr : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Rx32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2), "$Ryy32 = memh_fifo($Rx32++$Mu2)", -tc_bad2bcaf, TypeLD>, Enc_1f5d8f { +tc_3c76b0ff, TypeLD>, Enc_1f5d8f { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011100010; let addrMode = PostInc; @@ -8876,7 +8914,7 @@ def L2_loadalignh_zomap : HInst< (outs DoubleRegs:$Ryy32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rs32), "$Ryy32 = memh_fifo($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let Constraints = "$Ryy32 = $Ryy32in"; @@ -8885,7 +8923,7 @@ def L2_loadbsw2_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s31_1Imm:$Ii), "$Rd32 = membh($Rs32+#$Ii)", -tc_7f881c76, TypeLD>, Enc_de0214 { +tc_17e0d2cd, TypeLD>, Enc_de0214 { let Inst{24-21} = 0b0001; let Inst{31-27} = 0b10010; let hasNewValue = 1; @@ -8903,11 +8941,12 @@ def L2_loadbsw2_pbr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = membh($Rx32++$Mu2:brev)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011110001; let hasNewValue = 1; let opNewValue = 0; +let addrMode = PostInc; let accessSize = HalfWordAccess; let mayLoad = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -8916,7 +8955,7 @@ def L2_loadbsw2_pci : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2), "$Rd32 = membh($Rx32++#$Ii:circ($Mu2))", -tc_4403ca65, TypeLD>, Enc_e83554 { +tc_e93a3d71, TypeLD>, Enc_e83554 { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011000001; let hasNewValue = 1; @@ -8931,7 +8970,7 @@ def L2_loadbsw2_pcr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = membh($Rx32++I:circ($Mu2))", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011000001; let hasNewValue = 1; @@ -8946,7 +8985,7 @@ def L2_loadbsw2_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii), "$Rd32 = membh($Rx32++#$Ii)", -tc_2fc0c436, TypeLD>, Enc_152467 { +tc_44d3da28, TypeLD>, Enc_152467 { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011010001; let hasNewValue = 1; @@ -8960,7 +8999,7 @@ def L2_loadbsw2_pr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = membh($Rx32++$Mu2)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011100001; let hasNewValue = 1; @@ -8974,7 +9013,7 @@ def L2_loadbsw2_zomap : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = membh($Rs32)", -tc_7f881c76, TypeMAPPING> { +tc_17e0d2cd, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -8984,7 +9023,7 @@ def L2_loadbsw4_io : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, s30_2Imm:$Ii), "$Rdd32 = membh($Rs32+#$Ii)", -tc_7f881c76, TypeLD>, Enc_2d7491 { +tc_17e0d2cd, TypeLD>, Enc_2d7491 { let Inst{24-21} = 0b0111; let Inst{31-27} = 0b10010; let addrMode = BaseImmOffset; @@ -9000,9 +9039,10 @@ def L2_loadbsw4_pbr : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rdd32 = membh($Rx32++$Mu2:brev)", -tc_2fc0c436, TypeLD>, Enc_7eee72 { +tc_44d3da28, TypeLD>, Enc_7eee72 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011110111; +let addrMode = PostInc; let accessSize = WordAccess; let mayLoad = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -9011,7 +9051,7 @@ def L2_loadbsw4_pci : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2), "$Rdd32 = membh($Rx32++#$Ii:circ($Mu2))", -tc_4403ca65, TypeLD>, Enc_70b24b { +tc_e93a3d71, TypeLD>, Enc_70b24b { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011000111; let addrMode = PostInc; @@ -9024,7 +9064,7 @@ def L2_loadbsw4_pcr : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rdd32 = membh($Rx32++I:circ($Mu2))", -tc_2fc0c436, TypeLD>, Enc_7eee72 { +tc_44d3da28, TypeLD>, Enc_7eee72 { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011000111; let addrMode = PostInc; @@ -9037,7 +9077,7 @@ def L2_loadbsw4_pi : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_2Imm:$Ii), "$Rdd32 = membh($Rx32++#$Ii)", -tc_2fc0c436, TypeLD>, Enc_71f1b4 { +tc_44d3da28, TypeLD>, Enc_71f1b4 { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011010111; let addrMode = PostInc; @@ -9049,7 +9089,7 @@ def L2_loadbsw4_pr : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rdd32 = membh($Rx32++$Mu2)", -tc_2fc0c436, TypeLD>, Enc_7eee72 { +tc_44d3da28, TypeLD>, Enc_7eee72 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011100111; let addrMode = PostInc; @@ -9061,7 +9101,7 @@ def L2_loadbsw4_zomap : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = membh($Rs32)", -tc_7f881c76, TypeMAPPING> { +tc_17e0d2cd, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -9069,7 +9109,7 @@ def L2_loadbzw2_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s31_1Imm:$Ii), "$Rd32 = memubh($Rs32+#$Ii)", -tc_7f881c76, TypeLD>, Enc_de0214 { +tc_17e0d2cd, TypeLD>, Enc_de0214 { let Inst{24-21} = 0b0011; let Inst{31-27} = 0b10010; let hasNewValue = 1; @@ -9087,11 +9127,12 @@ def L2_loadbzw2_pbr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memubh($Rx32++$Mu2:brev)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011110011; let hasNewValue = 1; let opNewValue = 0; +let addrMode = PostInc; let accessSize = HalfWordAccess; let mayLoad = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -9100,7 +9141,7 @@ def L2_loadbzw2_pci : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2), "$Rd32 = memubh($Rx32++#$Ii:circ($Mu2))", -tc_4403ca65, TypeLD>, Enc_e83554 { +tc_e93a3d71, TypeLD>, Enc_e83554 { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011000011; let hasNewValue = 1; @@ -9115,7 +9156,7 @@ def L2_loadbzw2_pcr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memubh($Rx32++I:circ($Mu2))", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011000011; let hasNewValue = 1; @@ -9130,7 +9171,7 @@ def L2_loadbzw2_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii), "$Rd32 = memubh($Rx32++#$Ii)", -tc_2fc0c436, TypeLD>, Enc_152467 { +tc_44d3da28, TypeLD>, Enc_152467 { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011010011; let hasNewValue = 1; @@ -9144,7 +9185,7 @@ def L2_loadbzw2_pr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memubh($Rx32++$Mu2)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011100011; let hasNewValue = 1; @@ -9158,7 +9199,7 @@ def L2_loadbzw2_zomap : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = memubh($Rs32)", -tc_7f881c76, TypeMAPPING> { +tc_17e0d2cd, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -9168,7 +9209,7 @@ def L2_loadbzw4_io : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, s30_2Imm:$Ii), "$Rdd32 = memubh($Rs32+#$Ii)", -tc_7f881c76, TypeLD>, Enc_2d7491 { +tc_17e0d2cd, TypeLD>, Enc_2d7491 { let Inst{24-21} = 0b0101; let Inst{31-27} = 0b10010; let addrMode = BaseImmOffset; @@ -9184,9 +9225,10 @@ def L2_loadbzw4_pbr : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rdd32 = memubh($Rx32++$Mu2:brev)", -tc_2fc0c436, TypeLD>, Enc_7eee72 { +tc_44d3da28, TypeLD>, Enc_7eee72 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011110101; +let addrMode = PostInc; let accessSize = WordAccess; let mayLoad = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -9195,7 +9237,7 @@ def L2_loadbzw4_pci : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2), "$Rdd32 = memubh($Rx32++#$Ii:circ($Mu2))", -tc_4403ca65, TypeLD>, Enc_70b24b { +tc_e93a3d71, TypeLD>, Enc_70b24b { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011000101; let addrMode = PostInc; @@ -9208,7 +9250,7 @@ def L2_loadbzw4_pcr : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rdd32 = memubh($Rx32++I:circ($Mu2))", -tc_2fc0c436, TypeLD>, Enc_7eee72 { +tc_44d3da28, TypeLD>, Enc_7eee72 { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011000101; let addrMode = PostInc; @@ -9221,7 +9263,7 @@ def L2_loadbzw4_pi : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_2Imm:$Ii), "$Rdd32 = memubh($Rx32++#$Ii)", -tc_2fc0c436, TypeLD>, Enc_71f1b4 { +tc_44d3da28, TypeLD>, Enc_71f1b4 { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011010101; let addrMode = PostInc; @@ -9233,7 +9275,7 @@ def L2_loadbzw4_pr : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rdd32 = memubh($Rx32++$Mu2)", -tc_2fc0c436, TypeLD>, Enc_7eee72 { +tc_44d3da28, TypeLD>, Enc_7eee72 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011100101; let addrMode = PostInc; @@ -9245,7 +9287,7 @@ def L2_loadbzw4_zomap : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = memubh($Rs32)", -tc_7f881c76, TypeMAPPING> { +tc_17e0d2cd, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -9253,7 +9295,7 @@ def L2_loadrb_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = memb($Rs32+#$Ii)", -tc_7f881c76, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm { +tc_17e0d2cd, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm { let Inst{24-21} = 0b1000; let Inst{31-27} = 0b10010; let hasNewValue = 1; @@ -9274,11 +9316,12 @@ def L2_loadrb_pbr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memb($Rx32++$Mu2:brev)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011111000; let hasNewValue = 1; let opNewValue = 0; +let addrMode = PostInc; let accessSize = ByteAccess; let mayLoad = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -9287,7 +9330,7 @@ def L2_loadrb_pci : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2), "$Rd32 = memb($Rx32++#$Ii:circ($Mu2))", -tc_4403ca65, TypeLD>, Enc_e0a47a { +tc_e93a3d71, TypeLD>, Enc_e0a47a { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011001000; let hasNewValue = 1; @@ -9302,7 +9345,7 @@ def L2_loadrb_pcr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memb($Rx32++I:circ($Mu2))", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011001000; let hasNewValue = 1; @@ -9317,7 +9360,7 @@ def L2_loadrb_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_0Imm:$Ii), "$Rd32 = memb($Rx32++#$Ii)", -tc_2fc0c436, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm { +tc_44d3da28, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011011000; let hasNewValue = 1; @@ -9334,7 +9377,7 @@ def L2_loadrb_pr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memb($Rx32++$Mu2)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011101000; let hasNewValue = 1; @@ -9348,7 +9391,7 @@ def L2_loadrb_zomap : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = memb($Rs32)", -tc_7f881c76, TypeMAPPING> { +tc_17e0d2cd, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -9358,7 +9401,7 @@ def L2_loadrbgp : HInst< (outs IntRegs:$Rd32), (ins u32_0Imm:$Ii), "$Rd32 = memb(gp+#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_25bef0, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel { let Inst{24-21} = 0b1000; let Inst{31-27} = 0b01001; let hasNewValue = 1; @@ -9377,7 +9420,7 @@ def L2_loadrd_io : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, s29_3Imm:$Ii), "$Rdd32 = memd($Rs32+#$Ii)", -tc_7f881c76, TypeLD>, Enc_fa3ba4, AddrModeRel, PostInc_BaseImm { +tc_17e0d2cd, TypeLD>, Enc_fa3ba4, AddrModeRel, PostInc_BaseImm { let Inst{24-21} = 0b1110; let Inst{31-27} = 0b10010; let addrMode = BaseImmOffset; @@ -9396,9 +9439,10 @@ def L2_loadrd_pbr : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rdd32 = memd($Rx32++$Mu2:brev)", -tc_2fc0c436, TypeLD>, Enc_7eee72 { +tc_44d3da28, TypeLD>, Enc_7eee72 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011111110; +let addrMode = PostInc; let accessSize = DoubleWordAccess; let mayLoad = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -9407,7 +9451,7 @@ def L2_loadrd_pci : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2), "$Rdd32 = memd($Rx32++#$Ii:circ($Mu2))", -tc_4403ca65, TypeLD>, Enc_b05839 { +tc_e93a3d71, TypeLD>, Enc_b05839 { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011001110; let addrMode = PostInc; @@ -9420,7 +9464,7 @@ def L2_loadrd_pcr : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rdd32 = memd($Rx32++I:circ($Mu2))", -tc_2fc0c436, TypeLD>, Enc_7eee72 { +tc_44d3da28, TypeLD>, Enc_7eee72 { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011001110; let addrMode = PostInc; @@ -9433,7 +9477,7 @@ def L2_loadrd_pi : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_3Imm:$Ii), "$Rdd32 = memd($Rx32++#$Ii)", -tc_2fc0c436, TypeLD>, Enc_5bdd42, PredNewRel, PostInc_BaseImm { +tc_44d3da28, TypeLD>, Enc_5bdd42, PredNewRel, PostInc_BaseImm { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011011110; let addrMode = PostInc; @@ -9448,7 +9492,7 @@ def L2_loadrd_pr : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rdd32 = memd($Rx32++$Mu2)", -tc_2fc0c436, TypeLD>, Enc_7eee72 { +tc_44d3da28, TypeLD>, Enc_7eee72 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011101110; let addrMode = PostInc; @@ -9460,7 +9504,7 @@ def L2_loadrd_zomap : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = memd($Rs32)", -tc_7f881c76, TypeMAPPING> { +tc_17e0d2cd, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -9468,7 +9512,7 @@ def L2_loadrdgp : HInst< (outs DoubleRegs:$Rdd32), (ins u29_3Imm:$Ii), "$Rdd32 = memd(gp+#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_509701, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_509701, AddrModeRel { let Inst{24-21} = 0b1110; let Inst{31-27} = 0b01001; let accessSize = DoubleWordAccess; @@ -9485,7 +9529,7 @@ def L2_loadrh_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s31_1Imm:$Ii), "$Rd32 = memh($Rs32+#$Ii)", -tc_7f881c76, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm { +tc_17e0d2cd, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm { let Inst{24-21} = 0b1010; let Inst{31-27} = 0b10010; let hasNewValue = 1; @@ -9506,11 +9550,12 @@ def L2_loadrh_pbr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memh($Rx32++$Mu2:brev)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011111010; let hasNewValue = 1; let opNewValue = 0; +let addrMode = PostInc; let accessSize = HalfWordAccess; let mayLoad = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -9519,7 +9564,7 @@ def L2_loadrh_pci : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2), "$Rd32 = memh($Rx32++#$Ii:circ($Mu2))", -tc_4403ca65, TypeLD>, Enc_e83554 { +tc_e93a3d71, TypeLD>, Enc_e83554 { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011001010; let hasNewValue = 1; @@ -9534,7 +9579,7 @@ def L2_loadrh_pcr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memh($Rx32++I:circ($Mu2))", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011001010; let hasNewValue = 1; @@ -9549,7 +9594,7 @@ def L2_loadrh_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii), "$Rd32 = memh($Rx32++#$Ii)", -tc_2fc0c436, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm { +tc_44d3da28, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011011010; let hasNewValue = 1; @@ -9566,7 +9611,7 @@ def L2_loadrh_pr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memh($Rx32++$Mu2)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011101010; let hasNewValue = 1; @@ -9580,7 +9625,7 @@ def L2_loadrh_zomap : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = memh($Rs32)", -tc_7f881c76, TypeMAPPING> { +tc_17e0d2cd, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -9590,7 +9635,7 @@ def L2_loadrhgp : HInst< (outs IntRegs:$Rd32), (ins u31_1Imm:$Ii), "$Rd32 = memh(gp+#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_8df4be, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel { let Inst{24-21} = 0b1010; let Inst{31-27} = 0b01001; let hasNewValue = 1; @@ -9609,7 +9654,7 @@ def L2_loadri_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s30_2Imm:$Ii), "$Rd32 = memw($Rs32+#$Ii)", -tc_7f881c76, TypeLD>, Enc_2a3787, AddrModeRel, PostInc_BaseImm { +tc_17e0d2cd, TypeLD>, Enc_2a3787, AddrModeRel, PostInc_BaseImm { let Inst{24-21} = 0b1100; let Inst{31-27} = 0b10010; let hasNewValue = 1; @@ -9630,11 +9675,12 @@ def L2_loadri_pbr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memw($Rx32++$Mu2:brev)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011111100; let hasNewValue = 1; let opNewValue = 0; +let addrMode = PostInc; let accessSize = WordAccess; let mayLoad = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -9643,7 +9689,7 @@ def L2_loadri_pci : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2), "$Rd32 = memw($Rx32++#$Ii:circ($Mu2))", -tc_4403ca65, TypeLD>, Enc_27fd0e { +tc_e93a3d71, TypeLD>, Enc_27fd0e { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011001100; let hasNewValue = 1; @@ -9658,7 +9704,7 @@ def L2_loadri_pcr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memw($Rx32++I:circ($Mu2))", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011001100; let hasNewValue = 1; @@ -9673,7 +9719,7 @@ def L2_loadri_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_2Imm:$Ii), "$Rd32 = memw($Rx32++#$Ii)", -tc_2fc0c436, TypeLD>, Enc_3d920a, PredNewRel, PostInc_BaseImm { +tc_44d3da28, TypeLD>, Enc_3d920a, PredNewRel, PostInc_BaseImm { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011011100; let hasNewValue = 1; @@ -9690,7 +9736,7 @@ def L2_loadri_pr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memw($Rx32++$Mu2)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011101100; let hasNewValue = 1; @@ -9704,7 +9750,7 @@ def L2_loadri_zomap : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = memw($Rs32)", -tc_7f881c76, TypeMAPPING> { +tc_17e0d2cd, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -9714,7 +9760,7 @@ def L2_loadrigp : HInst< (outs IntRegs:$Rd32), (ins u30_2Imm:$Ii), "$Rd32 = memw(gp+#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_4f4ed7, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_4f4ed7, AddrModeRel { let Inst{24-21} = 0b1100; let Inst{31-27} = 0b01001; let hasNewValue = 1; @@ -9733,7 +9779,7 @@ def L2_loadrub_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = memub($Rs32+#$Ii)", -tc_7f881c76, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm { +tc_17e0d2cd, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm { let Inst{24-21} = 0b1001; let Inst{31-27} = 0b10010; let hasNewValue = 1; @@ -9754,11 +9800,12 @@ def L2_loadrub_pbr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memub($Rx32++$Mu2:brev)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011111001; let hasNewValue = 1; let opNewValue = 0; +let addrMode = PostInc; let accessSize = ByteAccess; let mayLoad = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -9767,7 +9814,7 @@ def L2_loadrub_pci : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2), "$Rd32 = memub($Rx32++#$Ii:circ($Mu2))", -tc_4403ca65, TypeLD>, Enc_e0a47a { +tc_e93a3d71, TypeLD>, Enc_e0a47a { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011001001; let hasNewValue = 1; @@ -9782,7 +9829,7 @@ def L2_loadrub_pcr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memub($Rx32++I:circ($Mu2))", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011001001; let hasNewValue = 1; @@ -9797,7 +9844,7 @@ def L2_loadrub_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_0Imm:$Ii), "$Rd32 = memub($Rx32++#$Ii)", -tc_2fc0c436, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm { +tc_44d3da28, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011011001; let hasNewValue = 1; @@ -9814,7 +9861,7 @@ def L2_loadrub_pr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memub($Rx32++$Mu2)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011101001; let hasNewValue = 1; @@ -9828,7 +9875,7 @@ def L2_loadrub_zomap : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = memub($Rs32)", -tc_7f881c76, TypeMAPPING> { +tc_17e0d2cd, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -9838,7 +9885,7 @@ def L2_loadrubgp : HInst< (outs IntRegs:$Rd32), (ins u32_0Imm:$Ii), "$Rd32 = memub(gp+#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_25bef0, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel { let Inst{24-21} = 0b1001; let Inst{31-27} = 0b01001; let hasNewValue = 1; @@ -9857,7 +9904,7 @@ def L2_loadruh_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s31_1Imm:$Ii), "$Rd32 = memuh($Rs32+#$Ii)", -tc_7f881c76, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm { +tc_17e0d2cd, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm { let Inst{24-21} = 0b1011; let Inst{31-27} = 0b10010; let hasNewValue = 1; @@ -9878,11 +9925,12 @@ def L2_loadruh_pbr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memuh($Rx32++$Mu2:brev)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011111011; let hasNewValue = 1; let opNewValue = 0; +let addrMode = PostInc; let accessSize = HalfWordAccess; let mayLoad = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -9891,7 +9939,7 @@ def L2_loadruh_pci : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2), "$Rd32 = memuh($Rx32++#$Ii:circ($Mu2))", -tc_4403ca65, TypeLD>, Enc_e83554 { +tc_e93a3d71, TypeLD>, Enc_e83554 { let Inst{12-9} = 0b0000; let Inst{31-21} = 0b10011001011; let hasNewValue = 1; @@ -9906,7 +9954,7 @@ def L2_loadruh_pcr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memuh($Rx32++I:circ($Mu2))", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b10011001011; let hasNewValue = 1; @@ -9921,7 +9969,7 @@ def L2_loadruh_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii), "$Rd32 = memuh($Rx32++#$Ii)", -tc_2fc0c436, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm { +tc_44d3da28, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm { let Inst{13-9} = 0b00000; let Inst{31-21} = 0b10011011011; let hasNewValue = 1; @@ -9938,7 +9986,7 @@ def L2_loadruh_pr : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Rd32 = memuh($Rx32++$Mu2)", -tc_2fc0c436, TypeLD>, Enc_74d4e5 { +tc_44d3da28, TypeLD>, Enc_74d4e5 { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b10011101011; let hasNewValue = 1; @@ -9952,7 +10000,7 @@ def L2_loadruh_zomap : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = memuh($Rs32)", -tc_7f881c76, TypeMAPPING> { +tc_17e0d2cd, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -9962,7 +10010,7 @@ def L2_loadruhgp : HInst< (outs IntRegs:$Rd32), (ins u31_1Imm:$Ii), "$Rd32 = memuh(gp+#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_8df4be, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel { let Inst{24-21} = 0b1011; let Inst{31-27} = 0b01001; let hasNewValue = 1; @@ -9981,7 +10029,7 @@ def L2_loadw_locked : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = memw_locked($Rs32)", -tc_6aa5711a, TypeLD>, Enc_5e2823 { +tc_b43e7930, TypeLD>, Enc_5e2823 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10010010000; let hasNewValue = 1; @@ -9994,7 +10042,7 @@ def L2_ploadrbf_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii), "if (!$Pt4) $Rd32 = memb($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_a21d47, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000101000; let isPredicated = 1; @@ -10016,7 +10064,7 @@ def L2_ploadrbf_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii), "if (!$Pt4) $Rd32 = memb($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_f4413a, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel { let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011011000; let isPredicated = 1; @@ -10033,7 +10081,7 @@ def L2_ploadrbf_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4) $Rd32 = memb($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10043,7 +10091,7 @@ def L2_ploadrbfnew_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii), "if (!$Pt4.new) $Rd32 = memb($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_a21d47, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000111000; let isPredicated = 1; @@ -10066,7 +10114,7 @@ def L2_ploadrbfnew_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii), "if (!$Pt4.new) $Rd32 = memb($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_f4413a, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel { let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011011000; let isPredicated = 1; @@ -10084,7 +10132,7 @@ def L2_ploadrbfnew_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4.new) $Rd32 = memb($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10094,7 +10142,7 @@ def L2_ploadrbt_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii), "if ($Pt4) $Rd32 = memb($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_a21d47, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000001000; let isPredicated = 1; @@ -10115,7 +10163,7 @@ def L2_ploadrbt_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii), "if ($Pt4) $Rd32 = memb($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_f4413a, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel { let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011011000; let isPredicated = 1; @@ -10131,7 +10179,7 @@ def L2_ploadrbt_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4) $Rd32 = memb($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10141,7 +10189,7 @@ def L2_ploadrbtnew_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii), "if ($Pt4.new) $Rd32 = memb($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_a21d47, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000011000; let isPredicated = 1; @@ -10163,7 +10211,7 @@ def L2_ploadrbtnew_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii), "if ($Pt4.new) $Rd32 = memb($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_f4413a, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel { let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011011000; let isPredicated = 1; @@ -10180,7 +10228,7 @@ def L2_ploadrbtnew_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4.new) $Rd32 = memb($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10190,7 +10238,7 @@ def L2_ploadrdf_io : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii), "if (!$Pt4) $Rdd32 = memd($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_acd6ed, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_acd6ed, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000101110; let isPredicated = 1; @@ -10210,7 +10258,7 @@ def L2_ploadrdf_pi : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii), "if (!$Pt4) $Rdd32 = memd($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_9d1247, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_9d1247, PredNewRel { let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011011110; let isPredicated = 1; @@ -10225,7 +10273,7 @@ def L2_ploadrdf_zomap : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4) $Rdd32 = memd($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -10233,7 +10281,7 @@ def L2_ploadrdfnew_io : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii), "if (!$Pt4.new) $Rdd32 = memd($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_acd6ed, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_acd6ed, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000111110; let isPredicated = 1; @@ -10254,7 +10302,7 @@ def L2_ploadrdfnew_pi : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii), "if (!$Pt4.new) $Rdd32 = memd($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_9d1247, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_9d1247, PredNewRel { let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011011110; let isPredicated = 1; @@ -10270,7 +10318,7 @@ def L2_ploadrdfnew_zomap : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4.new) $Rdd32 = memd($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -10278,7 +10326,7 @@ def L2_ploadrdt_io : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii), "if ($Pt4) $Rdd32 = memd($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_acd6ed, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_acd6ed, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000001110; let isPredicated = 1; @@ -10297,7 +10345,7 @@ def L2_ploadrdt_pi : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii), "if ($Pt4) $Rdd32 = memd($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_9d1247, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_9d1247, PredNewRel { let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011011110; let isPredicated = 1; @@ -10311,7 +10359,7 @@ def L2_ploadrdt_zomap : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4) $Rdd32 = memd($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -10319,7 +10367,7 @@ def L2_ploadrdtnew_io : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii), "if ($Pt4.new) $Rdd32 = memd($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_acd6ed, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_acd6ed, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000011110; let isPredicated = 1; @@ -10339,7 +10387,7 @@ def L2_ploadrdtnew_pi : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii), "if ($Pt4.new) $Rdd32 = memd($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_9d1247, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_9d1247, PredNewRel { let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011011110; let isPredicated = 1; @@ -10354,7 +10402,7 @@ def L2_ploadrdtnew_zomap : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4.new) $Rdd32 = memd($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -10362,7 +10410,7 @@ def L2_ploadrhf_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii), "if (!$Pt4) $Rd32 = memh($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_a198f6, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000101010; let isPredicated = 1; @@ -10384,7 +10432,7 @@ def L2_ploadrhf_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii), "if (!$Pt4) $Rd32 = memh($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_733b27, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel { let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011011010; let isPredicated = 1; @@ -10401,7 +10449,7 @@ def L2_ploadrhf_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4) $Rd32 = memh($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10411,7 +10459,7 @@ def L2_ploadrhfnew_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii), "if (!$Pt4.new) $Rd32 = memh($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_a198f6, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000111010; let isPredicated = 1; @@ -10434,7 +10482,7 @@ def L2_ploadrhfnew_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii), "if (!$Pt4.new) $Rd32 = memh($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_733b27, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel { let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011011010; let isPredicated = 1; @@ -10452,7 +10500,7 @@ def L2_ploadrhfnew_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4.new) $Rd32 = memh($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10462,7 +10510,7 @@ def L2_ploadrht_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii), "if ($Pt4) $Rd32 = memh($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_a198f6, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000001010; let isPredicated = 1; @@ -10483,7 +10531,7 @@ def L2_ploadrht_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii), "if ($Pt4) $Rd32 = memh($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_733b27, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel { let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011011010; let isPredicated = 1; @@ -10499,7 +10547,7 @@ def L2_ploadrht_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4) $Rd32 = memh($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10509,7 +10557,7 @@ def L2_ploadrhtnew_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii), "if ($Pt4.new) $Rd32 = memh($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_a198f6, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000011010; let isPredicated = 1; @@ -10531,7 +10579,7 @@ def L2_ploadrhtnew_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii), "if ($Pt4.new) $Rd32 = memh($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_733b27, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel { let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011011010; let isPredicated = 1; @@ -10548,7 +10596,7 @@ def L2_ploadrhtnew_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4.new) $Rd32 = memh($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10558,7 +10606,7 @@ def L2_ploadrif_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii), "if (!$Pt4) $Rd32 = memw($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_f82eaf, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_f82eaf, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000101100; let isPredicated = 1; @@ -10580,7 +10628,7 @@ def L2_ploadrif_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii), "if (!$Pt4) $Rd32 = memw($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_b97f71, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_b97f71, PredNewRel { let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011011100; let isPredicated = 1; @@ -10597,7 +10645,7 @@ def L2_ploadrif_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4) $Rd32 = memw($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10607,7 +10655,7 @@ def L2_ploadrifnew_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii), "if (!$Pt4.new) $Rd32 = memw($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_f82eaf, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_f82eaf, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000111100; let isPredicated = 1; @@ -10630,7 +10678,7 @@ def L2_ploadrifnew_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii), "if (!$Pt4.new) $Rd32 = memw($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_b97f71, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_b97f71, PredNewRel { let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011011100; let isPredicated = 1; @@ -10648,7 +10696,7 @@ def L2_ploadrifnew_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4.new) $Rd32 = memw($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10658,7 +10706,7 @@ def L2_ploadrit_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii), "if ($Pt4) $Rd32 = memw($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_f82eaf, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_f82eaf, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000001100; let isPredicated = 1; @@ -10679,7 +10727,7 @@ def L2_ploadrit_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii), "if ($Pt4) $Rd32 = memw($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_b97f71, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_b97f71, PredNewRel { let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011011100; let isPredicated = 1; @@ -10695,7 +10743,7 @@ def L2_ploadrit_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4) $Rd32 = memw($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10705,7 +10753,7 @@ def L2_ploadritnew_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii), "if ($Pt4.new) $Rd32 = memw($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_f82eaf, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_f82eaf, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000011100; let isPredicated = 1; @@ -10727,7 +10775,7 @@ def L2_ploadritnew_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii), "if ($Pt4.new) $Rd32 = memw($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_b97f71, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_b97f71, PredNewRel { let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011011100; let isPredicated = 1; @@ -10744,7 +10792,7 @@ def L2_ploadritnew_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4.new) $Rd32 = memw($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10754,7 +10802,7 @@ def L2_ploadrubf_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii), "if (!$Pt4) $Rd32 = memub($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_a21d47, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000101001; let isPredicated = 1; @@ -10776,7 +10824,7 @@ def L2_ploadrubf_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii), "if (!$Pt4) $Rd32 = memub($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_f4413a, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel { let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011011001; let isPredicated = 1; @@ -10793,7 +10841,7 @@ def L2_ploadrubf_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4) $Rd32 = memub($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10803,7 +10851,7 @@ def L2_ploadrubfnew_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii), "if (!$Pt4.new) $Rd32 = memub($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_a21d47, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000111001; let isPredicated = 1; @@ -10826,7 +10874,7 @@ def L2_ploadrubfnew_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii), "if (!$Pt4.new) $Rd32 = memub($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_f4413a, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel { let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011011001; let isPredicated = 1; @@ -10844,7 +10892,7 @@ def L2_ploadrubfnew_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4.new) $Rd32 = memub($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10854,7 +10902,7 @@ def L2_ploadrubt_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii), "if ($Pt4) $Rd32 = memub($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_a21d47, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000001001; let isPredicated = 1; @@ -10875,7 +10923,7 @@ def L2_ploadrubt_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii), "if ($Pt4) $Rd32 = memub($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_f4413a, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel { let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011011001; let isPredicated = 1; @@ -10891,7 +10939,7 @@ def L2_ploadrubt_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4) $Rd32 = memub($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10901,7 +10949,7 @@ def L2_ploadrubtnew_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii), "if ($Pt4.new) $Rd32 = memub($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_a21d47, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000011001; let isPredicated = 1; @@ -10923,7 +10971,7 @@ def L2_ploadrubtnew_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii), "if ($Pt4.new) $Rd32 = memub($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_f4413a, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel { let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011011001; let isPredicated = 1; @@ -10940,7 +10988,7 @@ def L2_ploadrubtnew_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4.new) $Rd32 = memub($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10950,7 +10998,7 @@ def L2_ploadruhf_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii), "if (!$Pt4) $Rd32 = memuh($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_a198f6, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000101011; let isPredicated = 1; @@ -10972,7 +11020,7 @@ def L2_ploadruhf_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii), "if (!$Pt4) $Rd32 = memuh($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_733b27, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel { let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011011011; let isPredicated = 1; @@ -10989,7 +11037,7 @@ def L2_ploadruhf_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4) $Rd32 = memuh($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -10999,7 +11047,7 @@ def L2_ploadruhfnew_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii), "if (!$Pt4.new) $Rd32 = memuh($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_a198f6, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000111011; let isPredicated = 1; @@ -11022,7 +11070,7 @@ def L2_ploadruhfnew_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii), "if (!$Pt4.new) $Rd32 = memuh($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_733b27, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel { let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011011011; let isPredicated = 1; @@ -11040,7 +11088,7 @@ def L2_ploadruhfnew_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if (!$Pt4.new) $Rd32 = memuh($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -11050,7 +11098,7 @@ def L2_ploadruht_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii), "if ($Pt4) $Rd32 = memuh($Rs32+#$Ii)", -tc_ef52ed71, TypeV2LDST>, Enc_a198f6, AddrModeRel { +tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000001011; let isPredicated = 1; @@ -11071,7 +11119,7 @@ def L2_ploadruht_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii), "if ($Pt4) $Rd32 = memuh($Rx32++#$Ii)", -tc_bad2bcaf, TypeLD>, Enc_733b27, PredNewRel { +tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel { let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011011011; let isPredicated = 1; @@ -11087,7 +11135,7 @@ def L2_ploadruht_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4) $Rd32 = memuh($Rs32)", -tc_ef52ed71, TypeMAPPING> { +tc_5ef37dc4, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -11097,7 +11145,7 @@ def L2_ploadruhtnew_io : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii), "if ($Pt4.new) $Rd32 = memuh($Rs32+#$Ii)", -tc_2fc0c436, TypeV2LDST>, Enc_a198f6, AddrModeRel { +tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b01000011011; let isPredicated = 1; @@ -11119,7 +11167,7 @@ def L2_ploadruhtnew_pi : HInst< (outs IntRegs:$Rd32, IntRegs:$Rx32), (ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii), "if ($Pt4.new) $Rd32 = memuh($Rx32++#$Ii)", -tc_63fe3df7, TypeLD>, Enc_733b27, PredNewRel { +tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel { let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011011011; let isPredicated = 1; @@ -11136,7 +11184,7 @@ def L2_ploadruhtnew_zomap : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, IntRegs:$Rs32), "if ($Pt4.new) $Rd32 = memuh($Rs32)", -tc_2fc0c436, TypeMAPPING> { +tc_44d3da28, TypeMAPPING> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -11146,7 +11194,7 @@ def L4_add_memopb_io : HInst< (outs), (ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32), "memb($Rs32+#$Ii) += $Rt32", -tc_44126683, TypeV4LDST>, Enc_d44e31 { +tc_7186d325, TypeV4LDST>, Enc_d44e31 { let Inst{6-5} = 0b00; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110000; @@ -11165,7 +11213,7 @@ def L4_add_memopb_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memb($Rs32) += $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11173,7 +11221,7 @@ def L4_add_memoph_io : HInst< (outs), (ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "memh($Rs32+#$Ii) += $Rt32", -tc_44126683, TypeV4LDST>, Enc_163a3c { +tc_7186d325, TypeV4LDST>, Enc_163a3c { let Inst{6-5} = 0b00; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110001; @@ -11192,7 +11240,7 @@ def L4_add_memoph_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memh($Rs32) += $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11200,7 +11248,7 @@ def L4_add_memopw_io : HInst< (outs), (ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32), "memw($Rs32+#$Ii) += $Rt32", -tc_44126683, TypeV4LDST>, Enc_226535 { +tc_7186d325, TypeV4LDST>, Enc_226535 { let Inst{6-5} = 0b00; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110010; @@ -11219,7 +11267,7 @@ def L4_add_memopw_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memw($Rs32) += $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11227,7 +11275,7 @@ def L4_and_memopb_io : HInst< (outs), (ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32), "memb($Rs32+#$Ii) &= $Rt32", -tc_44126683, TypeV4LDST>, Enc_d44e31 { +tc_7186d325, TypeV4LDST>, Enc_d44e31 { let Inst{6-5} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110000; @@ -11246,7 +11294,7 @@ def L4_and_memopb_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memb($Rs32) &= $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11254,7 +11302,7 @@ def L4_and_memoph_io : HInst< (outs), (ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "memh($Rs32+#$Ii) &= $Rt32", -tc_44126683, TypeV4LDST>, Enc_163a3c { +tc_7186d325, TypeV4LDST>, Enc_163a3c { let Inst{6-5} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110001; @@ -11273,7 +11321,7 @@ def L4_and_memoph_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memh($Rs32) &= $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11281,7 +11329,7 @@ def L4_and_memopw_io : HInst< (outs), (ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32), "memw($Rs32+#$Ii) &= $Rt32", -tc_44126683, TypeV4LDST>, Enc_226535 { +tc_7186d325, TypeV4LDST>, Enc_226535 { let Inst{6-5} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110010; @@ -11300,7 +11348,7 @@ def L4_and_memopw_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memw($Rs32) &= $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11308,7 +11356,7 @@ def L4_iadd_memopb_io : HInst< (outs), (ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II), "memb($Rs32+#$Ii) += #$II", -tc_44126683, TypeV4LDST>, Enc_46c951 { +tc_096199d3, TypeV4LDST>, Enc_46c951 { let Inst{6-5} = 0b00; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111000; @@ -11327,7 +11375,7 @@ def L4_iadd_memopb_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memb($Rs32) += #$II", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11335,7 +11383,7 @@ def L4_iadd_memoph_io : HInst< (outs), (ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II), "memh($Rs32+#$Ii) += #$II", -tc_44126683, TypeV4LDST>, Enc_e66a97 { +tc_096199d3, TypeV4LDST>, Enc_e66a97 { let Inst{6-5} = 0b00; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111001; @@ -11354,7 +11402,7 @@ def L4_iadd_memoph_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memh($Rs32) += #$II", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11362,7 +11410,7 @@ def L4_iadd_memopw_io : HInst< (outs), (ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II), "memw($Rs32+#$Ii) += #$II", -tc_44126683, TypeV4LDST>, Enc_84b2cd { +tc_096199d3, TypeV4LDST>, Enc_84b2cd { let Inst{6-5} = 0b00; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111010; @@ -11381,7 +11429,7 @@ def L4_iadd_memopw_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memw($Rs32) += #$II", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11389,7 +11437,7 @@ def L4_iand_memopb_io : HInst< (outs), (ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II), "memb($Rs32+#$Ii) = clrbit(#$II)", -tc_44126683, TypeV4LDST>, Enc_46c951 { +tc_096199d3, TypeV4LDST>, Enc_46c951 { let Inst{6-5} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111000; @@ -11408,7 +11456,7 @@ def L4_iand_memopb_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memb($Rs32) = clrbit(#$II)", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11416,7 +11464,7 @@ def L4_iand_memoph_io : HInst< (outs), (ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II), "memh($Rs32+#$Ii) = clrbit(#$II)", -tc_44126683, TypeV4LDST>, Enc_e66a97 { +tc_096199d3, TypeV4LDST>, Enc_e66a97 { let Inst{6-5} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111001; @@ -11435,7 +11483,7 @@ def L4_iand_memoph_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memh($Rs32) = clrbit(#$II)", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11443,7 +11491,7 @@ def L4_iand_memopw_io : HInst< (outs), (ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II), "memw($Rs32+#$Ii) = clrbit(#$II)", -tc_44126683, TypeV4LDST>, Enc_84b2cd { +tc_096199d3, TypeV4LDST>, Enc_84b2cd { let Inst{6-5} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111010; @@ -11462,7 +11510,7 @@ def L4_iand_memopw_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memw($Rs32) = clrbit(#$II)", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11470,7 +11518,7 @@ def L4_ior_memopb_io : HInst< (outs), (ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II), "memb($Rs32+#$Ii) = setbit(#$II)", -tc_44126683, TypeV4LDST>, Enc_46c951 { +tc_096199d3, TypeV4LDST>, Enc_46c951 { let Inst{6-5} = 0b11; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111000; @@ -11489,7 +11537,7 @@ def L4_ior_memopb_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memb($Rs32) = setbit(#$II)", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11497,7 +11545,7 @@ def L4_ior_memoph_io : HInst< (outs), (ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II), "memh($Rs32+#$Ii) = setbit(#$II)", -tc_44126683, TypeV4LDST>, Enc_e66a97 { +tc_096199d3, TypeV4LDST>, Enc_e66a97 { let Inst{6-5} = 0b11; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111001; @@ -11516,7 +11564,7 @@ def L4_ior_memoph_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memh($Rs32) = setbit(#$II)", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11524,7 +11572,7 @@ def L4_ior_memopw_io : HInst< (outs), (ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II), "memw($Rs32+#$Ii) = setbit(#$II)", -tc_44126683, TypeV4LDST>, Enc_84b2cd { +tc_096199d3, TypeV4LDST>, Enc_84b2cd { let Inst{6-5} = 0b11; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111010; @@ -11543,7 +11591,7 @@ def L4_ior_memopw_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memw($Rs32) = setbit(#$II)", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11551,7 +11599,7 @@ def L4_isub_memopb_io : HInst< (outs), (ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II), "memb($Rs32+#$Ii) -= #$II", -tc_44126683, TypeV4LDST>, Enc_46c951 { +tc_096199d3, TypeV4LDST>, Enc_46c951 { let Inst{6-5} = 0b01; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111000; @@ -11570,7 +11618,7 @@ def L4_isub_memopb_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memb($Rs32) -= #$II", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11578,7 +11626,7 @@ def L4_isub_memoph_io : HInst< (outs), (ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II), "memh($Rs32+#$Ii) -= #$II", -tc_44126683, TypeV4LDST>, Enc_e66a97 { +tc_096199d3, TypeV4LDST>, Enc_e66a97 { let Inst{6-5} = 0b01; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111001; @@ -11597,7 +11645,7 @@ def L4_isub_memoph_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memh($Rs32) -= #$II", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11605,7 +11653,7 @@ def L4_isub_memopw_io : HInst< (outs), (ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II), "memw($Rs32+#$Ii) -= #$II", -tc_44126683, TypeV4LDST>, Enc_84b2cd { +tc_096199d3, TypeV4LDST>, Enc_84b2cd { let Inst{6-5} = 0b01; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111111010; @@ -11624,7 +11672,7 @@ def L4_isub_memopw_zomap : HInst< (outs), (ins IntRegs:$Rs32, u5_0Imm:$II), "memw($Rs32) -= #$II", -tc_44126683, TypeMAPPING> { +tc_096199d3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -11632,7 +11680,7 @@ def L4_loadalignb_ap : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Re32), (ins DoubleRegs:$Ryy32in, u32_0Imm:$II), "$Ryy32 = memb_fifo($Re32=#$II)", -tc_5acef64a, TypeLD>, Enc_f394d3 { +tc_7a91e76a, TypeLD>, Enc_f394d3 { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011010100; @@ -11652,7 +11700,7 @@ def L4_loadalignb_ur : HInst< (outs DoubleRegs:$Ryy32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Ryy32 = memb_fifo($Rt32<<#$Ii+#$II)", -tc_0cd51c76, TypeLD>, Enc_04c959 { +tc_a5d4aeec, TypeLD>, Enc_04c959 { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011100100; let addrMode = BaseLongOffset; @@ -11672,7 +11720,7 @@ def L4_loadalignh_ap : HInst< (outs DoubleRegs:$Ryy32, IntRegs:$Re32), (ins DoubleRegs:$Ryy32in, u32_0Imm:$II), "$Ryy32 = memh_fifo($Re32=#$II)", -tc_5acef64a, TypeLD>, Enc_f394d3 { +tc_7a91e76a, TypeLD>, Enc_f394d3 { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011010010; @@ -11692,7 +11740,7 @@ def L4_loadalignh_ur : HInst< (outs DoubleRegs:$Ryy32), (ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Ryy32 = memh_fifo($Rt32<<#$Ii+#$II)", -tc_0cd51c76, TypeLD>, Enc_04c959 { +tc_a5d4aeec, TypeLD>, Enc_04c959 { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011100010; let addrMode = BaseLongOffset; @@ -11712,7 +11760,7 @@ def L4_loadbsw2_ap : HInst< (outs IntRegs:$Rd32, IntRegs:$Re32), (ins u32_0Imm:$II), "$Rd32 = membh($Re32=#$II)", -tc_b77c481f, TypeLD>, Enc_323f2d { +tc_3b5b7ef9, TypeLD>, Enc_323f2d { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011010001; @@ -11733,7 +11781,7 @@ def L4_loadbsw2_ur : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Rd32 = membh($Rt32<<#$Ii+#$II)", -tc_cf47a43f, TypeLD>, Enc_4f677b { +tc_bab0eed9, TypeLD>, Enc_4f677b { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011100001; let hasNewValue = 1; @@ -11754,7 +11802,7 @@ def L4_loadbsw4_ap : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Re32), (ins u32_0Imm:$II), "$Rdd32 = membh($Re32=#$II)", -tc_b77c481f, TypeLD>, Enc_7fa7f6 { +tc_3b5b7ef9, TypeLD>, Enc_7fa7f6 { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011010111; @@ -11773,7 +11821,7 @@ def L4_loadbsw4_ur : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Rdd32 = membh($Rt32<<#$Ii+#$II)", -tc_cf47a43f, TypeLD>, Enc_6185fe { +tc_bab0eed9, TypeLD>, Enc_6185fe { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011100111; let addrMode = BaseLongOffset; @@ -11792,7 +11840,7 @@ def L4_loadbzw2_ap : HInst< (outs IntRegs:$Rd32, IntRegs:$Re32), (ins u32_0Imm:$II), "$Rd32 = memubh($Re32=#$II)", -tc_b77c481f, TypeLD>, Enc_323f2d { +tc_3b5b7ef9, TypeLD>, Enc_323f2d { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011010011; @@ -11813,7 +11861,7 @@ def L4_loadbzw2_ur : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Rd32 = memubh($Rt32<<#$Ii+#$II)", -tc_cf47a43f, TypeLD>, Enc_4f677b { +tc_bab0eed9, TypeLD>, Enc_4f677b { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011100011; let hasNewValue = 1; @@ -11834,7 +11882,7 @@ def L4_loadbzw4_ap : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Re32), (ins u32_0Imm:$II), "$Rdd32 = memubh($Re32=#$II)", -tc_b77c481f, TypeLD>, Enc_7fa7f6 { +tc_3b5b7ef9, TypeLD>, Enc_7fa7f6 { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011010101; @@ -11853,7 +11901,7 @@ def L4_loadbzw4_ur : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Rdd32 = memubh($Rt32<<#$Ii+#$II)", -tc_cf47a43f, TypeLD>, Enc_6185fe { +tc_bab0eed9, TypeLD>, Enc_6185fe { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011100101; let addrMode = BaseLongOffset; @@ -11872,7 +11920,7 @@ def L4_loadd_locked : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = memd_locked($Rs32)", -tc_6aa5711a, TypeLD>, Enc_3a3d62 { +tc_b43e7930, TypeLD>, Enc_3a3d62 { let Inst{13-5} = 0b010000000; let Inst{31-21} = 0b10010010000; let accessSize = DoubleWordAccess; @@ -11883,7 +11931,7 @@ def L4_loadrb_ap : HInst< (outs IntRegs:$Rd32, IntRegs:$Re32), (ins u32_0Imm:$II), "$Rd32 = memb($Re32=#$II)", -tc_b77c481f, TypeLD>, Enc_323f2d { +tc_3b5b7ef9, TypeLD>, Enc_323f2d { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011011000; @@ -11904,7 +11952,7 @@ def L4_loadrb_rr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "$Rd32 = memb($Rs32+$Rt32<<#$Ii)", -tc_f47d212f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl { +tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl { let Inst{6-5} = 0b00; let Inst{31-21} = 0b00111010000; let hasNewValue = 1; @@ -11921,7 +11969,7 @@ def L4_loadrb_ur : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Rd32 = memb($Rt32<<#$Ii+#$II)", -tc_cf47a43f, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl { +tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011101000; let hasNewValue = 1; @@ -11943,7 +11991,7 @@ def L4_loadrd_ap : HInst< (outs DoubleRegs:$Rdd32, IntRegs:$Re32), (ins u32_0Imm:$II), "$Rdd32 = memd($Re32=#$II)", -tc_b77c481f, TypeLD>, Enc_7fa7f6 { +tc_3b5b7ef9, TypeLD>, Enc_7fa7f6 { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011011110; @@ -11962,7 +12010,7 @@ def L4_loadrd_rr : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "$Rdd32 = memd($Rs32+$Rt32<<#$Ii)", -tc_f47d212f, TypeLD>, Enc_84bff1, AddrModeRel, ImmRegShl { +tc_bf061958, TypeLD>, Enc_84bff1, AddrModeRel, ImmRegShl { let Inst{6-5} = 0b00; let Inst{31-21} = 0b00111010110; let addrMode = BaseRegOffset; @@ -11977,7 +12025,7 @@ def L4_loadrd_ur : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Rdd32 = memd($Rt32<<#$Ii+#$II)", -tc_cf47a43f, TypeLD>, Enc_6185fe, AddrModeRel, ImmRegShl { +tc_bab0eed9, TypeLD>, Enc_6185fe, AddrModeRel, ImmRegShl { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011101110; let addrMode = BaseLongOffset; @@ -11997,7 +12045,7 @@ def L4_loadrh_ap : HInst< (outs IntRegs:$Rd32, IntRegs:$Re32), (ins u32_0Imm:$II), "$Rd32 = memh($Re32=#$II)", -tc_b77c481f, TypeLD>, Enc_323f2d { +tc_3b5b7ef9, TypeLD>, Enc_323f2d { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011011010; @@ -12018,7 +12066,7 @@ def L4_loadrh_rr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "$Rd32 = memh($Rs32+$Rt32<<#$Ii)", -tc_f47d212f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl { +tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl { let Inst{6-5} = 0b00; let Inst{31-21} = 0b00111010010; let hasNewValue = 1; @@ -12035,7 +12083,7 @@ def L4_loadrh_ur : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Rd32 = memh($Rt32<<#$Ii+#$II)", -tc_cf47a43f, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl { +tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011101010; let hasNewValue = 1; @@ -12057,7 +12105,7 @@ def L4_loadri_ap : HInst< (outs IntRegs:$Rd32, IntRegs:$Re32), (ins u32_0Imm:$II), "$Rd32 = memw($Re32=#$II)", -tc_b77c481f, TypeLD>, Enc_323f2d { +tc_3b5b7ef9, TypeLD>, Enc_323f2d { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011011100; @@ -12078,7 +12126,7 @@ def L4_loadri_rr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "$Rd32 = memw($Rs32+$Rt32<<#$Ii)", -tc_f47d212f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl { +tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl { let Inst{6-5} = 0b00; let Inst{31-21} = 0b00111010100; let hasNewValue = 1; @@ -12095,7 +12143,7 @@ def L4_loadri_ur : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Rd32 = memw($Rt32<<#$Ii+#$II)", -tc_cf47a43f, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl { +tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011101100; let hasNewValue = 1; @@ -12117,7 +12165,7 @@ def L4_loadrub_ap : HInst< (outs IntRegs:$Rd32, IntRegs:$Re32), (ins u32_0Imm:$II), "$Rd32 = memub($Re32=#$II)", -tc_b77c481f, TypeLD>, Enc_323f2d { +tc_3b5b7ef9, TypeLD>, Enc_323f2d { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011011001; @@ -12138,7 +12186,7 @@ def L4_loadrub_rr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "$Rd32 = memub($Rs32+$Rt32<<#$Ii)", -tc_f47d212f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl { +tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl { let Inst{6-5} = 0b00; let Inst{31-21} = 0b00111010001; let hasNewValue = 1; @@ -12155,7 +12203,7 @@ def L4_loadrub_ur : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Rd32 = memub($Rt32<<#$Ii+#$II)", -tc_cf47a43f, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl { +tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011101001; let hasNewValue = 1; @@ -12177,7 +12225,7 @@ def L4_loadruh_ap : HInst< (outs IntRegs:$Rd32, IntRegs:$Re32), (ins u32_0Imm:$II), "$Rd32 = memuh($Re32=#$II)", -tc_b77c481f, TypeLD>, Enc_323f2d { +tc_3b5b7ef9, TypeLD>, Enc_323f2d { let Inst{7-7} = 0b0; let Inst{13-12} = 0b01; let Inst{31-21} = 0b10011011011; @@ -12198,7 +12246,7 @@ def L4_loadruh_rr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "$Rd32 = memuh($Rs32+$Rt32<<#$Ii)", -tc_f47d212f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl { +tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl { let Inst{6-5} = 0b00; let Inst{31-21} = 0b00111010011; let hasNewValue = 1; @@ -12215,7 +12263,7 @@ def L4_loadruh_ur : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II), "$Rd32 = memuh($Rt32<<#$Ii+#$II)", -tc_cf47a43f, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl { +tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl { let Inst{12-12} = 0b1; let Inst{31-21} = 0b10011101011; let hasNewValue = 1; @@ -12237,7 +12285,7 @@ def L4_or_memopb_io : HInst< (outs), (ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32), "memb($Rs32+#$Ii) |= $Rt32", -tc_44126683, TypeV4LDST>, Enc_d44e31 { +tc_7186d325, TypeV4LDST>, Enc_d44e31 { let Inst{6-5} = 0b11; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110000; @@ -12256,7 +12304,7 @@ def L4_or_memopb_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memb($Rs32) |= $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -12264,7 +12312,7 @@ def L4_or_memoph_io : HInst< (outs), (ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "memh($Rs32+#$Ii) |= $Rt32", -tc_44126683, TypeV4LDST>, Enc_163a3c { +tc_7186d325, TypeV4LDST>, Enc_163a3c { let Inst{6-5} = 0b11; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110001; @@ -12283,7 +12331,7 @@ def L4_or_memoph_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memh($Rs32) |= $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -12291,7 +12339,7 @@ def L4_or_memopw_io : HInst< (outs), (ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32), "memw($Rs32+#$Ii) |= $Rt32", -tc_44126683, TypeV4LDST>, Enc_226535 { +tc_7186d325, TypeV4LDST>, Enc_226535 { let Inst{6-5} = 0b11; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110010; @@ -12310,7 +12358,7 @@ def L4_or_memopw_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memw($Rs32) |= $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -12318,7 +12366,7 @@ def L4_ploadrbf_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4) $Rd32 = memb(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011111000; @@ -12343,7 +12391,7 @@ def L4_ploadrbf_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110001000; let isPredicated = 1; let isPredicatedFalse = 1; @@ -12360,7 +12408,7 @@ def L4_ploadrbfnew_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4.new) $Rd32 = memb(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011111000; @@ -12386,7 +12434,7 @@ def L4_ploadrbfnew_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel { +tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110011000; let isPredicated = 1; let isPredicatedFalse = 1; @@ -12404,7 +12452,7 @@ def L4_ploadrbt_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4) $Rd32 = memb(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011111000; @@ -12428,7 +12476,7 @@ def L4_ploadrbt_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110000000; let isPredicated = 1; let hasNewValue = 1; @@ -12444,7 +12492,7 @@ def L4_ploadrbtnew_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4.new) $Rd32 = memb(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011111000; @@ -12469,7 +12517,7 @@ def L4_ploadrbtnew_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel { +tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110010000; let isPredicated = 1; let hasNewValue = 1; @@ -12486,7 +12534,7 @@ def L4_ploadrdf_abs : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4) $Rdd32 = memd(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2a7b91, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2a7b91, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011111110; @@ -12509,7 +12557,7 @@ def L4_ploadrdf_rr : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_98c0b8, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_98c0b8, AddrModeRel { let Inst{31-21} = 0b00110001110; let isPredicated = 1; let isPredicatedFalse = 1; @@ -12524,7 +12572,7 @@ def L4_ploadrdfnew_abs : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4.new) $Rdd32 = memd(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2a7b91, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2a7b91, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011111110; @@ -12548,7 +12596,7 @@ def L4_ploadrdfnew_rr : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_98c0b8, AddrModeRel { +tc_25a78932, TypeLD>, Enc_98c0b8, AddrModeRel { let Inst{31-21} = 0b00110011110; let isPredicated = 1; let isPredicatedFalse = 1; @@ -12564,7 +12612,7 @@ def L4_ploadrdt_abs : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4) $Rdd32 = memd(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2a7b91, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2a7b91, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011111110; @@ -12586,7 +12634,7 @@ def L4_ploadrdt_rr : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_98c0b8, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_98c0b8, AddrModeRel { let Inst{31-21} = 0b00110000110; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -12600,7 +12648,7 @@ def L4_ploadrdtnew_abs : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4.new) $Rdd32 = memd(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2a7b91, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2a7b91, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011111110; @@ -12623,7 +12671,7 @@ def L4_ploadrdtnew_rr : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_98c0b8, AddrModeRel { +tc_25a78932, TypeLD>, Enc_98c0b8, AddrModeRel { let Inst{31-21} = 0b00110010110; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -12638,7 +12686,7 @@ def L4_ploadrhf_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4) $Rd32 = memh(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011111010; @@ -12663,7 +12711,7 @@ def L4_ploadrhf_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110001010; let isPredicated = 1; let isPredicatedFalse = 1; @@ -12680,7 +12728,7 @@ def L4_ploadrhfnew_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4.new) $Rd32 = memh(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011111010; @@ -12706,7 +12754,7 @@ def L4_ploadrhfnew_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel { +tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110011010; let isPredicated = 1; let isPredicatedFalse = 1; @@ -12724,7 +12772,7 @@ def L4_ploadrht_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4) $Rd32 = memh(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011111010; @@ -12748,7 +12796,7 @@ def L4_ploadrht_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110000010; let isPredicated = 1; let hasNewValue = 1; @@ -12764,7 +12812,7 @@ def L4_ploadrhtnew_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4.new) $Rd32 = memh(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011111010; @@ -12789,7 +12837,7 @@ def L4_ploadrhtnew_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel { +tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110010010; let isPredicated = 1; let hasNewValue = 1; @@ -12806,7 +12854,7 @@ def L4_ploadrif_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4) $Rd32 = memw(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011111100; @@ -12831,7 +12879,7 @@ def L4_ploadrif_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110001100; let isPredicated = 1; let isPredicatedFalse = 1; @@ -12848,7 +12896,7 @@ def L4_ploadrifnew_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4.new) $Rd32 = memw(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011111100; @@ -12874,7 +12922,7 @@ def L4_ploadrifnew_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel { +tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110011100; let isPredicated = 1; let isPredicatedFalse = 1; @@ -12892,7 +12940,7 @@ def L4_ploadrit_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4) $Rd32 = memw(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011111100; @@ -12916,7 +12964,7 @@ def L4_ploadrit_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110000100; let isPredicated = 1; let hasNewValue = 1; @@ -12932,7 +12980,7 @@ def L4_ploadritnew_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4.new) $Rd32 = memw(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011111100; @@ -12957,7 +13005,7 @@ def L4_ploadritnew_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel { +tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110010100; let isPredicated = 1; let hasNewValue = 1; @@ -12974,7 +13022,7 @@ def L4_ploadrubf_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4) $Rd32 = memub(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011111001; @@ -12999,7 +13047,7 @@ def L4_ploadrubf_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110001001; let isPredicated = 1; let isPredicatedFalse = 1; @@ -13016,7 +13064,7 @@ def L4_ploadrubfnew_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4.new) $Rd32 = memub(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011111001; @@ -13042,7 +13090,7 @@ def L4_ploadrubfnew_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel { +tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110011001; let isPredicated = 1; let isPredicatedFalse = 1; @@ -13060,7 +13108,7 @@ def L4_ploadrubt_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4) $Rd32 = memub(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011111001; @@ -13084,7 +13132,7 @@ def L4_ploadrubt_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110000001; let isPredicated = 1; let hasNewValue = 1; @@ -13100,7 +13148,7 @@ def L4_ploadrubtnew_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4.new) $Rd32 = memub(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011111001; @@ -13125,7 +13173,7 @@ def L4_ploadrubtnew_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel { +tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110010001; let isPredicated = 1; let hasNewValue = 1; @@ -13142,7 +13190,7 @@ def L4_ploadruhf_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4) $Rd32 = memuh(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b101; let Inst{31-21} = 0b10011111011; @@ -13167,7 +13215,7 @@ def L4_ploadruhf_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110001011; let isPredicated = 1; let isPredicatedFalse = 1; @@ -13184,7 +13232,7 @@ def L4_ploadruhfnew_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if (!$Pt4.new) $Rd32 = memuh(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b111; let Inst{31-21} = 0b10011111011; @@ -13210,7 +13258,7 @@ def L4_ploadruhfnew_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if (!$Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel { +tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110011011; let isPredicated = 1; let isPredicatedFalse = 1; @@ -13228,7 +13276,7 @@ def L4_ploadruht_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4) $Rd32 = memuh(#$Ii)", -tc_1d5a38a8, TypeLD>, Enc_2301d6, AddrModeRel { +tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b100; let Inst{31-21} = 0b10011111011; @@ -13252,7 +13300,7 @@ def L4_ploadruht_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)", -tc_9ef61e5c, TypeLD>, Enc_2e1979, AddrModeRel { +tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110000011; let isPredicated = 1; let hasNewValue = 1; @@ -13268,7 +13316,7 @@ def L4_ploadruhtnew_abs : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pt4, u32_0Imm:$Ii), "if ($Pt4.new) $Rd32 = memuh(#$Ii)", -tc_b77c481f, TypeLD>, Enc_2301d6, AddrModeRel { +tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel { let Inst{7-5} = 0b100; let Inst{13-11} = 0b110; let Inst{31-21} = 0b10011111011; @@ -13293,7 +13341,7 @@ def L4_ploadruhtnew_rr : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii), "if ($Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)", -tc_b7dd427e, TypeLD>, Enc_2e1979, AddrModeRel { +tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel { let Inst{31-21} = 0b00110010011; let isPredicated = 1; let hasNewValue = 1; @@ -13310,7 +13358,7 @@ def L4_return : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = dealloc_return($Rs32):raw", -tc_3d04548d, TypeLD>, Enc_3a3d62, PredNewRel { +tc_675e4897, TypeLD>, Enc_3a3d62, PredNewRel { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10010110000; let isTerminator = 1; @@ -13331,7 +13379,7 @@ def L4_return_f : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pv4, IntRegs:$Rs32), "if (!$Pv4) $Rdd32 = dealloc_return($Rs32):raw", -tc_513bef45, TypeLD>, Enc_b7fad3, PredNewRel { +tc_2b8da4c2, TypeLD>, Enc_b7fad3, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1100; let Inst{31-21} = 0b10010110000; @@ -13353,7 +13401,7 @@ def L4_return_fnew_pnt : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pv4, IntRegs:$Rs32), "if (!$Pv4.new) $Rdd32 = dealloc_return($Rs32):nt:raw", -tc_395dc00f, TypeLD>, Enc_b7fad3, PredNewRel { +tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1010; let Inst{31-21} = 0b10010110000; @@ -13376,7 +13424,7 @@ def L4_return_fnew_pt : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pv4, IntRegs:$Rs32), "if (!$Pv4.new) $Rdd32 = dealloc_return($Rs32):t:raw", -tc_395dc00f, TypeLD>, Enc_b7fad3, PredNewRel { +tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b1110; let Inst{31-21} = 0b10010110000; @@ -13399,7 +13447,7 @@ def L4_return_map_to_raw_f : HInst< (outs), (ins PredRegs:$Pv4), "if (!$Pv4) dealloc_return", -tc_513bef45, TypeMAPPING>, Requires<[HasV65]> { +tc_2b8da4c2, TypeMAPPING>, Requires<[HasV65]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -13407,7 +13455,7 @@ def L4_return_map_to_raw_fnew_pnt : HInst< (outs), (ins PredRegs:$Pv4), "if (!$Pv4.new) dealloc_return:nt", -tc_395dc00f, TypeMAPPING>, Requires<[HasV65]> { +tc_9da59d12, TypeMAPPING>, Requires<[HasV65]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -13415,7 +13463,7 @@ def L4_return_map_to_raw_fnew_pt : HInst< (outs), (ins PredRegs:$Pv4), "if (!$Pv4.new) dealloc_return:t", -tc_395dc00f, TypeMAPPING>, Requires<[HasV65]> { +tc_9da59d12, TypeMAPPING>, Requires<[HasV65]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -13423,7 +13471,7 @@ def L4_return_map_to_raw_t : HInst< (outs), (ins PredRegs:$Pv4), "if ($Pv4) dealloc_return", -tc_3bc2c5d3, TypeMAPPING>, Requires<[HasV65]> { +tc_4d5fa3a1, TypeMAPPING>, Requires<[HasV65]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -13431,7 +13479,7 @@ def L4_return_map_to_raw_tnew_pnt : HInst< (outs), (ins PredRegs:$Pv4), "if ($Pv4.new) dealloc_return:nt", -tc_e7624c08, TypeMAPPING>, Requires<[HasV65]> { +tc_e06f432a, TypeMAPPING>, Requires<[HasV65]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -13439,7 +13487,7 @@ def L4_return_map_to_raw_tnew_pt : HInst< (outs), (ins PredRegs:$Pv4), "if ($Pv4.new) dealloc_return:t", -tc_e7624c08, TypeMAPPING>, Requires<[HasV65]> { +tc_e06f432a, TypeMAPPING>, Requires<[HasV65]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -13447,7 +13495,7 @@ def L4_return_t : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pv4, IntRegs:$Rs32), "if ($Pv4) $Rdd32 = dealloc_return($Rs32):raw", -tc_513bef45, TypeLD>, Enc_b7fad3, PredNewRel { +tc_2b8da4c2, TypeLD>, Enc_b7fad3, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b0100; let Inst{31-21} = 0b10010110000; @@ -13468,7 +13516,7 @@ def L4_return_tnew_pnt : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pv4, IntRegs:$Rs32), "if ($Pv4.new) $Rdd32 = dealloc_return($Rs32):nt:raw", -tc_395dc00f, TypeLD>, Enc_b7fad3, PredNewRel { +tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b0010; let Inst{31-21} = 0b10010110000; @@ -13490,7 +13538,7 @@ def L4_return_tnew_pt : HInst< (outs DoubleRegs:$Rdd32), (ins PredRegs:$Pv4, IntRegs:$Rs32), "if ($Pv4.new) $Rdd32 = dealloc_return($Rs32):t:raw", -tc_395dc00f, TypeLD>, Enc_b7fad3, PredNewRel { +tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel { let Inst{7-5} = 0b000; let Inst{13-10} = 0b0110; let Inst{31-21} = 0b10010110000; @@ -13512,7 +13560,7 @@ def L4_sub_memopb_io : HInst< (outs), (ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32), "memb($Rs32+#$Ii) -= $Rt32", -tc_44126683, TypeV4LDST>, Enc_d44e31 { +tc_7186d325, TypeV4LDST>, Enc_d44e31 { let Inst{6-5} = 0b01; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110000; @@ -13531,7 +13579,7 @@ def L4_sub_memopb_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memb($Rs32) -= $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -13539,7 +13587,7 @@ def L4_sub_memoph_io : HInst< (outs), (ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "memh($Rs32+#$Ii) -= $Rt32", -tc_44126683, TypeV4LDST>, Enc_163a3c { +tc_7186d325, TypeV4LDST>, Enc_163a3c { let Inst{6-5} = 0b01; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110001; @@ -13558,7 +13606,7 @@ def L4_sub_memoph_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memh($Rs32) -= $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -13566,7 +13614,7 @@ def L4_sub_memopw_io : HInst< (outs), (ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32), "memw($Rs32+#$Ii) -= $Rt32", -tc_44126683, TypeV4LDST>, Enc_226535 { +tc_7186d325, TypeV4LDST>, Enc_226535 { let Inst{6-5} = 0b01; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00111110010; @@ -13585,7 +13633,7 @@ def L4_sub_memopw_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memw($Rs32) -= $Rt32", -tc_44126683, TypeMAPPING> { +tc_7186d325, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -13593,15 +13641,26 @@ def L6_deallocframe_map_to_raw : HInst< (outs), (ins), "deallocframe", -tc_d1090e34, TypeMAPPING>, Requires<[HasV65]> { +tc_15aa71c5, TypeMAPPING>, Requires<[HasV65]> { let isPseudo = 1; let isCodeGenOnly = 1; } +def L6_memcpy : HInst< +(outs), +(ins IntRegs:$Rs32, IntRegs:$Rt32, ModRegs:$Mu2), +"memcpy($Rs32,$Rt32,$Mu2)", +tc_a6b1eca9, TypeLD>, Enc_a75aa6, Requires<[HasV66]> { +let Inst{7-0} = 0b01000000; +let Inst{31-21} = 0b10010010000; +let mayLoad = 1; +let isSolo = 1; +let mayStore = 1; +} def L6_return_map_to_raw : HInst< (outs), (ins), "dealloc_return", -tc_3d04548d, TypeMAPPING>, Requires<[HasV65]> { +tc_675e4897, TypeMAPPING>, Requires<[HasV65]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -13609,7 +13668,7 @@ def M2_acci : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += add($Rs32,$Rt32)", -tc_c74f796f, TypeM>, Enc_2ae154, ImmRegRel { +tc_f675fee8, TypeM>, Enc_2ae154, ImmRegRel { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111000; @@ -13624,7 +13683,7 @@ def M2_accii : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii), "$Rx32 += add($Rs32,#$Ii)", -tc_c74f796f, TypeM>, Enc_c90aca, ImmRegRel { +tc_f675fee8, TypeM>, Enc_c90aca, ImmRegRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100010000; let hasNewValue = 1; @@ -13643,7 +13702,7 @@ def M2_cmaci_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += cmpyi($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111000; @@ -13654,7 +13713,7 @@ def M2_cmacr_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += cmpyr($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111000; @@ -13665,7 +13724,7 @@ def M2_cmacs_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += cmpy($Rs32,$Rt32):sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111000; @@ -13677,7 +13736,7 @@ def M2_cmacs_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += cmpy($Rs32,$Rt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111100; @@ -13689,7 +13748,7 @@ def M2_cmacsc_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += cmpy($Rs32,$Rt32*):sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111010; @@ -13701,7 +13760,7 @@ def M2_cmacsc_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += cmpy($Rs32,$Rt32*):<<1:sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111110; @@ -13713,7 +13772,7 @@ def M2_cmpyi_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = cmpyi($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101000; @@ -13723,7 +13782,7 @@ def M2_cmpyr_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = cmpyr($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101000; @@ -13733,7 +13792,7 @@ def M2_cmpyrs_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = cmpy($Rs32,$Rt32):rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101001; @@ -13746,7 +13805,7 @@ def M2_cmpyrs_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = cmpy($Rs32,$Rt32):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101101; @@ -13759,7 +13818,7 @@ def M2_cmpyrsc_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = cmpy($Rs32,$Rt32*):rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101011; @@ -13772,7 +13831,7 @@ def M2_cmpyrsc_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = cmpy($Rs32,$Rt32*):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101111; @@ -13785,7 +13844,7 @@ def M2_cmpys_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = cmpy($Rs32,$Rt32):sat", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101000; @@ -13796,7 +13855,7 @@ def M2_cmpys_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = cmpy($Rs32,$Rt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101100; @@ -13807,7 +13866,7 @@ def M2_cmpysc_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = cmpy($Rs32,$Rt32*):sat", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101010; @@ -13818,7 +13877,7 @@ def M2_cmpysc_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = cmpy($Rs32,$Rt32*):<<1:sat", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101110; @@ -13829,7 +13888,7 @@ def M2_cnacs_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= cmpy($Rs32,$Rt32):sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111000; @@ -13841,7 +13900,7 @@ def M2_cnacs_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= cmpy($Rs32,$Rt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111100; @@ -13853,7 +13912,7 @@ def M2_cnacsc_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= cmpy($Rs32,$Rt32*):sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111010; @@ -13865,7 +13924,7 @@ def M2_cnacsc_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= cmpy($Rs32,$Rt32*):<<1:sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111110; @@ -13877,7 +13936,7 @@ def M2_dpmpyss_acc_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpy($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111000; @@ -13888,7 +13947,7 @@ def M2_dpmpyss_nac_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpy($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111001; @@ -13899,7 +13958,7 @@ def M2_dpmpyss_rnd_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32,$Rt32):rnd", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101001; @@ -13911,7 +13970,7 @@ def M2_dpmpyss_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101000; @@ -13921,7 +13980,7 @@ def M2_dpmpyuu_acc_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpyu($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111010; @@ -13932,7 +13991,7 @@ def M2_dpmpyuu_nac_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpyu($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111011; @@ -13943,7 +14002,7 @@ def M2_dpmpyuu_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpyu($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101010; @@ -13953,7 +14012,7 @@ def M2_hmmpyh_rs1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32,$Rt32.h):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101101; @@ -13966,7 +14025,7 @@ def M2_hmmpyh_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32,$Rt32.h):<<1:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101101; @@ -13979,7 +14038,7 @@ def M2_hmmpyl_rs1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32,$Rt32.l):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101111; @@ -13992,7 +14051,7 @@ def M2_hmmpyl_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32,$Rt32.l):<<1:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101101; @@ -14005,7 +14064,7 @@ def M2_maci : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpyi($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_2ae154, ImmRegRel { +tc_d773585a, TypeM>, Enc_2ae154, ImmRegRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111000; @@ -14020,7 +14079,7 @@ def M2_macsin : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii), "$Rx32 -= mpyi($Rs32,#$Ii)", -tc_16d0d8d5, TypeM>, Enc_c90aca { +tc_05d3a09b, TypeM>, Enc_c90aca { let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100001100; let hasNewValue = 1; @@ -14038,7 +14097,7 @@ def M2_macsip : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii), "$Rx32 += mpyi($Rs32,#$Ii)", -tc_16d0d8d5, TypeM>, Enc_c90aca, ImmRegRel { +tc_05d3a09b, TypeM>, Enc_c90aca, ImmRegRel { let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100001000; let hasNewValue = 1; @@ -14057,7 +14116,7 @@ def M2_mmachs_rs0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpywoh($Rss32,$Rtt32):rnd:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010001; @@ -14069,7 +14128,7 @@ def M2_mmachs_rs1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:rnd:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010101; @@ -14081,7 +14140,7 @@ def M2_mmachs_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpywoh($Rss32,$Rtt32):sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010000; @@ -14093,7 +14152,7 @@ def M2_mmachs_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010100; @@ -14105,7 +14164,7 @@ def M2_mmacls_rs0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpyweh($Rss32,$Rtt32):rnd:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010001; @@ -14117,7 +14176,7 @@ def M2_mmacls_rs1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:rnd:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010101; @@ -14129,7 +14188,7 @@ def M2_mmacls_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpyweh($Rss32,$Rtt32):sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010000; @@ -14141,7 +14200,7 @@ def M2_mmacls_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010100; @@ -14153,7 +14212,7 @@ def M2_mmacuhs_rs0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpywouh($Rss32,$Rtt32):rnd:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010011; @@ -14165,7 +14224,7 @@ def M2_mmacuhs_rs1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:rnd:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010111; @@ -14177,7 +14236,7 @@ def M2_mmacuhs_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpywouh($Rss32,$Rtt32):sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010010; @@ -14189,7 +14248,7 @@ def M2_mmacuhs_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010110; @@ -14201,7 +14260,7 @@ def M2_mmaculs_rs0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpyweuh($Rss32,$Rtt32):rnd:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010011; @@ -14213,7 +14272,7 @@ def M2_mmaculs_rs1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010111; @@ -14225,7 +14284,7 @@ def M2_mmaculs_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpyweuh($Rss32,$Rtt32):sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010010; @@ -14237,7 +14296,7 @@ def M2_mmaculs_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010110; @@ -14249,7 +14308,7 @@ def M2_mmpyh_rs0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpywoh($Rss32,$Rtt32):rnd:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000001; @@ -14260,7 +14319,7 @@ def M2_mmpyh_rs1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000101; @@ -14271,7 +14330,7 @@ def M2_mmpyh_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpywoh($Rss32,$Rtt32):sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000000; @@ -14282,7 +14341,7 @@ def M2_mmpyh_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000100; @@ -14293,7 +14352,7 @@ def M2_mmpyl_rs0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpyweh($Rss32,$Rtt32):rnd:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000001; @@ -14304,7 +14363,7 @@ def M2_mmpyl_rs1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000101; @@ -14315,7 +14374,7 @@ def M2_mmpyl_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpyweh($Rss32,$Rtt32):sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000000; @@ -14326,7 +14385,7 @@ def M2_mmpyl_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000100; @@ -14337,7 +14396,7 @@ def M2_mmpyuh_rs0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpywouh($Rss32,$Rtt32):rnd:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000011; @@ -14348,7 +14407,7 @@ def M2_mmpyuh_rs1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000111; @@ -14359,7 +14418,7 @@ def M2_mmpyuh_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpywouh($Rss32,$Rtt32):sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000010; @@ -14370,7 +14429,7 @@ def M2_mmpyuh_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000110; @@ -14381,7 +14440,7 @@ def M2_mmpyul_rs0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpyweuh($Rss32,$Rtt32):rnd:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000011; @@ -14392,7 +14451,7 @@ def M2_mmpyul_rs1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000111; @@ -14403,7 +14462,7 @@ def M2_mmpyul_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpyweuh($Rss32,$Rtt32):sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000010; @@ -14414,18 +14473,31 @@ def M2_mmpyul_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000110; let prefersSlot3 = 1; let Defs = [USR_OVF]; } +def M2_mnaci : HInst< +(outs IntRegs:$Rx32), +(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), +"$Rx32 -= mpyi($Rs32,$Rt32)", +tc_bdceeac1, TypeM>, Enc_2ae154, Requires<[HasV66]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b11101111100; +let hasNewValue = 1; +let opNewValue = 0; +let prefersSlot3 = 1; +let Constraints = "$Rx32 = $Rx32in"; +} def M2_mpy_acc_hh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.h,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110000; @@ -14438,7 +14510,7 @@ def M2_mpy_acc_hh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.h,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110100; @@ -14451,7 +14523,7 @@ def M2_mpy_acc_hl_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.h,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110000; @@ -14464,7 +14536,7 @@ def M2_mpy_acc_hl_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.h,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110100; @@ -14477,7 +14549,7 @@ def M2_mpy_acc_lh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.l,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110000; @@ -14490,7 +14562,7 @@ def M2_mpy_acc_lh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.l,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110100; @@ -14503,7 +14575,7 @@ def M2_mpy_acc_ll_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.l,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110000; @@ -14516,7 +14588,7 @@ def M2_mpy_acc_ll_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.l,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110100; @@ -14529,7 +14601,7 @@ def M2_mpy_acc_sat_hh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.h,$Rt32.h):sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110000; @@ -14543,7 +14615,7 @@ def M2_mpy_acc_sat_hh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.h,$Rt32.h):<<1:sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110100; @@ -14557,7 +14629,7 @@ def M2_mpy_acc_sat_hl_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.h,$Rt32.l):sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110000; @@ -14571,7 +14643,7 @@ def M2_mpy_acc_sat_hl_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.h,$Rt32.l):<<1:sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110100; @@ -14585,7 +14657,7 @@ def M2_mpy_acc_sat_lh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.l,$Rt32.h):sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110000; @@ -14599,7 +14671,7 @@ def M2_mpy_acc_sat_lh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.l,$Rt32.h):<<1:sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110100; @@ -14613,7 +14685,7 @@ def M2_mpy_acc_sat_ll_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.l,$Rt32.l):sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110000; @@ -14627,7 +14699,7 @@ def M2_mpy_acc_sat_ll_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32.l,$Rt32.l):<<1:sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110100; @@ -14641,7 +14713,7 @@ def M2_mpy_hh_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.h)", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100000; @@ -14653,7 +14725,7 @@ def M2_mpy_hh_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.h):<<1", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100100; @@ -14665,7 +14737,7 @@ def M2_mpy_hl_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.l)", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100000; @@ -14677,7 +14749,7 @@ def M2_mpy_hl_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.l):<<1", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100100; @@ -14689,7 +14761,7 @@ def M2_mpy_lh_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.h)", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100000; @@ -14701,7 +14773,7 @@ def M2_mpy_lh_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.h):<<1", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100100; @@ -14713,7 +14785,7 @@ def M2_mpy_ll_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.l)", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100000; @@ -14725,7 +14797,7 @@ def M2_mpy_ll_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.l):<<1", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100100; @@ -14737,7 +14809,7 @@ def M2_mpy_nac_hh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.h,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110001; @@ -14750,7 +14822,7 @@ def M2_mpy_nac_hh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110101; @@ -14763,7 +14835,7 @@ def M2_mpy_nac_hl_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.h,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110001; @@ -14776,7 +14848,7 @@ def M2_mpy_nac_hl_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110101; @@ -14789,7 +14861,7 @@ def M2_mpy_nac_lh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.l,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110001; @@ -14802,7 +14874,7 @@ def M2_mpy_nac_lh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110101; @@ -14815,7 +14887,7 @@ def M2_mpy_nac_ll_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.l,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110001; @@ -14828,7 +14900,7 @@ def M2_mpy_nac_ll_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110101; @@ -14841,7 +14913,7 @@ def M2_mpy_nac_sat_hh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.h,$Rt32.h):sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110001; @@ -14855,7 +14927,7 @@ def M2_mpy_nac_sat_hh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1:sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110101; @@ -14869,7 +14941,7 @@ def M2_mpy_nac_sat_hl_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.h,$Rt32.l):sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110001; @@ -14883,7 +14955,7 @@ def M2_mpy_nac_sat_hl_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1:sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110101; @@ -14897,7 +14969,7 @@ def M2_mpy_nac_sat_lh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.l,$Rt32.h):sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110001; @@ -14911,7 +14983,7 @@ def M2_mpy_nac_sat_lh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1:sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110101; @@ -14925,7 +14997,7 @@ def M2_mpy_nac_sat_ll_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.l,$Rt32.l):sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110001; @@ -14939,7 +15011,7 @@ def M2_mpy_nac_sat_ll_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1:sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110101; @@ -14953,7 +15025,7 @@ def M2_mpy_rnd_hh_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.h):rnd", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100001; @@ -14965,7 +15037,7 @@ def M2_mpy_rnd_hh_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100101; @@ -14977,7 +15049,7 @@ def M2_mpy_rnd_hl_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.l):rnd", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100001; @@ -14989,7 +15061,7 @@ def M2_mpy_rnd_hl_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100101; @@ -15001,7 +15073,7 @@ def M2_mpy_rnd_lh_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.h):rnd", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100001; @@ -15013,7 +15085,7 @@ def M2_mpy_rnd_lh_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100101; @@ -15025,7 +15097,7 @@ def M2_mpy_rnd_ll_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.l):rnd", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100001; @@ -15037,7 +15109,7 @@ def M2_mpy_rnd_ll_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100101; @@ -15049,7 +15121,7 @@ def M2_mpy_sat_hh_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.h):sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100000; @@ -15062,7 +15134,7 @@ def M2_mpy_sat_hh_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100100; @@ -15075,7 +15147,7 @@ def M2_mpy_sat_hl_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.l):sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100000; @@ -15088,7 +15160,7 @@ def M2_mpy_sat_hl_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100100; @@ -15101,7 +15173,7 @@ def M2_mpy_sat_lh_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.h):sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100000; @@ -15114,7 +15186,7 @@ def M2_mpy_sat_lh_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100100; @@ -15127,7 +15199,7 @@ def M2_mpy_sat_ll_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.l):sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100000; @@ -15140,7 +15212,7 @@ def M2_mpy_sat_ll_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100100; @@ -15153,7 +15225,7 @@ def M2_mpy_sat_rnd_hh_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.h):rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100001; @@ -15166,7 +15238,7 @@ def M2_mpy_sat_rnd_hh_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100101; @@ -15179,7 +15251,7 @@ def M2_mpy_sat_rnd_hl_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.l):rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100001; @@ -15192,7 +15264,7 @@ def M2_mpy_sat_rnd_hl_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100101; @@ -15205,7 +15277,7 @@ def M2_mpy_sat_rnd_lh_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.h):rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100001; @@ -15218,7 +15290,7 @@ def M2_mpy_sat_rnd_lh_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100101; @@ -15231,7 +15303,7 @@ def M2_mpy_sat_rnd_ll_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.l):rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100001; @@ -15244,7 +15316,7 @@ def M2_mpy_sat_rnd_ll_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100101; @@ -15257,7 +15329,7 @@ def M2_mpy_up : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101000; @@ -15269,7 +15341,7 @@ def M2_mpy_up_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32,$Rt32):<<1", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101101; @@ -15281,7 +15353,7 @@ def M2_mpy_up_s1_sat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpy($Rs32,$Rt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101111; @@ -15294,7 +15366,7 @@ def M2_mpyd_acc_hh_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpy($Rs32.h,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110000; @@ -15305,7 +15377,7 @@ def M2_mpyd_acc_hh_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpy($Rs32.h,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110100; @@ -15316,7 +15388,7 @@ def M2_mpyd_acc_hl_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpy($Rs32.h,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110000; @@ -15327,7 +15399,7 @@ def M2_mpyd_acc_hl_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpy($Rs32.h,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110100; @@ -15338,7 +15410,7 @@ def M2_mpyd_acc_lh_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpy($Rs32.l,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110000; @@ -15349,7 +15421,7 @@ def M2_mpyd_acc_lh_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpy($Rs32.l,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110100; @@ -15360,7 +15432,7 @@ def M2_mpyd_acc_ll_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpy($Rs32.l,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110000; @@ -15371,7 +15443,7 @@ def M2_mpyd_acc_ll_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpy($Rs32.l,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110100; @@ -15382,7 +15454,7 @@ def M2_mpyd_hh_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.h,$Rt32.h)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100000; @@ -15392,7 +15464,7 @@ def M2_mpyd_hh_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100100; @@ -15402,7 +15474,7 @@ def M2_mpyd_hl_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.h,$Rt32.l)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100000; @@ -15412,7 +15484,7 @@ def M2_mpyd_hl_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100100; @@ -15422,7 +15494,7 @@ def M2_mpyd_lh_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.l,$Rt32.h)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100000; @@ -15432,7 +15504,7 @@ def M2_mpyd_lh_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100100; @@ -15442,7 +15514,7 @@ def M2_mpyd_ll_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.l,$Rt32.l)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100000; @@ -15452,7 +15524,7 @@ def M2_mpyd_ll_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100100; @@ -15462,7 +15534,7 @@ def M2_mpyd_nac_hh_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpy($Rs32.h,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110001; @@ -15473,7 +15545,7 @@ def M2_mpyd_nac_hh_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpy($Rs32.h,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110101; @@ -15484,7 +15556,7 @@ def M2_mpyd_nac_hl_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpy($Rs32.h,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110001; @@ -15495,7 +15567,7 @@ def M2_mpyd_nac_hl_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpy($Rs32.h,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110101; @@ -15506,7 +15578,7 @@ def M2_mpyd_nac_lh_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpy($Rs32.l,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110001; @@ -15517,7 +15589,7 @@ def M2_mpyd_nac_lh_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpy($Rs32.l,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110101; @@ -15528,7 +15600,7 @@ def M2_mpyd_nac_ll_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpy($Rs32.l,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110001; @@ -15539,7 +15611,7 @@ def M2_mpyd_nac_ll_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpy($Rs32.l,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110101; @@ -15550,7 +15622,7 @@ def M2_mpyd_rnd_hh_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.h,$Rt32.h):rnd", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100001; @@ -15560,7 +15632,7 @@ def M2_mpyd_rnd_hh_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100101; @@ -15570,7 +15642,7 @@ def M2_mpyd_rnd_hl_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.h,$Rt32.l):rnd", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100001; @@ -15580,7 +15652,7 @@ def M2_mpyd_rnd_hl_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100101; @@ -15590,7 +15662,7 @@ def M2_mpyd_rnd_lh_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.l,$Rt32.h):rnd", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100001; @@ -15600,7 +15672,7 @@ def M2_mpyd_rnd_lh_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100101; @@ -15610,7 +15682,7 @@ def M2_mpyd_rnd_ll_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.l,$Rt32.l):rnd", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100001; @@ -15620,7 +15692,7 @@ def M2_mpyd_rnd_ll_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100101; @@ -15630,7 +15702,7 @@ def M2_mpyi : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpyi($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_5ab2be, ImmRegRel { +tc_bafaade3, TypeM>, Enc_5ab2be, ImmRegRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101000; @@ -15644,7 +15716,7 @@ def M2_mpysin : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u8_0Imm:$Ii), "$Rd32 = -mpyi($Rs32,#$Ii)", -tc_1853ea6d, TypeM>, Enc_b8c967 { +tc_c8ce0b5c, TypeM>, Enc_b8c967 { let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100000100; let hasNewValue = 1; @@ -15655,7 +15727,7 @@ def M2_mpysip : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u32_0Imm:$Ii), "$Rd32 = +mpyi($Rs32,#$Ii)", -tc_1853ea6d, TypeM>, Enc_b8c967 { +tc_c8ce0b5c, TypeM>, Enc_b8c967 { let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100000000; let hasNewValue = 1; @@ -15671,7 +15743,7 @@ def M2_mpysmi : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, m32_0Imm:$Ii), "$Rd32 = mpyi($Rs32,#$Ii)", -tc_1853ea6d, TypeM>, ImmRegRel { +tc_c8ce0b5c, TypeM>, ImmRegRel { let hasNewValue = 1; let opNewValue = 0; let CextOpcode = "M2_mpyi"; @@ -15687,7 +15759,7 @@ def M2_mpysu_up : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpysu($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101011; @@ -15699,7 +15771,7 @@ def M2_mpyu_acc_hh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpyu($Rs32.h,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110010; @@ -15712,7 +15784,7 @@ def M2_mpyu_acc_hh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpyu($Rs32.h,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110110; @@ -15725,7 +15797,7 @@ def M2_mpyu_acc_hl_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpyu($Rs32.h,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110010; @@ -15738,7 +15810,7 @@ def M2_mpyu_acc_hl_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpyu($Rs32.h,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110110; @@ -15751,7 +15823,7 @@ def M2_mpyu_acc_lh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpyu($Rs32.l,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110010; @@ -15764,7 +15836,7 @@ def M2_mpyu_acc_lh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpyu($Rs32.l,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110110; @@ -15777,7 +15849,7 @@ def M2_mpyu_acc_ll_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpyu($Rs32.l,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110010; @@ -15790,7 +15862,7 @@ def M2_mpyu_acc_ll_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpyu($Rs32.l,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110110; @@ -15803,7 +15875,7 @@ def M2_mpyu_hh_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpyu($Rs32.h,$Rt32.h)", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100010; @@ -15815,7 +15887,7 @@ def M2_mpyu_hh_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpyu($Rs32.h,$Rt32.h):<<1", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100110; @@ -15827,7 +15899,7 @@ def M2_mpyu_hl_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpyu($Rs32.h,$Rt32.l)", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100010; @@ -15839,7 +15911,7 @@ def M2_mpyu_hl_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpyu($Rs32.h,$Rt32.l):<<1", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100110; @@ -15851,7 +15923,7 @@ def M2_mpyu_lh_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpyu($Rs32.l,$Rt32.h)", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100010; @@ -15863,7 +15935,7 @@ def M2_mpyu_lh_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpyu($Rs32.l,$Rt32.h):<<1", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100110; @@ -15875,7 +15947,7 @@ def M2_mpyu_ll_s0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpyu($Rs32.l,$Rt32.l)", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100010; @@ -15887,7 +15959,7 @@ def M2_mpyu_ll_s1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpyu($Rs32.l,$Rt32.l):<<1", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101100110; @@ -15899,7 +15971,7 @@ def M2_mpyu_nac_hh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpyu($Rs32.h,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110011; @@ -15912,7 +15984,7 @@ def M2_mpyu_nac_hh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpyu($Rs32.h,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110111; @@ -15925,7 +15997,7 @@ def M2_mpyu_nac_hl_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpyu($Rs32.h,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110011; @@ -15938,7 +16010,7 @@ def M2_mpyu_nac_hl_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpyu($Rs32.h,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110111; @@ -15951,7 +16023,7 @@ def M2_mpyu_nac_lh_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpyu($Rs32.l,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110011; @@ -15964,7 +16036,7 @@ def M2_mpyu_nac_lh_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpyu($Rs32.l,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110111; @@ -15977,7 +16049,7 @@ def M2_mpyu_nac_ll_s0 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpyu($Rs32.l,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110011; @@ -15990,7 +16062,7 @@ def M2_mpyu_nac_ll_s1 : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpyu($Rs32.l,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101110111; @@ -16003,7 +16075,7 @@ def M2_mpyu_up : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpyu($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101010; @@ -16015,7 +16087,7 @@ def M2_mpyud_acc_hh_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpyu($Rs32.h,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110010; @@ -16026,7 +16098,7 @@ def M2_mpyud_acc_hh_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpyu($Rs32.h,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110110; @@ -16037,7 +16109,7 @@ def M2_mpyud_acc_hl_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpyu($Rs32.h,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110010; @@ -16048,7 +16120,7 @@ def M2_mpyud_acc_hl_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpyu($Rs32.h,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110110; @@ -16059,7 +16131,7 @@ def M2_mpyud_acc_lh_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpyu($Rs32.l,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110010; @@ -16070,7 +16142,7 @@ def M2_mpyud_acc_lh_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpyu($Rs32.l,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110110; @@ -16081,7 +16153,7 @@ def M2_mpyud_acc_ll_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpyu($Rs32.l,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110010; @@ -16092,7 +16164,7 @@ def M2_mpyud_acc_ll_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += mpyu($Rs32.l,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110110; @@ -16103,7 +16175,7 @@ def M2_mpyud_hh_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpyu($Rs32.h,$Rt32.h)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100010; @@ -16113,7 +16185,7 @@ def M2_mpyud_hh_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpyu($Rs32.h,$Rt32.h):<<1", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100110; @@ -16123,7 +16195,7 @@ def M2_mpyud_hl_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpyu($Rs32.h,$Rt32.l)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100010; @@ -16133,7 +16205,7 @@ def M2_mpyud_hl_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpyu($Rs32.h,$Rt32.l):<<1", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100110; @@ -16143,7 +16215,7 @@ def M2_mpyud_lh_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpyu($Rs32.l,$Rt32.h)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100010; @@ -16153,7 +16225,7 @@ def M2_mpyud_lh_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpyu($Rs32.l,$Rt32.h):<<1", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100110; @@ -16163,7 +16235,7 @@ def M2_mpyud_ll_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpyu($Rs32.l,$Rt32.l)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100010; @@ -16173,7 +16245,7 @@ def M2_mpyud_ll_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = mpyu($Rs32.l,$Rt32.l):<<1", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100100110; @@ -16183,7 +16255,7 @@ def M2_mpyud_nac_hh_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpyu($Rs32.h,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110011; @@ -16194,7 +16266,7 @@ def M2_mpyud_nac_hh_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpyu($Rs32.h,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110111; @@ -16205,7 +16277,7 @@ def M2_mpyud_nac_hl_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpyu($Rs32.h,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110011; @@ -16216,7 +16288,7 @@ def M2_mpyud_nac_hl_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpyu($Rs32.h,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110111; @@ -16227,7 +16299,7 @@ def M2_mpyud_nac_lh_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpyu($Rs32.l,$Rt32.h)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110011; @@ -16238,7 +16310,7 @@ def M2_mpyud_nac_lh_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpyu($Rs32.l,$Rt32.h):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110111; @@ -16249,7 +16321,7 @@ def M2_mpyud_nac_ll_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpyu($Rs32.l,$Rt32.l)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110011; @@ -16260,7 +16332,7 @@ def M2_mpyud_nac_ll_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 -= mpyu($Rs32.l,$Rt32.l):<<1", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100110111; @@ -16271,7 +16343,7 @@ def M2_mpyui : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = mpyui($Rs32,$Rt32)", -tc_8fd5f294, TypeM> { +tc_bafaade3, TypeM> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -16281,7 +16353,7 @@ def M2_nacci : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= add($Rs32,$Rt32)", -tc_c74f796f, TypeM>, Enc_2ae154 { +tc_f675fee8, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111100; @@ -16295,7 +16367,7 @@ def M2_naccii : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii), "$Rx32 -= add($Rs32,#$Ii)", -tc_c74f796f, TypeM>, Enc_c90aca { +tc_f675fee8, TypeM>, Enc_c90aca { let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100010100; let hasNewValue = 1; @@ -16313,7 +16385,7 @@ def M2_subacc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rt32, IntRegs:$Rs32), "$Rx32 += sub($Rt32,$Rs32)", -tc_c74f796f, TypeM>, Enc_a568d4 { +tc_f675fee8, TypeM>, Enc_a568d4 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111000; @@ -16327,7 +16399,7 @@ def M2_vabsdiffh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vabsdiffh($Rtt32,$Rss32)", -tc_2b6f77c6, TypeM>, Enc_ea23e4 { +tc_002cb246, TypeM>, Enc_ea23e4 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000011; @@ -16337,7 +16409,7 @@ def M2_vabsdiffw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vabsdiffw($Rtt32,$Rss32)", -tc_2b6f77c6, TypeM>, Enc_ea23e4 { +tc_002cb246, TypeM>, Enc_ea23e4 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000001; @@ -16347,7 +16419,7 @@ def M2_vcmac_s0_sat_i : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vcmpyi($Rss32,$Rtt32):sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010010; @@ -16359,7 +16431,7 @@ def M2_vcmac_s0_sat_r : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vcmpyr($Rss32,$Rtt32):sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010001; @@ -16371,7 +16443,7 @@ def M2_vcmpy_s0_sat_i : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vcmpyi($Rss32,$Rtt32):sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000010; @@ -16382,7 +16454,7 @@ def M2_vcmpy_s0_sat_r : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vcmpyr($Rss32,$Rtt32):sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000001; @@ -16393,7 +16465,7 @@ def M2_vcmpy_s1_sat_i : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vcmpyi($Rss32,$Rtt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000110; @@ -16404,7 +16476,7 @@ def M2_vcmpy_s1_sat_r : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vcmpyr($Rss32,$Rtt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000101; @@ -16415,7 +16487,7 @@ def M2_vdmacs_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vdmpy($Rss32,$Rtt32):sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010000; @@ -16427,7 +16499,7 @@ def M2_vdmacs_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vdmpy($Rss32,$Rtt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010100; @@ -16439,7 +16511,7 @@ def M2_vdmpyrs_s0 : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rd32 = vdmpy($Rss32,$Rtt32):rnd:sat", -tc_8fd5f294, TypeM>, Enc_d2216a { +tc_bafaade3, TypeM>, Enc_d2216a { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101001000; @@ -16452,7 +16524,7 @@ def M2_vdmpyrs_s1 : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rd32 = vdmpy($Rss32,$Rtt32):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_d2216a { +tc_bafaade3, TypeM>, Enc_d2216a { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101001100; @@ -16465,7 +16537,7 @@ def M2_vdmpys_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vdmpy($Rss32,$Rtt32):sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000000; @@ -16476,7 +16548,7 @@ def M2_vdmpys_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vdmpy($Rss32,$Rtt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000100; @@ -16487,7 +16559,7 @@ def M2_vmac2 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += vmpyh($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111001; @@ -16498,7 +16570,7 @@ def M2_vmac2es : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpyeh($Rss32,$Rtt32)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010001; @@ -16509,7 +16581,7 @@ def M2_vmac2es_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpyeh($Rss32,$Rtt32):sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010000; @@ -16521,7 +16593,7 @@ def M2_vmac2es_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vmpyeh($Rss32,$Rtt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010100; @@ -16533,7 +16605,7 @@ def M2_vmac2s_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += vmpyh($Rs32,$Rt32):sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111000; @@ -16545,7 +16617,7 @@ def M2_vmac2s_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += vmpyh($Rs32,$Rt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111100; @@ -16557,7 +16629,7 @@ def M2_vmac2su_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += vmpyhsu($Rs32,$Rt32):sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111011; @@ -16569,7 +16641,7 @@ def M2_vmac2su_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += vmpyhsu($Rs32,$Rt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111111; @@ -16581,7 +16653,7 @@ def M2_vmpy2es_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpyeh($Rss32,$Rtt32):sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000000; @@ -16592,7 +16664,7 @@ def M2_vmpy2es_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vmpyeh($Rss32,$Rtt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000100; @@ -16603,7 +16675,7 @@ def M2_vmpy2s_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = vmpyh($Rs32,$Rt32):sat", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101000; @@ -16614,7 +16686,7 @@ def M2_vmpy2s_s0pack : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = vmpyh($Rs32,$Rt32):rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101001; @@ -16627,7 +16699,7 @@ def M2_vmpy2s_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = vmpyh($Rs32,$Rt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101100; @@ -16638,7 +16710,7 @@ def M2_vmpy2s_s1pack : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = vmpyh($Rs32,$Rt32):<<1:rnd:sat", -tc_8fd5f294, TypeM>, Enc_5ab2be { +tc_bafaade3, TypeM>, Enc_5ab2be { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101101101; @@ -16651,7 +16723,7 @@ def M2_vmpy2su_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = vmpyhsu($Rs32,$Rt32):sat", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101000; @@ -16662,7 +16734,7 @@ def M2_vmpy2su_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = vmpyhsu($Rs32,$Rt32):<<1:sat", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101100; @@ -16673,7 +16745,7 @@ def M2_vraddh : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rd32 = vraddh($Rss32,$Rtt32)", -tc_8fd5f294, TypeM>, Enc_d2216a { +tc_bafaade3, TypeM>, Enc_d2216a { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101001001; @@ -16685,7 +16757,7 @@ def M2_vradduh : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rd32 = vradduh($Rss32,$Rtt32)", -tc_8fd5f294, TypeM>, Enc_d2216a { +tc_bafaade3, TypeM>, Enc_d2216a { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101001000; @@ -16697,7 +16769,7 @@ def M2_vrcmaci_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrcmpyi($Rss32,$Rtt32)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010000; @@ -16708,7 +16780,7 @@ def M2_vrcmaci_s0c : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrcmpyi($Rss32,$Rtt32*)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010010; @@ -16719,7 +16791,7 @@ def M2_vrcmacr_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrcmpyr($Rss32,$Rtt32)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010000; @@ -16730,7 +16802,7 @@ def M2_vrcmacr_s0c : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrcmpyr($Rss32,$Rtt32*)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010011; @@ -16741,7 +16813,7 @@ def M2_vrcmpyi_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrcmpyi($Rss32,$Rtt32)", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000000; @@ -16751,7 +16823,7 @@ def M2_vrcmpyi_s0c : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrcmpyi($Rss32,$Rtt32*)", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000010; @@ -16761,7 +16833,7 @@ def M2_vrcmpyr_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrcmpyr($Rss32,$Rtt32)", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000000; @@ -16771,7 +16843,7 @@ def M2_vrcmpyr_s0c : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrcmpyr($Rss32,$Rtt32*)", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000011; @@ -16781,7 +16853,7 @@ def M2_vrcmpys_acc_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 += vrcmpys($Rss32,$Rt32):<<1:sat", -tc_e913dc32, TypeM> { +tc_d773585a, TypeM> { let isPseudo = 1; let Constraints = "$Rxx32 = $Rxx32in"; } @@ -16789,7 +16861,7 @@ def M2_vrcmpys_acc_s1_h : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010101; @@ -16801,7 +16873,7 @@ def M2_vrcmpys_acc_s1_l : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010111; @@ -16813,14 +16885,14 @@ def M2_vrcmpys_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = vrcmpys($Rss32,$Rt32):<<1:sat", -tc_8fd5f294, TypeM> { +tc_bafaade3, TypeM> { let isPseudo = 1; } def M2_vrcmpys_s1_h : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000101; @@ -16831,7 +16903,7 @@ def M2_vrcmpys_s1_l : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000111; @@ -16842,7 +16914,7 @@ def M2_vrcmpys_s1rp : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rd32 = vrcmpys($Rss32,$Rt32):<<1:rnd:sat", -tc_8fd5f294, TypeM> { +tc_bafaade3, TypeM> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -16851,7 +16923,7 @@ def M2_vrcmpys_s1rp_h : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:hi", -tc_8fd5f294, TypeM>, Enc_d2216a { +tc_bafaade3, TypeM>, Enc_d2216a { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101001101; @@ -16864,7 +16936,7 @@ def M2_vrcmpys_s1rp_l : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:lo", -tc_8fd5f294, TypeM>, Enc_d2216a { +tc_bafaade3, TypeM>, Enc_d2216a { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101001101; @@ -16877,7 +16949,7 @@ def M2_vrmac_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrmpyh($Rss32,$Rtt32)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010000; @@ -16888,7 +16960,7 @@ def M2_vrmpy_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrmpyh($Rss32,$Rtt32)", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000000; @@ -16898,7 +16970,7 @@ def M2_xor_xacc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 ^= xor($Rs32,$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111100; @@ -16912,7 +16984,7 @@ def M4_and_and : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 &= and($Rs32,$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111010; @@ -16926,7 +16998,7 @@ def M4_and_andn : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 &= and($Rs32,~$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111001; @@ -16940,7 +17012,7 @@ def M4_and_or : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 &= or($Rs32,$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111010; @@ -16954,7 +17026,7 @@ def M4_and_xor : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 &= xor($Rs32,$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111010; @@ -16968,7 +17040,7 @@ def M4_cmpyi_wh : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rd32 = cmpyiwh($Rss32,$Rt32):<<1:rnd:sat", -tc_8fd5f294, TypeS_3op>, Enc_3d5b28 { +tc_bafaade3, TypeS_3op>, Enc_3d5b28 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000101000; @@ -16981,7 +17053,7 @@ def M4_cmpyi_whc : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rd32 = cmpyiwh($Rss32,$Rt32*):<<1:rnd:sat", -tc_8fd5f294, TypeS_3op>, Enc_3d5b28 { +tc_bafaade3, TypeS_3op>, Enc_3d5b28 { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000101000; @@ -16994,7 +17066,7 @@ def M4_cmpyr_wh : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rd32 = cmpyrwh($Rss32,$Rt32):<<1:rnd:sat", -tc_8fd5f294, TypeS_3op>, Enc_3d5b28 { +tc_bafaade3, TypeS_3op>, Enc_3d5b28 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000101000; @@ -17007,7 +17079,7 @@ def M4_cmpyr_whc : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rd32 = cmpyrwh($Rss32,$Rt32*):<<1:rnd:sat", -tc_8fd5f294, TypeS_3op>, Enc_3d5b28 { +tc_bafaade3, TypeS_3op>, Enc_3d5b28 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000101000; @@ -17020,7 +17092,7 @@ def M4_mac_up_s1_sat : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += mpy($Rs32,$Rt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111011; @@ -17035,7 +17107,7 @@ def M4_mpyri_addi : HInst< (outs IntRegs:$Rd32), (ins u32_0Imm:$Ii, IntRegs:$Rs32, u6_0Imm:$II), "$Rd32 = add(#$Ii,mpyi($Rs32,#$II))", -tc_16d0d8d5, TypeALU64>, Enc_322e1b, ImmRegRel { +tc_05d3a09b, TypeALU64>, Enc_322e1b, ImmRegRel { let Inst{31-24} = 0b11011000; let hasNewValue = 1; let opNewValue = 0; @@ -17051,7 +17123,7 @@ def M4_mpyri_addr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Ru32, IntRegs:$Rs32, u32_0Imm:$Ii), "$Rd32 = add($Ru32,mpyi($Rs32,#$Ii))", -tc_16d0d8d5, TypeALU64>, Enc_420cf3, ImmRegRel { +tc_05d3a09b, TypeALU64>, Enc_420cf3, ImmRegRel { let Inst{31-23} = 0b110111111; let hasNewValue = 1; let opNewValue = 0; @@ -17068,7 +17140,7 @@ def M4_mpyri_addr_u2 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Ru32, u6_2Imm:$Ii, IntRegs:$Rs32), "$Rd32 = add($Ru32,mpyi(#$Ii,$Rs32))", -tc_bcc96cee, TypeALU64>, Enc_277737 { +tc_1a2fd869, TypeALU64>, Enc_277737 { let Inst{31-23} = 0b110111110; let hasNewValue = 1; let opNewValue = 0; @@ -17078,7 +17150,7 @@ def M4_mpyrr_addi : HInst< (outs IntRegs:$Rd32), (ins u32_0Imm:$Ii, IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = add(#$Ii,mpyi($Rs32,$Rt32))", -tc_e913dc32, TypeALU64>, Enc_a7b8e8, ImmRegRel { +tc_d773585a, TypeALU64>, Enc_a7b8e8, ImmRegRel { let Inst{31-23} = 0b110101110; let hasNewValue = 1; let opNewValue = 0; @@ -17095,7 +17167,7 @@ def M4_mpyrr_addr : HInst< (outs IntRegs:$Ry32), (ins IntRegs:$Ru32, IntRegs:$Ry32in, IntRegs:$Rs32), "$Ry32 = add($Ru32,mpyi($Ry32in,$Rs32))", -tc_e913dc32, TypeM>, Enc_7f1a05, ImmRegRel { +tc_d773585a, TypeM>, Enc_7f1a05, ImmRegRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100011000; @@ -17110,7 +17182,7 @@ def M4_nac_up_s1_sat : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= mpy($Rs32,$Rt32):<<1:sat", -tc_e913dc32, TypeM>, Enc_2ae154 { +tc_d773585a, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111011; @@ -17125,7 +17197,7 @@ def M4_or_and : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 |= and($Rs32,$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111010; @@ -17139,7 +17211,7 @@ def M4_or_andn : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 |= and($Rs32,~$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111001; @@ -17153,7 +17225,7 @@ def M4_or_or : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 |= or($Rs32,$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111110; @@ -17167,7 +17239,7 @@ def M4_or_xor : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 |= xor($Rs32,$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111110; @@ -17181,7 +17253,7 @@ def M4_pmpyw : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = pmpyw($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101010; @@ -17191,7 +17263,7 @@ def M4_pmpyw_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 ^= pmpyw($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111001; @@ -17202,7 +17274,7 @@ def M4_vpmpyh : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = vpmpyh($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101110; @@ -17212,7 +17284,7 @@ def M4_vpmpyh_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 ^= vpmpyh($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111101; @@ -17223,7 +17295,7 @@ def M4_vrmpyeh_acc_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrmpyweh($Rss32,$Rtt32)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010001; @@ -17234,7 +17306,7 @@ def M4_vrmpyeh_acc_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrmpyweh($Rss32,$Rtt32):<<1", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010101; @@ -17245,7 +17317,7 @@ def M4_vrmpyeh_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrmpyweh($Rss32,$Rtt32)", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000010; @@ -17255,7 +17327,7 @@ def M4_vrmpyeh_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrmpyweh($Rss32,$Rtt32):<<1", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000110; @@ -17265,7 +17337,7 @@ def M4_vrmpyoh_acc_s0 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrmpywoh($Rss32,$Rtt32)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010011; @@ -17276,7 +17348,7 @@ def M4_vrmpyoh_acc_s1 : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrmpywoh($Rss32,$Rtt32):<<1", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010111; @@ -17287,7 +17359,7 @@ def M4_vrmpyoh_s0 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrmpywoh($Rss32,$Rtt32)", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000001; @@ -17297,7 +17369,7 @@ def M4_vrmpyoh_s1 : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrmpywoh($Rss32,$Rtt32):<<1", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000101; @@ -17307,7 +17379,7 @@ def M4_xor_and : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 ^= and($Rs32,$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111110; @@ -17321,7 +17393,7 @@ def M4_xor_andn : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 ^= and($Rs32,~$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111001; @@ -17335,7 +17407,7 @@ def M4_xor_or : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 ^= or($Rs32,$Rt32)", -tc_84df2cd3, TypeM>, Enc_2ae154 { +tc_f429765c, TypeM>, Enc_2ae154 { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101111110; @@ -17349,7 +17421,7 @@ def M4_xor_xacc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 ^= xor($Rss32,$Rtt32)", -tc_84df2cd3, TypeS_3op>, Enc_88c16c { +tc_f429765c, TypeS_3op>, Enc_88c16c { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001010100; @@ -17360,7 +17432,7 @@ def M5_vdmacbsu : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vdmpybsu($Rss32,$Rtt32):sat", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010001; @@ -17372,7 +17444,7 @@ def M5_vdmpybsu : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vdmpybsu($Rss32,$Rtt32):sat", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000101; @@ -17383,7 +17455,7 @@ def M5_vmacbsu : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += vmpybsu($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111110; @@ -17394,7 +17466,7 @@ def M5_vmacbuu : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rxx32 += vmpybu($Rs32,$Rt32)", -tc_e913dc32, TypeM>, Enc_61f0b0 { +tc_d773585a, TypeM>, Enc_61f0b0 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100111100; @@ -17405,7 +17477,7 @@ def M5_vmpybsu : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = vmpybsu($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101010; @@ -17415,7 +17487,7 @@ def M5_vmpybuu : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = vmpybu($Rs32,$Rt32)", -tc_8fd5f294, TypeM>, Enc_be32a5 { +tc_bafaade3, TypeM>, Enc_be32a5 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11100101100; @@ -17425,7 +17497,7 @@ def M5_vrmacbsu : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrmpybsu($Rss32,$Rtt32)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010110; @@ -17436,7 +17508,7 @@ def M5_vrmacbuu : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 += vrmpybu($Rss32,$Rtt32)", -tc_e913dc32, TypeM>, Enc_88c16c { +tc_d773585a, TypeM>, Enc_88c16c { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101010100; @@ -17447,7 +17519,7 @@ def M5_vrmpybsu : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrmpybsu($Rss32,$Rtt32)", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000110; @@ -17457,7 +17529,7 @@ def M5_vrmpybuu : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vrmpybu($Rss32,$Rtt32)", -tc_8fd5f294, TypeM>, Enc_a56825 { +tc_bafaade3, TypeM>, Enc_a56825 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000100; @@ -17467,7 +17539,7 @@ def M6_vabsdiffb : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vabsdiffb($Rtt32,$Rss32)", -tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62]> { +tc_9461ff31, TypeM>, Enc_ea23e4, Requires<[HasV62]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000111; @@ -17477,7 +17549,7 @@ def M6_vabsdiffub : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = vabsdiffub($Rtt32,$Rss32)", -tc_f49e76f4, TypeM>, Enc_ea23e4, Requires<[HasV62]> { +tc_9461ff31, TypeM>, Enc_ea23e4, Requires<[HasV62]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11101000101; @@ -17487,7 +17559,7 @@ def PS_loadrbabs : HInst< (outs IntRegs:$Rd32), (ins u32_0Imm:$Ii), "$Rd32 = memb(#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_25bef0, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel { let Inst{24-21} = 0b1000; let Inst{31-27} = 0b01001; let hasNewValue = 1; @@ -17510,7 +17582,7 @@ def PS_loadrdabs : HInst< (outs DoubleRegs:$Rdd32), (ins u29_3Imm:$Ii), "$Rdd32 = memd(#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_509701, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_509701, AddrModeRel { let Inst{24-21} = 0b1110; let Inst{31-27} = 0b01001; let addrMode = Absolute; @@ -17531,7 +17603,7 @@ def PS_loadrhabs : HInst< (outs IntRegs:$Rd32), (ins u31_1Imm:$Ii), "$Rd32 = memh(#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_8df4be, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel { let Inst{24-21} = 0b1010; let Inst{31-27} = 0b01001; let hasNewValue = 1; @@ -17554,7 +17626,7 @@ def PS_loadriabs : HInst< (outs IntRegs:$Rd32), (ins u30_2Imm:$Ii), "$Rd32 = memw(#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_4f4ed7, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_4f4ed7, AddrModeRel { let Inst{24-21} = 0b1100; let Inst{31-27} = 0b01001; let hasNewValue = 1; @@ -17577,7 +17649,7 @@ def PS_loadrubabs : HInst< (outs IntRegs:$Rd32), (ins u32_0Imm:$Ii), "$Rd32 = memub(#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_25bef0, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel { let Inst{24-21} = 0b1001; let Inst{31-27} = 0b01001; let hasNewValue = 1; @@ -17600,7 +17672,7 @@ def PS_loadruhabs : HInst< (outs IntRegs:$Rd32), (ins u31_1Imm:$Ii), "$Rd32 = memuh(#$Ii)", -tc_9c98e8af, TypeV2LDST>, Enc_8df4be, AddrModeRel { +tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel { let Inst{24-21} = 0b1011; let Inst{31-27} = 0b01001; let hasNewValue = 1; @@ -17623,7 +17695,7 @@ def PS_storerbabs : HInst< (outs), (ins u32_0Imm:$Ii, IntRegs:$Rt32), "memb(#$Ii) = $Rt32", -tc_a788683e, TypeV2LDST>, Enc_1b64fb, AddrModeRel { +tc_0371abea, TypeV2LDST>, Enc_1b64fb, AddrModeRel { let Inst{24-21} = 0b0000; let Inst{31-27} = 0b01001; let addrMode = Absolute; @@ -17645,7 +17717,7 @@ def PS_storerbnewabs : HInst< (outs), (ins u32_0Imm:$Ii, IntRegs:$Nt8), "memb(#$Ii) = $Nt8.new", -tc_ff9ee76e, TypeV2LDST>, Enc_ad1831, AddrModeRel { +tc_5bf126a6, TypeV2LDST>, Enc_ad1831, AddrModeRel { let Inst{12-11} = 0b00; let Inst{24-21} = 0b0101; let Inst{31-27} = 0b01001; @@ -17671,7 +17743,7 @@ def PS_storerdabs : HInst< (outs), (ins u29_3Imm:$Ii, DoubleRegs:$Rtt32), "memd(#$Ii) = $Rtt32", -tc_a788683e, TypeV2LDST>, Enc_5c124a, AddrModeRel { +tc_0371abea, TypeV2LDST>, Enc_5c124a, AddrModeRel { let Inst{24-21} = 0b0110; let Inst{31-27} = 0b01001; let addrMode = Absolute; @@ -17692,7 +17764,7 @@ def PS_storerfabs : HInst< (outs), (ins u31_1Imm:$Ii, IntRegs:$Rt32), "memh(#$Ii) = $Rt32.h", -tc_a788683e, TypeV2LDST>, Enc_fda92c, AddrModeRel { +tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel { let Inst{24-21} = 0b0011; let Inst{31-27} = 0b01001; let addrMode = Absolute; @@ -17713,7 +17785,7 @@ def PS_storerhabs : HInst< (outs), (ins u31_1Imm:$Ii, IntRegs:$Rt32), "memh(#$Ii) = $Rt32", -tc_a788683e, TypeV2LDST>, Enc_fda92c, AddrModeRel { +tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel { let Inst{24-21} = 0b0010; let Inst{31-27} = 0b01001; let addrMode = Absolute; @@ -17735,7 +17807,7 @@ def PS_storerhnewabs : HInst< (outs), (ins u31_1Imm:$Ii, IntRegs:$Nt8), "memh(#$Ii) = $Nt8.new", -tc_ff9ee76e, TypeV2LDST>, Enc_bc03e5, AddrModeRel { +tc_5bf126a6, TypeV2LDST>, Enc_bc03e5, AddrModeRel { let Inst{12-11} = 0b01; let Inst{24-21} = 0b0101; let Inst{31-27} = 0b01001; @@ -17761,7 +17833,7 @@ def PS_storeriabs : HInst< (outs), (ins u30_2Imm:$Ii, IntRegs:$Rt32), "memw(#$Ii) = $Rt32", -tc_a788683e, TypeV2LDST>, Enc_541f26, AddrModeRel { +tc_0371abea, TypeV2LDST>, Enc_541f26, AddrModeRel { let Inst{24-21} = 0b0100; let Inst{31-27} = 0b01001; let addrMode = Absolute; @@ -17783,7 +17855,7 @@ def PS_storerinewabs : HInst< (outs), (ins u30_2Imm:$Ii, IntRegs:$Nt8), "memw(#$Ii) = $Nt8.new", -tc_ff9ee76e, TypeV2LDST>, Enc_78cbf0, AddrModeRel { +tc_5bf126a6, TypeV2LDST>, Enc_78cbf0, AddrModeRel { let Inst{12-11} = 0b10; let Inst{24-21} = 0b0101; let Inst{31-27} = 0b01001; @@ -17809,7 +17881,7 @@ def S2_addasl_rrri : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32, u3_0Imm:$Ii), "$Rd32 = addasl($Rt32,$Rs32,#$Ii)", -tc_c74f796f, TypeS_3op>, Enc_47ef61 { +tc_f675fee8, TypeS_3op>, Enc_47ef61 { let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000100000; let hasNewValue = 1; @@ -17820,7 +17892,7 @@ def S2_allocframe : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, u11_3Imm:$Ii), "allocframe($Rx32,#$Ii):raw", -tc_e216a5db, TypeST>, Enc_22c845 { +tc_b44ecf75, TypeST>, Enc_22c845 { let Inst{13-11} = 0b000; let Inst{31-21} = 0b10100000100; let hasNewValue = 1; @@ -17836,7 +17908,7 @@ def S2_asl_i_p : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rdd32 = asl($Rss32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_5eac98 { +tc_946df596, TypeS_2op>, Enc_5eac98 { let Inst{7-5} = 0b010; let Inst{31-21} = 0b10000000000; } @@ -17844,7 +17916,7 @@ def S2_asl_i_p_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 += asl($Rss32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_70fb07 { +tc_f675fee8, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b110; let Inst{31-21} = 0b10000010000; let prefersSlot3 = 1; @@ -17854,7 +17926,7 @@ def S2_asl_i_p_and : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 &= asl($Rss32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_70fb07 { +tc_f429765c, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b010; let Inst{31-21} = 0b10000010010; let prefersSlot3 = 1; @@ -17864,7 +17936,7 @@ def S2_asl_i_p_nac : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 -= asl($Rss32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_70fb07 { +tc_f675fee8, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b010; let Inst{31-21} = 0b10000010000; let prefersSlot3 = 1; @@ -17874,7 +17946,7 @@ def S2_asl_i_p_or : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 |= asl($Rss32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_70fb07 { +tc_f429765c, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b110; let Inst{31-21} = 0b10000010010; let prefersSlot3 = 1; @@ -17884,7 +17956,7 @@ def S2_asl_i_p_xacc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 ^= asl($Rss32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_70fb07 { +tc_f429765c, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b010; let Inst{31-21} = 0b10000010100; let prefersSlot3 = 1; @@ -17894,7 +17966,7 @@ def S2_asl_i_r : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = asl($Rs32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_a05677 { +tc_946df596, TypeS_2op>, Enc_a05677 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100000; @@ -17905,7 +17977,7 @@ def S2_asl_i_r_acc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 += asl($Rs32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_28a2dc { +tc_f675fee8, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110000; @@ -17918,7 +17990,7 @@ def S2_asl_i_r_and : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 &= asl($Rs32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_28a2dc { +tc_f429765c, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110010; @@ -17931,7 +18003,7 @@ def S2_asl_i_r_nac : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 -= asl($Rs32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_28a2dc { +tc_f675fee8, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110000; @@ -17944,7 +18016,7 @@ def S2_asl_i_r_or : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 |= asl($Rs32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_28a2dc { +tc_f429765c, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110010; @@ -17957,7 +18029,7 @@ def S2_asl_i_r_sat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = asl($Rs32,#$Ii):sat", -tc_b44c6e2a, TypeS_2op>, Enc_a05677 { +tc_779080bf, TypeS_2op>, Enc_a05677 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100010; @@ -17970,7 +18042,7 @@ def S2_asl_i_r_xacc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 ^= asl($Rs32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_28a2dc { +tc_f429765c, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110100; @@ -17983,7 +18055,7 @@ def S2_asl_i_vh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u4_0Imm:$Ii), "$Rdd32 = vaslh($Rss32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_12b6e9 { +tc_946df596, TypeS_2op>, Enc_12b6e9 { let Inst{7-5} = 0b010; let Inst{13-12} = 0b00; let Inst{31-21} = 0b10000000100; @@ -17992,7 +18064,7 @@ def S2_asl_i_vw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u5_0Imm:$Ii), "$Rdd32 = vaslw($Rss32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_7e5a82 { +tc_946df596, TypeS_2op>, Enc_7e5a82 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10000000010; @@ -18001,7 +18073,7 @@ def S2_asl_r_p : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = asl($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011100; @@ -18010,7 +18082,7 @@ def S2_asl_r_p_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 += asl($Rss32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_1aa186 { +tc_f675fee8, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011110; @@ -18021,7 +18093,7 @@ def S2_asl_r_p_and : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 &= asl($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011010; @@ -18032,7 +18104,7 @@ def S2_asl_r_p_nac : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 -= asl($Rss32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_1aa186 { +tc_f675fee8, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011100; @@ -18043,7 +18115,7 @@ def S2_asl_r_p_or : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 |= asl($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011000; @@ -18054,7 +18126,7 @@ def S2_asl_r_p_xor : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 ^= asl($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011011; @@ -18065,7 +18137,7 @@ def S2_asl_r_r : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = asl($Rs32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_5ab2be { +tc_946df596, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110010; @@ -18076,7 +18148,7 @@ def S2_asl_r_r_acc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += asl($Rs32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_2ae154 { +tc_f675fee8, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100110; @@ -18089,7 +18161,7 @@ def S2_asl_r_r_and : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 &= asl($Rs32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_2ae154 { +tc_f429765c, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100010; @@ -18102,7 +18174,7 @@ def S2_asl_r_r_nac : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= asl($Rs32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_2ae154 { +tc_f675fee8, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100100; @@ -18115,7 +18187,7 @@ def S2_asl_r_r_or : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 |= asl($Rs32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_2ae154 { +tc_f429765c, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100000; @@ -18128,7 +18200,7 @@ def S2_asl_r_r_sat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = asl($Rs32,$Rt32):sat", -tc_b44c6e2a, TypeS_3op>, Enc_5ab2be { +tc_779080bf, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110000; @@ -18141,7 +18213,7 @@ def S2_asl_r_vh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = vaslh($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011010; @@ -18150,7 +18222,7 @@ def S2_asl_r_vw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = vaslw($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011000; @@ -18159,7 +18231,7 @@ def S2_asr_i_p : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rdd32 = asr($Rss32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_5eac98 { +tc_946df596, TypeS_2op>, Enc_5eac98 { let Inst{7-5} = 0b000; let Inst{31-21} = 0b10000000000; } @@ -18167,7 +18239,7 @@ def S2_asr_i_p_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 += asr($Rss32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_70fb07 { +tc_f675fee8, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b100; let Inst{31-21} = 0b10000010000; let prefersSlot3 = 1; @@ -18177,7 +18249,7 @@ def S2_asr_i_p_and : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 &= asr($Rss32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_70fb07 { +tc_f429765c, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b000; let Inst{31-21} = 0b10000010010; let prefersSlot3 = 1; @@ -18187,7 +18259,7 @@ def S2_asr_i_p_nac : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 -= asr($Rss32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_70fb07 { +tc_f675fee8, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b000; let Inst{31-21} = 0b10000010000; let prefersSlot3 = 1; @@ -18197,7 +18269,7 @@ def S2_asr_i_p_or : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 |= asr($Rss32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_70fb07 { +tc_f429765c, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b100; let Inst{31-21} = 0b10000010010; let prefersSlot3 = 1; @@ -18207,7 +18279,7 @@ def S2_asr_i_p_rnd : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rdd32 = asr($Rss32,#$Ii):rnd", -tc_2b6f77c6, TypeS_2op>, Enc_5eac98 { +tc_002cb246, TypeS_2op>, Enc_5eac98 { let Inst{7-5} = 0b111; let Inst{31-21} = 0b10000000110; let prefersSlot3 = 1; @@ -18216,14 +18288,14 @@ def S2_asr_i_p_rnd_goodsyntax : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rdd32 = asrrnd($Rss32,#$Ii)", -tc_2b6f77c6, TypeS_2op> { +tc_002cb246, TypeS_2op> { let isPseudo = 1; } def S2_asr_i_r : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = asr($Rs32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_a05677 { +tc_946df596, TypeS_2op>, Enc_a05677 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100000; @@ -18234,7 +18306,7 @@ def S2_asr_i_r_acc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 += asr($Rs32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_28a2dc { +tc_f675fee8, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110000; @@ -18247,7 +18319,7 @@ def S2_asr_i_r_and : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 &= asr($Rs32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_28a2dc { +tc_f429765c, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110010; @@ -18260,7 +18332,7 @@ def S2_asr_i_r_nac : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 -= asr($Rs32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_28a2dc { +tc_f675fee8, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110000; @@ -18273,7 +18345,7 @@ def S2_asr_i_r_or : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 |= asr($Rs32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_28a2dc { +tc_f429765c, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110010; @@ -18286,7 +18358,7 @@ def S2_asr_i_r_rnd : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = asr($Rs32,#$Ii):rnd", -tc_2b6f77c6, TypeS_2op>, Enc_a05677 { +tc_002cb246, TypeS_2op>, Enc_a05677 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100010; @@ -18298,7 +18370,7 @@ def S2_asr_i_r_rnd_goodsyntax : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = asrrnd($Rs32,#$Ii)", -tc_2b6f77c6, TypeS_2op> { +tc_002cb246, TypeS_2op> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -18307,7 +18379,7 @@ def S2_asr_i_svw_trun : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, u5_0Imm:$Ii), "$Rd32 = vasrw($Rss32,#$Ii)", -tc_1b9c9ee5, TypeS_2op>, Enc_8dec2e { +tc_4414d8b1, TypeS_2op>, Enc_8dec2e { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001000110; @@ -18319,7 +18391,7 @@ def S2_asr_i_vh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u4_0Imm:$Ii), "$Rdd32 = vasrh($Rss32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_12b6e9 { +tc_946df596, TypeS_2op>, Enc_12b6e9 { let Inst{7-5} = 0b000; let Inst{13-12} = 0b00; let Inst{31-21} = 0b10000000100; @@ -18328,7 +18400,7 @@ def S2_asr_i_vw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u5_0Imm:$Ii), "$Rdd32 = vasrw($Rss32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_7e5a82 { +tc_946df596, TypeS_2op>, Enc_7e5a82 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10000000010; @@ -18337,7 +18409,7 @@ def S2_asr_r_p : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = asr($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011100; @@ -18346,7 +18418,7 @@ def S2_asr_r_p_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 += asr($Rss32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_1aa186 { +tc_f675fee8, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011110; @@ -18357,7 +18429,7 @@ def S2_asr_r_p_and : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 &= asr($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011010; @@ -18368,7 +18440,7 @@ def S2_asr_r_p_nac : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 -= asr($Rss32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_1aa186 { +tc_f675fee8, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011100; @@ -18379,7 +18451,7 @@ def S2_asr_r_p_or : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 |= asr($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011000; @@ -18390,7 +18462,7 @@ def S2_asr_r_p_xor : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 ^= asr($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011011; @@ -18401,7 +18473,7 @@ def S2_asr_r_r : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = asr($Rs32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_5ab2be { +tc_946df596, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110010; @@ -18412,7 +18484,7 @@ def S2_asr_r_r_acc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += asr($Rs32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_2ae154 { +tc_f675fee8, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100110; @@ -18425,7 +18497,7 @@ def S2_asr_r_r_and : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 &= asr($Rs32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_2ae154 { +tc_f429765c, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100010; @@ -18438,7 +18510,7 @@ def S2_asr_r_r_nac : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= asr($Rs32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_2ae154 { +tc_f675fee8, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100100; @@ -18451,7 +18523,7 @@ def S2_asr_r_r_or : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 |= asr($Rs32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_2ae154 { +tc_f429765c, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100000; @@ -18464,7 +18536,7 @@ def S2_asr_r_r_sat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = asr($Rs32,$Rt32):sat", -tc_b44c6e2a, TypeS_3op>, Enc_5ab2be { +tc_779080bf, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110000; @@ -18477,7 +18549,7 @@ def S2_asr_r_svw_trun : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rd32 = vasrw($Rss32,$Rt32)", -tc_1b9c9ee5, TypeS_3op>, Enc_3d5b28 { +tc_4414d8b1, TypeS_3op>, Enc_3d5b28 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000101000; @@ -18489,7 +18561,7 @@ def S2_asr_r_vh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = vasrh($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011010; @@ -18498,7 +18570,7 @@ def S2_asr_r_vw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = vasrw($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011000; @@ -18507,7 +18579,7 @@ def S2_brev : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = brev($Rs32)", -tc_d088982c, TypeS_2op>, Enc_5e2823 { +tc_14b5c689, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10001100010; let hasNewValue = 1; @@ -18518,7 +18590,7 @@ def S2_brevp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = brev($Rss32)", -tc_d088982c, TypeS_2op>, Enc_b9c5fb { +tc_14b5c689, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10000000110; let prefersSlot3 = 1; @@ -18527,7 +18599,7 @@ def S2_cabacdecbin : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = decbin($Rss32,$Rtt32)", -tc_c6ebf8dd, TypeS_3op>, Enc_a56825 { +tc_76851da1, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001110; @@ -18539,7 +18611,7 @@ def S2_cl0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = cl0($Rs32)", -tc_d088982c, TypeS_2op>, Enc_5e2823 { +tc_14b5c689, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000101; let Inst{31-21} = 0b10001100000; let hasNewValue = 1; @@ -18550,7 +18622,7 @@ def S2_cl0p : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = cl0($Rss32)", -tc_d088982c, TypeS_2op>, Enc_90cd8b { +tc_14b5c689, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000010; let Inst{31-21} = 0b10001000010; let hasNewValue = 1; @@ -18561,7 +18633,7 @@ def S2_cl1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = cl1($Rs32)", -tc_d088982c, TypeS_2op>, Enc_5e2823 { +tc_14b5c689, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10001100000; let hasNewValue = 1; @@ -18572,7 +18644,7 @@ def S2_cl1p : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = cl1($Rss32)", -tc_d088982c, TypeS_2op>, Enc_90cd8b { +tc_14b5c689, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10001000010; let hasNewValue = 1; @@ -18583,7 +18655,7 @@ def S2_clb : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = clb($Rs32)", -tc_d088982c, TypeS_2op>, Enc_5e2823 { +tc_14b5c689, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10001100000; let hasNewValue = 1; @@ -18594,7 +18666,7 @@ def S2_clbnorm : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = normamt($Rs32)", -tc_d088982c, TypeS_2op>, Enc_5e2823 { +tc_14b5c689, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000111; let Inst{31-21} = 0b10001100000; let hasNewValue = 1; @@ -18605,7 +18677,7 @@ def S2_clbp : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = clb($Rss32)", -tc_d088982c, TypeS_2op>, Enc_90cd8b { +tc_14b5c689, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10001000010; let hasNewValue = 1; @@ -18616,7 +18688,7 @@ def S2_clrbit_i : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = clrbit($Rs32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_a05677 { +tc_946df596, TypeS_2op>, Enc_a05677 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100110; @@ -18627,7 +18699,7 @@ def S2_clrbit_r : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = clrbit($Rs32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_5ab2be { +tc_946df596, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110100; @@ -18638,7 +18710,7 @@ def S2_ct0 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = ct0($Rs32)", -tc_d088982c, TypeS_2op>, Enc_5e2823 { +tc_14b5c689, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10001100010; let hasNewValue = 1; @@ -18649,7 +18721,7 @@ def S2_ct0p : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = ct0($Rss32)", -tc_d088982c, TypeS_2op>, Enc_90cd8b { +tc_14b5c689, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000010; let Inst{31-21} = 0b10001000111; let hasNewValue = 1; @@ -18660,7 +18732,7 @@ def S2_ct1 : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = ct1($Rs32)", -tc_d088982c, TypeS_2op>, Enc_5e2823 { +tc_14b5c689, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000101; let Inst{31-21} = 0b10001100010; let hasNewValue = 1; @@ -18671,7 +18743,7 @@ def S2_ct1p : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = ct1($Rss32)", -tc_d088982c, TypeS_2op>, Enc_90cd8b { +tc_14b5c689, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10001000111; let hasNewValue = 1; @@ -18682,7 +18754,7 @@ def S2_deinterleave : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = deinterleave($Rss32)", -tc_d088982c, TypeS_2op>, Enc_b9c5fb { +tc_14b5c689, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10000000110; let prefersSlot3 = 1; @@ -18691,7 +18763,7 @@ def S2_extractu : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II), "$Rd32 = extractu($Rs32,#$Ii,#$II)", -tc_c74f796f, TypeS_2op>, Enc_b388cf { +tc_f675fee8, TypeS_2op>, Enc_b388cf { let Inst{13-13} = 0b0; let Inst{31-23} = 0b100011010; let hasNewValue = 1; @@ -18702,7 +18774,7 @@ def S2_extractu_rp : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, DoubleRegs:$Rtt32), "$Rd32 = extractu($Rs32,$Rtt32)", -tc_2b6f77c6, TypeS_3op>, Enc_e07374 { +tc_002cb246, TypeS_3op>, Enc_e07374 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001001000; @@ -18714,7 +18786,7 @@ def S2_extractup : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II), "$Rdd32 = extractu($Rss32,#$Ii,#$II)", -tc_c74f796f, TypeS_2op>, Enc_b84c4c { +tc_f675fee8, TypeS_2op>, Enc_b84c4c { let Inst{31-24} = 0b10000001; let prefersSlot3 = 1; } @@ -18722,7 +18794,7 @@ def S2_extractup_rp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = extractu($Rss32,$Rtt32)", -tc_2b6f77c6, TypeS_3op>, Enc_a56825 { +tc_002cb246, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001000; @@ -18732,7 +18804,7 @@ def S2_insert : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II), "$Rx32 = insert($Rs32,#$Ii,#$II)", -tc_87735c3b, TypeS_2op>, Enc_a1e29d { +tc_bfec0f01, TypeS_2op>, Enc_a1e29d { let Inst{13-13} = 0b0; let Inst{31-23} = 0b100011110; let hasNewValue = 1; @@ -18744,7 +18816,7 @@ def S2_insert_rp : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, DoubleRegs:$Rtt32), "$Rx32 = insert($Rs32,$Rtt32)", -tc_84df2cd3, TypeS_3op>, Enc_179b35 { +tc_f429765c, TypeS_3op>, Enc_179b35 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001000000; @@ -18757,7 +18829,7 @@ def S2_insertp : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II), "$Rxx32 = insert($Rss32,#$Ii,#$II)", -tc_87735c3b, TypeS_2op>, Enc_143a3c { +tc_bfec0f01, TypeS_2op>, Enc_143a3c { let Inst{31-24} = 0b10000011; let prefersSlot3 = 1; let Constraints = "$Rxx32 = $Rxx32in"; @@ -18766,7 +18838,7 @@ def S2_insertp_rp : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rxx32 = insert($Rss32,$Rtt32)", -tc_84df2cd3, TypeS_3op>, Enc_88c16c { +tc_f429765c, TypeS_3op>, Enc_88c16c { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001010000; @@ -18777,7 +18849,7 @@ def S2_interleave : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = interleave($Rss32)", -tc_d088982c, TypeS_2op>, Enc_b9c5fb { +tc_14b5c689, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000101; let Inst{31-21} = 0b10000000110; let prefersSlot3 = 1; @@ -18786,7 +18858,7 @@ def S2_lfsp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = lfs($Rss32,$Rtt32)", -tc_2b6f77c6, TypeS_3op>, Enc_a56825 { +tc_002cb246, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001100; @@ -18796,7 +18868,7 @@ def S2_lsl_r_p : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = lsl($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011100; @@ -18805,7 +18877,7 @@ def S2_lsl_r_p_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 += lsl($Rss32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_1aa186 { +tc_f675fee8, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011110; @@ -18816,7 +18888,7 @@ def S2_lsl_r_p_and : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 &= lsl($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011010; @@ -18827,7 +18899,7 @@ def S2_lsl_r_p_nac : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 -= lsl($Rss32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_1aa186 { +tc_f675fee8, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011100; @@ -18838,7 +18910,7 @@ def S2_lsl_r_p_or : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 |= lsl($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011000; @@ -18849,7 +18921,7 @@ def S2_lsl_r_p_xor : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 ^= lsl($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011011; @@ -18860,7 +18932,7 @@ def S2_lsl_r_r : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = lsl($Rs32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_5ab2be { +tc_946df596, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110010; @@ -18871,7 +18943,7 @@ def S2_lsl_r_r_acc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += lsl($Rs32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_2ae154 { +tc_f675fee8, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100110; @@ -18884,7 +18956,7 @@ def S2_lsl_r_r_and : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 &= lsl($Rs32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_2ae154 { +tc_f429765c, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100010; @@ -18897,7 +18969,7 @@ def S2_lsl_r_r_nac : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= lsl($Rs32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_2ae154 { +tc_f675fee8, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100100; @@ -18910,7 +18982,7 @@ def S2_lsl_r_r_or : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 |= lsl($Rs32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_2ae154 { +tc_f429765c, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100000; @@ -18923,7 +18995,7 @@ def S2_lsl_r_vh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = vlslh($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011010; @@ -18932,7 +19004,7 @@ def S2_lsl_r_vw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = vlslw($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011000; @@ -18941,7 +19013,7 @@ def S2_lsr_i_p : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rdd32 = lsr($Rss32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_5eac98 { +tc_946df596, TypeS_2op>, Enc_5eac98 { let Inst{7-5} = 0b001; let Inst{31-21} = 0b10000000000; } @@ -18949,7 +19021,7 @@ def S2_lsr_i_p_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 += lsr($Rss32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_70fb07 { +tc_f675fee8, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b101; let Inst{31-21} = 0b10000010000; let prefersSlot3 = 1; @@ -18959,7 +19031,7 @@ def S2_lsr_i_p_and : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 &= lsr($Rss32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_70fb07 { +tc_f429765c, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b001; let Inst{31-21} = 0b10000010010; let prefersSlot3 = 1; @@ -18969,7 +19041,7 @@ def S2_lsr_i_p_nac : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 -= lsr($Rss32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_70fb07 { +tc_f675fee8, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b001; let Inst{31-21} = 0b10000010000; let prefersSlot3 = 1; @@ -18979,7 +19051,7 @@ def S2_lsr_i_p_or : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 |= lsr($Rss32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_70fb07 { +tc_f429765c, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b101; let Inst{31-21} = 0b10000010010; let prefersSlot3 = 1; @@ -18989,7 +19061,7 @@ def S2_lsr_i_p_xacc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 ^= lsr($Rss32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_70fb07 { +tc_f429765c, TypeS_2op>, Enc_70fb07 { let Inst{7-5} = 0b001; let Inst{31-21} = 0b10000010100; let prefersSlot3 = 1; @@ -18999,7 +19071,7 @@ def S2_lsr_i_r : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = lsr($Rs32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_a05677 { +tc_946df596, TypeS_2op>, Enc_a05677 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100000; @@ -19010,7 +19082,7 @@ def S2_lsr_i_r_acc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 += lsr($Rs32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_28a2dc { +tc_f675fee8, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110000; @@ -19023,7 +19095,7 @@ def S2_lsr_i_r_and : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 &= lsr($Rs32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_28a2dc { +tc_f429765c, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110010; @@ -19036,7 +19108,7 @@ def S2_lsr_i_r_nac : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 -= lsr($Rs32,#$Ii)", -tc_c74f796f, TypeS_2op>, Enc_28a2dc { +tc_f675fee8, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110000; @@ -19049,7 +19121,7 @@ def S2_lsr_i_r_or : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 |= lsr($Rs32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_28a2dc { +tc_f429765c, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110010; @@ -19062,7 +19134,7 @@ def S2_lsr_i_r_xacc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 ^= lsr($Rs32,#$Ii)", -tc_84df2cd3, TypeS_2op>, Enc_28a2dc { +tc_f429765c, TypeS_2op>, Enc_28a2dc { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110100; @@ -19075,7 +19147,7 @@ def S2_lsr_i_vh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u4_0Imm:$Ii), "$Rdd32 = vlsrh($Rss32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_12b6e9 { +tc_946df596, TypeS_2op>, Enc_12b6e9 { let Inst{7-5} = 0b001; let Inst{13-12} = 0b00; let Inst{31-21} = 0b10000000100; @@ -19084,7 +19156,7 @@ def S2_lsr_i_vw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u5_0Imm:$Ii), "$Rdd32 = vlsrw($Rss32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_7e5a82 { +tc_946df596, TypeS_2op>, Enc_7e5a82 { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10000000010; @@ -19093,7 +19165,7 @@ def S2_lsr_r_p : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = lsr($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011100; @@ -19102,7 +19174,7 @@ def S2_lsr_r_p_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 += lsr($Rss32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_1aa186 { +tc_f675fee8, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011110; @@ -19113,7 +19185,7 @@ def S2_lsr_r_p_and : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 &= lsr($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011010; @@ -19124,7 +19196,7 @@ def S2_lsr_r_p_nac : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 -= lsr($Rss32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_1aa186 { +tc_f675fee8, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011100; @@ -19135,7 +19207,7 @@ def S2_lsr_r_p_or : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 |= lsr($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011000; @@ -19146,7 +19218,7 @@ def S2_lsr_r_p_xor : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 ^= lsr($Rss32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_1aa186 { +tc_f429765c, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001011011; @@ -19157,7 +19229,7 @@ def S2_lsr_r_r : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = lsr($Rs32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_5ab2be { +tc_946df596, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110010; @@ -19168,7 +19240,7 @@ def S2_lsr_r_r_acc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 += lsr($Rs32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_2ae154 { +tc_f675fee8, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100110; @@ -19181,7 +19253,7 @@ def S2_lsr_r_r_and : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 &= lsr($Rs32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_2ae154 { +tc_f429765c, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100010; @@ -19194,7 +19266,7 @@ def S2_lsr_r_r_nac : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 -= lsr($Rs32,$Rt32)", -tc_c74f796f, TypeS_3op>, Enc_2ae154 { +tc_f675fee8, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100100; @@ -19207,7 +19279,7 @@ def S2_lsr_r_r_or : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32), "$Rx32 |= lsr($Rs32,$Rt32)", -tc_84df2cd3, TypeS_3op>, Enc_2ae154 { +tc_f429765c, TypeS_3op>, Enc_2ae154 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001100000; @@ -19220,7 +19292,7 @@ def S2_lsr_r_vh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = vlsrh($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011010; @@ -19229,16 +19301,28 @@ def S2_lsr_r_vw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = vlsrw($Rss32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_927852 { +tc_946df596, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011000; } +def S2_mask : HInst< +(outs IntRegs:$Rd32), +(ins u5_0Imm:$Ii, u5_0Imm:$II), +"$Rd32 = mask(#$Ii,#$II)", +tc_9461ff31, TypeS_2op>, Enc_c85e2a, Requires<[HasV66]> { +let Inst{13-13} = 0b1; +let Inst{20-16} = 0b00000; +let Inst{31-23} = 0b100011010; +let hasNewValue = 1; +let opNewValue = 0; +let prefersSlot3 = 1; +} def S2_packhl : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = packhl($Rs32,$Rt32)", -tc_b9488031, TypeALU32_3op>, Enc_be32a5 { +tc_5a2711e5, TypeALU32_3op>, Enc_be32a5 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11110101100; @@ -19248,7 +19332,7 @@ def S2_parityp : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rd32 = parity($Rss32,$Rtt32)", -tc_2b6f77c6, TypeALU64>, Enc_d2216a { +tc_002cb246, TypeALU64>, Enc_d2216a { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010000000; @@ -19260,7 +19344,7 @@ def S2_pstorerbf_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memb($Rs32+#$Ii) = $Rt32", -tc_8b15472a, TypeV2LDST>, Enc_da8d43, AddrModeRel { +tc_f8e23f0b, TypeV2LDST>, Enc_da8d43, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000100000; let isPredicated = 1; @@ -19282,7 +19366,7 @@ def S2_pstorerbf_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memb($Rx32++#$Ii) = $Rt32", -tc_cd7374a0, TypeST>, Enc_cc449f, AddrModeRel { +tc_24b66c99, TypeST>, Enc_cc449f, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; @@ -19300,7 +19384,7 @@ def S2_pstorerbf_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pv4) memb($Rs32) = $Rt32", -tc_8b15472a, TypeMAPPING> { +tc_f8e23f0b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -19308,7 +19392,7 @@ def S2_pstorerbfnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memb($Rx32++#$Ii) = $Rt32", -tc_74e47fd9, TypeST>, Enc_cc449f, AddrModeRel { +tc_53559e35, TypeST>, Enc_cc449f, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -19327,7 +19411,7 @@ def S2_pstorerbnewf_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memb($Rs32+#$Ii) = $Nt8.new", -tc_594ab548, TypeV2LDST>, Enc_585242, AddrModeRel { +tc_8fb7ab1b, TypeV2LDST>, Enc_585242, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b00; let Inst{31-21} = 0b01000100101; @@ -19353,7 +19437,7 @@ def S2_pstorerbnewf_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memb($Rx32++#$Ii) = $Nt8.new", -tc_d9f95eef, TypeST>, Enc_52a5dd, AddrModeRel { +tc_838b34ea, TypeST>, Enc_52a5dd, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b0; let Inst{13-11} = 0b100; @@ -19375,7 +19459,7 @@ def S2_pstorerbnewf_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if (!$Pv4) memb($Rs32) = $Nt8.new", -tc_594ab548, TypeMAPPING> { +tc_8fb7ab1b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -19384,7 +19468,7 @@ def S2_pstorerbnewfnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memb($Rx32++#$Ii) = $Nt8.new", -tc_d24b2d85, TypeST>, Enc_52a5dd, AddrModeRel { +tc_d65dbf51, TypeST>, Enc_52a5dd, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-11} = 0b100; @@ -19407,7 +19491,7 @@ def S2_pstorerbnewt_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memb($Rs32+#$Ii) = $Nt8.new", -tc_594ab548, TypeV2LDST>, Enc_585242, AddrModeRel { +tc_8fb7ab1b, TypeV2LDST>, Enc_585242, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b00; let Inst{31-21} = 0b01000000101; @@ -19432,7 +19516,7 @@ def S2_pstorerbnewt_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memb($Rx32++#$Ii) = $Nt8.new", -tc_d9f95eef, TypeST>, Enc_52a5dd, AddrModeRel { +tc_838b34ea, TypeST>, Enc_52a5dd, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b0; let Inst{13-11} = 0b100; @@ -19453,7 +19537,7 @@ def S2_pstorerbnewt_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if ($Pv4) memb($Rs32) = $Nt8.new", -tc_594ab548, TypeMAPPING> { +tc_8fb7ab1b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -19462,7 +19546,7 @@ def S2_pstorerbnewtnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memb($Rx32++#$Ii) = $Nt8.new", -tc_d24b2d85, TypeST>, Enc_52a5dd, AddrModeRel { +tc_d65dbf51, TypeST>, Enc_52a5dd, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-11} = 0b100; @@ -19484,7 +19568,7 @@ def S2_pstorerbt_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memb($Rs32+#$Ii) = $Rt32", -tc_8b15472a, TypeV2LDST>, Enc_da8d43, AddrModeRel { +tc_f8e23f0b, TypeV2LDST>, Enc_da8d43, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000000000; let isPredicated = 1; @@ -19505,7 +19589,7 @@ def S2_pstorerbt_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memb($Rx32++#$Ii) = $Rt32", -tc_cd7374a0, TypeST>, Enc_cc449f, AddrModeRel { +tc_24b66c99, TypeST>, Enc_cc449f, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; @@ -19522,7 +19606,7 @@ def S2_pstorerbt_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pv4) memb($Rs32) = $Rt32", -tc_8b15472a, TypeMAPPING> { +tc_f8e23f0b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -19530,7 +19614,7 @@ def S2_pstorerbtnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memb($Rx32++#$Ii) = $Rt32", -tc_74e47fd9, TypeST>, Enc_cc449f, AddrModeRel { +tc_53559e35, TypeST>, Enc_cc449f, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -19548,7 +19632,7 @@ def S2_pstorerdf_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32), "if (!$Pv4) memd($Rs32+#$Ii) = $Rtt32", -tc_8b15472a, TypeV2LDST>, Enc_57a33e, AddrModeRel { +tc_f8e23f0b, TypeV2LDST>, Enc_57a33e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000100110; let isPredicated = 1; @@ -19569,7 +19653,7 @@ def S2_pstorerdf_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32), "if (!$Pv4) memd($Rx32++#$Ii) = $Rtt32", -tc_cd7374a0, TypeST>, Enc_9a33d5, AddrModeRel { +tc_24b66c99, TypeST>, Enc_9a33d5, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; @@ -19587,7 +19671,7 @@ def S2_pstorerdf_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32), "if (!$Pv4) memd($Rs32) = $Rtt32", -tc_8b15472a, TypeMAPPING> { +tc_f8e23f0b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -19595,7 +19679,7 @@ def S2_pstorerdfnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32), "if (!$Pv4.new) memd($Rx32++#$Ii) = $Rtt32", -tc_74e47fd9, TypeST>, Enc_9a33d5, AddrModeRel { +tc_53559e35, TypeST>, Enc_9a33d5, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -19614,7 +19698,7 @@ def S2_pstorerdt_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32), "if ($Pv4) memd($Rs32+#$Ii) = $Rtt32", -tc_8b15472a, TypeV2LDST>, Enc_57a33e, AddrModeRel { +tc_f8e23f0b, TypeV2LDST>, Enc_57a33e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000000110; let isPredicated = 1; @@ -19634,7 +19718,7 @@ def S2_pstorerdt_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32), "if ($Pv4) memd($Rx32++#$Ii) = $Rtt32", -tc_cd7374a0, TypeST>, Enc_9a33d5, AddrModeRel { +tc_24b66c99, TypeST>, Enc_9a33d5, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; @@ -19651,7 +19735,7 @@ def S2_pstorerdt_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32), "if ($Pv4) memd($Rs32) = $Rtt32", -tc_8b15472a, TypeMAPPING> { +tc_f8e23f0b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -19659,7 +19743,7 @@ def S2_pstorerdtnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32), "if ($Pv4.new) memd($Rx32++#$Ii) = $Rtt32", -tc_74e47fd9, TypeST>, Enc_9a33d5, AddrModeRel { +tc_53559e35, TypeST>, Enc_9a33d5, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -19677,7 +19761,7 @@ def S2_pstorerff_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memh($Rs32+#$Ii) = $Rt32.h", -tc_8b15472a, TypeV2LDST>, Enc_e8c45e, AddrModeRel { +tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000100011; let isPredicated = 1; @@ -19698,7 +19782,7 @@ def S2_pstorerff_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memh($Rx32++#$Ii) = $Rt32.h", -tc_cd7374a0, TypeST>, Enc_b886fd, AddrModeRel { +tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; @@ -19716,7 +19800,7 @@ def S2_pstorerff_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pv4) memh($Rs32) = $Rt32.h", -tc_8b15472a, TypeMAPPING> { +tc_f8e23f0b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -19724,7 +19808,7 @@ def S2_pstorerffnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32.h", -tc_74e47fd9, TypeST>, Enc_b886fd, AddrModeRel { +tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -19743,7 +19827,7 @@ def S2_pstorerft_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memh($Rs32+#$Ii) = $Rt32.h", -tc_8b15472a, TypeV2LDST>, Enc_e8c45e, AddrModeRel { +tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000000011; let isPredicated = 1; @@ -19763,7 +19847,7 @@ def S2_pstorerft_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memh($Rx32++#$Ii) = $Rt32.h", -tc_cd7374a0, TypeST>, Enc_b886fd, AddrModeRel { +tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; @@ -19780,7 +19864,7 @@ def S2_pstorerft_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pv4) memh($Rs32) = $Rt32.h", -tc_8b15472a, TypeMAPPING> { +tc_f8e23f0b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -19788,7 +19872,7 @@ def S2_pstorerftnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32.h", -tc_74e47fd9, TypeST>, Enc_b886fd, AddrModeRel { +tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -19806,7 +19890,7 @@ def S2_pstorerhf_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memh($Rs32+#$Ii) = $Rt32", -tc_8b15472a, TypeV2LDST>, Enc_e8c45e, AddrModeRel { +tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000100010; let isPredicated = 1; @@ -19828,7 +19912,7 @@ def S2_pstorerhf_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memh($Rx32++#$Ii) = $Rt32", -tc_cd7374a0, TypeST>, Enc_b886fd, AddrModeRel { +tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; @@ -19846,7 +19930,7 @@ def S2_pstorerhf_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pv4) memh($Rs32) = $Rt32", -tc_8b15472a, TypeMAPPING> { +tc_f8e23f0b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -19854,7 +19938,7 @@ def S2_pstorerhfnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32", -tc_74e47fd9, TypeST>, Enc_b886fd, AddrModeRel { +tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -19873,7 +19957,7 @@ def S2_pstorerhnewf_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memh($Rs32+#$Ii) = $Nt8.new", -tc_594ab548, TypeV2LDST>, Enc_f44229, AddrModeRel { +tc_8fb7ab1b, TypeV2LDST>, Enc_f44229, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b01; let Inst{31-21} = 0b01000100101; @@ -19899,7 +19983,7 @@ def S2_pstorerhnewf_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memh($Rx32++#$Ii) = $Nt8.new", -tc_d9f95eef, TypeST>, Enc_31aa6a, AddrModeRel { +tc_838b34ea, TypeST>, Enc_31aa6a, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b0; let Inst{13-11} = 0b101; @@ -19921,7 +20005,7 @@ def S2_pstorerhnewf_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if (!$Pv4) memh($Rs32) = $Nt8.new", -tc_594ab548, TypeMAPPING> { +tc_8fb7ab1b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -19930,7 +20014,7 @@ def S2_pstorerhnewfnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memh($Rx32++#$Ii) = $Nt8.new", -tc_d24b2d85, TypeST>, Enc_31aa6a, AddrModeRel { +tc_d65dbf51, TypeST>, Enc_31aa6a, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-11} = 0b101; @@ -19953,7 +20037,7 @@ def S2_pstorerhnewt_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memh($Rs32+#$Ii) = $Nt8.new", -tc_594ab548, TypeV2LDST>, Enc_f44229, AddrModeRel { +tc_8fb7ab1b, TypeV2LDST>, Enc_f44229, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b01; let Inst{31-21} = 0b01000000101; @@ -19978,7 +20062,7 @@ def S2_pstorerhnewt_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memh($Rx32++#$Ii) = $Nt8.new", -tc_d9f95eef, TypeST>, Enc_31aa6a, AddrModeRel { +tc_838b34ea, TypeST>, Enc_31aa6a, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b0; let Inst{13-11} = 0b101; @@ -19999,7 +20083,7 @@ def S2_pstorerhnewt_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if ($Pv4) memh($Rs32) = $Nt8.new", -tc_594ab548, TypeMAPPING> { +tc_8fb7ab1b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -20008,7 +20092,7 @@ def S2_pstorerhnewtnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memh($Rx32++#$Ii) = $Nt8.new", -tc_d24b2d85, TypeST>, Enc_31aa6a, AddrModeRel { +tc_d65dbf51, TypeST>, Enc_31aa6a, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-11} = 0b101; @@ -20030,7 +20114,7 @@ def S2_pstorerht_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memh($Rs32+#$Ii) = $Rt32", -tc_8b15472a, TypeV2LDST>, Enc_e8c45e, AddrModeRel { +tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000000010; let isPredicated = 1; @@ -20051,7 +20135,7 @@ def S2_pstorerht_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memh($Rx32++#$Ii) = $Rt32", -tc_cd7374a0, TypeST>, Enc_b886fd, AddrModeRel { +tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; @@ -20068,7 +20152,7 @@ def S2_pstorerht_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pv4) memh($Rs32) = $Rt32", -tc_8b15472a, TypeMAPPING> { +tc_f8e23f0b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -20076,7 +20160,7 @@ def S2_pstorerhtnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32", -tc_74e47fd9, TypeST>, Enc_b886fd, AddrModeRel { +tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -20094,7 +20178,7 @@ def S2_pstorerif_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memw($Rs32+#$Ii) = $Rt32", -tc_8b15472a, TypeV2LDST>, Enc_397f23, AddrModeRel { +tc_f8e23f0b, TypeV2LDST>, Enc_397f23, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000100100; let isPredicated = 1; @@ -20116,7 +20200,7 @@ def S2_pstorerif_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memw($Rx32++#$Ii) = $Rt32", -tc_cd7374a0, TypeST>, Enc_7eaeb6, AddrModeRel { +tc_24b66c99, TypeST>, Enc_7eaeb6, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; @@ -20134,7 +20218,7 @@ def S2_pstorerif_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pv4) memw($Rs32) = $Rt32", -tc_8b15472a, TypeMAPPING> { +tc_f8e23f0b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -20142,7 +20226,7 @@ def S2_pstorerifnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memw($Rx32++#$Ii) = $Rt32", -tc_74e47fd9, TypeST>, Enc_7eaeb6, AddrModeRel { +tc_53559e35, TypeST>, Enc_7eaeb6, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -20162,7 +20246,7 @@ def S2_pstorerinewf_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memw($Rs32+#$Ii) = $Nt8.new", -tc_594ab548, TypeV2LDST>, Enc_8dbdfe, AddrModeRel { +tc_8fb7ab1b, TypeV2LDST>, Enc_8dbdfe, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b10; let Inst{31-21} = 0b01000100101; @@ -20188,7 +20272,7 @@ def S2_pstorerinewf_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memw($Rx32++#$Ii) = $Nt8.new", -tc_d9f95eef, TypeST>, Enc_65f095, AddrModeRel { +tc_838b34ea, TypeST>, Enc_65f095, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b0; let Inst{13-11} = 0b110; @@ -20210,7 +20294,7 @@ def S2_pstorerinewf_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if (!$Pv4) memw($Rs32) = $Nt8.new", -tc_594ab548, TypeMAPPING> { +tc_8fb7ab1b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -20219,7 +20303,7 @@ def S2_pstorerinewfnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memw($Rx32++#$Ii) = $Nt8.new", -tc_d24b2d85, TypeST>, Enc_65f095, AddrModeRel { +tc_d65dbf51, TypeST>, Enc_65f095, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-11} = 0b110; @@ -20242,7 +20326,7 @@ def S2_pstorerinewt_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memw($Rs32+#$Ii) = $Nt8.new", -tc_594ab548, TypeV2LDST>, Enc_8dbdfe, AddrModeRel { +tc_8fb7ab1b, TypeV2LDST>, Enc_8dbdfe, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b10; let Inst{31-21} = 0b01000000101; @@ -20267,7 +20351,7 @@ def S2_pstorerinewt_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memw($Rx32++#$Ii) = $Nt8.new", -tc_d9f95eef, TypeST>, Enc_65f095, AddrModeRel { +tc_838b34ea, TypeST>, Enc_65f095, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b0; let Inst{13-11} = 0b110; @@ -20288,7 +20372,7 @@ def S2_pstorerinewt_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if ($Pv4) memw($Rs32) = $Nt8.new", -tc_594ab548, TypeMAPPING> { +tc_8fb7ab1b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -20297,7 +20381,7 @@ def S2_pstorerinewtnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memw($Rx32++#$Ii) = $Nt8.new", -tc_d24b2d85, TypeST>, Enc_65f095, AddrModeRel { +tc_d65dbf51, TypeST>, Enc_65f095, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-11} = 0b110; @@ -20319,7 +20403,7 @@ def S2_pstorerit_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memw($Rs32+#$Ii) = $Rt32", -tc_8b15472a, TypeV2LDST>, Enc_397f23, AddrModeRel { +tc_f8e23f0b, TypeV2LDST>, Enc_397f23, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000000100; let isPredicated = 1; @@ -20340,7 +20424,7 @@ def S2_pstorerit_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memw($Rx32++#$Ii) = $Rt32", -tc_cd7374a0, TypeST>, Enc_7eaeb6, AddrModeRel { +tc_24b66c99, TypeST>, Enc_7eaeb6, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; @@ -20357,7 +20441,7 @@ def S2_pstorerit_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pv4) memw($Rs32) = $Rt32", -tc_8b15472a, TypeMAPPING> { +tc_f8e23f0b, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -20365,7 +20449,7 @@ def S2_pstoreritnew_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memw($Rx32++#$Ii) = $Rt32", -tc_74e47fd9, TypeST>, Enc_7eaeb6, AddrModeRel { +tc_53559e35, TypeST>, Enc_7eaeb6, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -20383,7 +20467,7 @@ def S2_setbit_i : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = setbit($Rs32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_a05677 { +tc_946df596, TypeS_2op>, Enc_a05677 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100110; @@ -20394,7 +20478,7 @@ def S2_setbit_r : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = setbit($Rs32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_5ab2be { +tc_946df596, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110100; @@ -20405,7 +20489,7 @@ def S2_shuffeb : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = shuffeb($Rss32,$Rtt32)", -tc_540fdfbc, TypeS_3op>, Enc_a56825 { +tc_946df596, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001000; @@ -20414,7 +20498,7 @@ def S2_shuffeh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = shuffeh($Rss32,$Rtt32)", -tc_540fdfbc, TypeS_3op>, Enc_a56825 { +tc_946df596, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001000; @@ -20423,7 +20507,7 @@ def S2_shuffob : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = shuffob($Rtt32,$Rss32)", -tc_540fdfbc, TypeS_3op>, Enc_ea23e4 { +tc_946df596, TypeS_3op>, Enc_ea23e4 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001000; @@ -20432,7 +20516,7 @@ def S2_shuffoh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32), "$Rdd32 = shuffoh($Rtt32,$Rss32)", -tc_540fdfbc, TypeS_3op>, Enc_ea23e4 { +tc_946df596, TypeS_3op>, Enc_ea23e4 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001100; @@ -20441,7 +20525,7 @@ def S2_storerb_io : HInst< (outs), (ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Rt32), "memb($Rs32+#$Ii) = $Rt32", -tc_05b6c987, TypeST>, Enc_448f7f, AddrModeRel, PostInc_BaseImm { +tc_30b9bb4a, TypeST>, Enc_448f7f, AddrModeRel, PostInc_BaseImm { let Inst{24-21} = 0b1000; let Inst{31-27} = 0b10100; let addrMode = BaseImmOffset; @@ -20462,9 +20546,10 @@ def S2_storerb_pbr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memb($Rx32++$Mu2:brev) = $Rt32", -tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel { +tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel { let Inst{7-0} = 0b00000000; let Inst{31-21} = 0b10101111000; +let addrMode = PostInc; let accessSize = ByteAccess; let mayStore = 1; let BaseOpcode = "S2_storerb_pbr"; @@ -20475,7 +20560,7 @@ def S2_storerb_pci : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32), "memb($Rx32++#$Ii:circ($Mu2)) = $Rt32", -tc_9fdb5406, TypeST>, Enc_b15941, AddrModeRel { +tc_e86aa961, TypeST>, Enc_b15941, AddrModeRel { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{31-21} = 0b10101001000; @@ -20491,7 +20576,7 @@ def S2_storerb_pcr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memb($Rx32++I:circ($Mu2)) = $Rt32", -tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel { +tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel { let Inst{7-0} = 0b00000010; let Inst{31-21} = 0b10101001000; let addrMode = PostInc; @@ -20506,7 +20591,7 @@ def S2_storerb_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32), "memb($Rx32++#$Ii) = $Rt32", -tc_f86c328a, TypeST>, Enc_10bc21, AddrModeRel, PostInc_BaseImm { +tc_da97ee82, TypeST>, Enc_10bc21, AddrModeRel, PostInc_BaseImm { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; @@ -20524,7 +20609,7 @@ def S2_storerb_pr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memb($Rx32++$Mu2) = $Rt32", -tc_f86c328a, TypeST>, Enc_d5c73f { +tc_da97ee82, TypeST>, Enc_d5c73f { let Inst{7-0} = 0b00000000; let Inst{31-21} = 0b10101101000; let addrMode = PostInc; @@ -20537,7 +20622,7 @@ def S2_storerb_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memb($Rs32) = $Rt32", -tc_05b6c987, TypeMAPPING> { +tc_30b9bb4a, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -20545,7 +20630,7 @@ def S2_storerbgp : HInst< (outs), (ins u32_0Imm:$Ii, IntRegs:$Rt32), "memb(gp+#$Ii) = $Rt32", -tc_a788683e, TypeV2LDST>, Enc_1b64fb, AddrModeRel { +tc_0371abea, TypeV2LDST>, Enc_1b64fb, AddrModeRel { let Inst{24-21} = 0b0000; let Inst{31-27} = 0b01001; let accessSize = ByteAccess; @@ -20563,7 +20648,7 @@ def S2_storerbnew_io : HInst< (outs), (ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Nt8), "memb($Rs32+#$Ii) = $Nt8.new", -tc_f7dd9c9f, TypeST>, Enc_4df4e9, AddrModeRel { +tc_be9602ff, TypeST>, Enc_4df4e9, AddrModeRel { let Inst{12-11} = 0b00; let Inst{24-21} = 0b1101; let Inst{31-27} = 0b10100; @@ -20588,10 +20673,11 @@ def S2_storerbnew_pbr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8), "memb($Rx32++$Mu2:brev) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel { +tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel { let Inst{7-0} = 0b00000000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b10101111101; +let addrMode = PostInc; let accessSize = ByteAccess; let isNVStore = 1; let isNewValue = 1; @@ -20605,7 +20691,7 @@ def S2_storerbnew_pci : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8), "memb($Rx32++#$Ii:circ($Mu2)) = $Nt8.new", -tc_9d5941c7, TypeST>, Enc_96ce4f, AddrModeRel { +tc_d5c0729a, TypeST>, Enc_96ce4f, AddrModeRel { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{12-11} = 0b00; @@ -20625,7 +20711,7 @@ def S2_storerbnew_pcr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8), "memb($Rx32++I:circ($Mu2)) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel { +tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel { let Inst{7-0} = 0b00000010; let Inst{12-11} = 0b00; let Inst{31-21} = 0b10101001101; @@ -20644,7 +20730,7 @@ def S2_storerbnew_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8), "memb($Rx32++#$Ii) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_c7cd90, AddrModeRel { +tc_c79a189f, TypeST>, Enc_c7cd90, AddrModeRel { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{13-11} = 0b000; @@ -20665,7 +20751,7 @@ def S2_storerbnew_pr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8), "memb($Rx32++$Mu2) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_8dbe85 { +tc_c79a189f, TypeST>, Enc_8dbe85 { let Inst{7-0} = 0b00000000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b10101101101; @@ -20682,7 +20768,7 @@ def S2_storerbnew_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Nt8), "memb($Rs32) = $Nt8.new", -tc_f7dd9c9f, TypeMAPPING> { +tc_be9602ff, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 1; @@ -20691,7 +20777,7 @@ def S2_storerbnewgp : HInst< (outs), (ins u32_0Imm:$Ii, IntRegs:$Nt8), "memb(gp+#$Ii) = $Nt8.new", -tc_ff9ee76e, TypeV2LDST>, Enc_ad1831, AddrModeRel { +tc_5bf126a6, TypeV2LDST>, Enc_ad1831, AddrModeRel { let Inst{12-11} = 0b00; let Inst{24-21} = 0b0101; let Inst{31-27} = 0b01001; @@ -20713,7 +20799,7 @@ def S2_storerd_io : HInst< (outs), (ins IntRegs:$Rs32, s29_3Imm:$Ii, DoubleRegs:$Rtt32), "memd($Rs32+#$Ii) = $Rtt32", -tc_05b6c987, TypeST>, Enc_ce6828, AddrModeRel, PostInc_BaseImm { +tc_30b9bb4a, TypeST>, Enc_ce6828, AddrModeRel, PostInc_BaseImm { let Inst{24-21} = 0b1110; let Inst{31-27} = 0b10100; let addrMode = BaseImmOffset; @@ -20733,9 +20819,10 @@ def S2_storerd_pbr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32), "memd($Rx32++$Mu2:brev) = $Rtt32", -tc_f86c328a, TypeST>, Enc_928ca1 { +tc_da97ee82, TypeST>, Enc_928ca1 { let Inst{7-0} = 0b00000000; let Inst{31-21} = 0b10101111110; +let addrMode = PostInc; let accessSize = DoubleWordAccess; let mayStore = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -20744,7 +20831,7 @@ def S2_storerd_pci : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2, DoubleRegs:$Rtt32), "memd($Rx32++#$Ii:circ($Mu2)) = $Rtt32", -tc_9fdb5406, TypeST>, Enc_395cc4 { +tc_e86aa961, TypeST>, Enc_395cc4 { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{31-21} = 0b10101001110; @@ -20758,7 +20845,7 @@ def S2_storerd_pcr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32), "memd($Rx32++I:circ($Mu2)) = $Rtt32", -tc_f86c328a, TypeST>, Enc_928ca1 { +tc_da97ee82, TypeST>, Enc_928ca1 { let Inst{7-0} = 0b00000010; let Inst{31-21} = 0b10101001110; let addrMode = PostInc; @@ -20771,7 +20858,7 @@ def S2_storerd_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32), "memd($Rx32++#$Ii) = $Rtt32", -tc_f86c328a, TypeST>, Enc_85bf58, AddrModeRel, PostInc_BaseImm { +tc_da97ee82, TypeST>, Enc_85bf58, AddrModeRel, PostInc_BaseImm { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; @@ -20788,7 +20875,7 @@ def S2_storerd_pr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32), "memd($Rx32++$Mu2) = $Rtt32", -tc_f86c328a, TypeST>, Enc_928ca1 { +tc_da97ee82, TypeST>, Enc_928ca1 { let Inst{7-0} = 0b00000000; let Inst{31-21} = 0b10101101110; let addrMode = PostInc; @@ -20800,7 +20887,7 @@ def S2_storerd_zomap : HInst< (outs), (ins IntRegs:$Rs32, DoubleRegs:$Rtt32), "memd($Rs32) = $Rtt32", -tc_05b6c987, TypeMAPPING> { +tc_30b9bb4a, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -20808,7 +20895,7 @@ def S2_storerdgp : HInst< (outs), (ins u29_3Imm:$Ii, DoubleRegs:$Rtt32), "memd(gp+#$Ii) = $Rtt32", -tc_a788683e, TypeV2LDST>, Enc_5c124a, AddrModeRel { +tc_0371abea, TypeV2LDST>, Enc_5c124a, AddrModeRel { let Inst{24-21} = 0b0110; let Inst{31-27} = 0b01001; let accessSize = DoubleWordAccess; @@ -20825,7 +20912,7 @@ def S2_storerf_io : HInst< (outs), (ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32), "memh($Rs32+#$Ii) = $Rt32.h", -tc_05b6c987, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm { +tc_30b9bb4a, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm { let Inst{24-21} = 0b1011; let Inst{31-27} = 0b10100; let addrMode = BaseImmOffset; @@ -20845,9 +20932,10 @@ def S2_storerf_pbr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memh($Rx32++$Mu2:brev) = $Rt32.h", -tc_f86c328a, TypeST>, Enc_d5c73f { +tc_da97ee82, TypeST>, Enc_d5c73f { let Inst{7-0} = 0b00000000; let Inst{31-21} = 0b10101111011; +let addrMode = PostInc; let accessSize = HalfWordAccess; let mayStore = 1; let Constraints = "$Rx32 = $Rx32in"; @@ -20856,7 +20944,7 @@ def S2_storerf_pci : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32), "memh($Rx32++#$Ii:circ($Mu2)) = $Rt32.h", -tc_9fdb5406, TypeST>, Enc_935d9b { +tc_e86aa961, TypeST>, Enc_935d9b { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{31-21} = 0b10101001011; @@ -20870,7 +20958,7 @@ def S2_storerf_pcr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memh($Rx32++I:circ($Mu2)) = $Rt32.h", -tc_f86c328a, TypeST>, Enc_d5c73f { +tc_da97ee82, TypeST>, Enc_d5c73f { let Inst{7-0} = 0b00000010; let Inst{31-21} = 0b10101001011; let addrMode = PostInc; @@ -20883,7 +20971,7 @@ def S2_storerf_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32), "memh($Rx32++#$Ii) = $Rt32.h", -tc_f86c328a, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm { +tc_da97ee82, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; @@ -20900,7 +20988,7 @@ def S2_storerf_pr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memh($Rx32++$Mu2) = $Rt32.h", -tc_f86c328a, TypeST>, Enc_d5c73f { +tc_da97ee82, TypeST>, Enc_d5c73f { let Inst{7-0} = 0b00000000; let Inst{31-21} = 0b10101101011; let addrMode = PostInc; @@ -20912,7 +21000,7 @@ def S2_storerf_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memh($Rs32) = $Rt32.h", -tc_05b6c987, TypeMAPPING> { +tc_30b9bb4a, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -20920,7 +21008,7 @@ def S2_storerfgp : HInst< (outs), (ins u31_1Imm:$Ii, IntRegs:$Rt32), "memh(gp+#$Ii) = $Rt32.h", -tc_a788683e, TypeV2LDST>, Enc_fda92c, AddrModeRel { +tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel { let Inst{24-21} = 0b0011; let Inst{31-27} = 0b01001; let accessSize = HalfWordAccess; @@ -20937,7 +21025,7 @@ def S2_storerh_io : HInst< (outs), (ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32), "memh($Rs32+#$Ii) = $Rt32", -tc_05b6c987, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm { +tc_30b9bb4a, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm { let Inst{24-21} = 0b1010; let Inst{31-27} = 0b10100; let addrMode = BaseImmOffset; @@ -20958,9 +21046,10 @@ def S2_storerh_pbr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memh($Rx32++$Mu2:brev) = $Rt32", -tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel { +tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel { let Inst{7-0} = 0b00000000; let Inst{31-21} = 0b10101111010; +let addrMode = PostInc; let accessSize = HalfWordAccess; let mayStore = 1; let BaseOpcode = "S2_storerh_pbr"; @@ -20971,7 +21060,7 @@ def S2_storerh_pci : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32), "memh($Rx32++#$Ii:circ($Mu2)) = $Rt32", -tc_9fdb5406, TypeST>, Enc_935d9b, AddrModeRel { +tc_e86aa961, TypeST>, Enc_935d9b, AddrModeRel { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{31-21} = 0b10101001010; @@ -20987,7 +21076,7 @@ def S2_storerh_pcr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memh($Rx32++I:circ($Mu2)) = $Rt32", -tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel { +tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel { let Inst{7-0} = 0b00000010; let Inst{31-21} = 0b10101001010; let addrMode = PostInc; @@ -21002,7 +21091,7 @@ def S2_storerh_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32), "memh($Rx32++#$Ii) = $Rt32", -tc_f86c328a, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm { +tc_da97ee82, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; @@ -21020,7 +21109,7 @@ def S2_storerh_pr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memh($Rx32++$Mu2) = $Rt32", -tc_f86c328a, TypeST>, Enc_d5c73f { +tc_da97ee82, TypeST>, Enc_d5c73f { let Inst{7-0} = 0b00000000; let Inst{31-21} = 0b10101101010; let addrMode = PostInc; @@ -21033,7 +21122,7 @@ def S2_storerh_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memh($Rs32) = $Rt32", -tc_05b6c987, TypeMAPPING> { +tc_30b9bb4a, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -21041,7 +21130,7 @@ def S2_storerhgp : HInst< (outs), (ins u31_1Imm:$Ii, IntRegs:$Rt32), "memh(gp+#$Ii) = $Rt32", -tc_a788683e, TypeV2LDST>, Enc_fda92c, AddrModeRel { +tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel { let Inst{24-21} = 0b0010; let Inst{31-27} = 0b01001; let accessSize = HalfWordAccess; @@ -21059,7 +21148,7 @@ def S2_storerhnew_io : HInst< (outs), (ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Nt8), "memh($Rs32+#$Ii) = $Nt8.new", -tc_f7dd9c9f, TypeST>, Enc_0d8870, AddrModeRel { +tc_be9602ff, TypeST>, Enc_0d8870, AddrModeRel { let Inst{12-11} = 0b01; let Inst{24-21} = 0b1101; let Inst{31-27} = 0b10100; @@ -21084,10 +21173,11 @@ def S2_storerhnew_pbr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8), "memh($Rx32++$Mu2:brev) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel { +tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel { let Inst{7-0} = 0b00000000; let Inst{12-11} = 0b01; let Inst{31-21} = 0b10101111101; +let addrMode = PostInc; let accessSize = HalfWordAccess; let isNVStore = 1; let isNewValue = 1; @@ -21101,7 +21191,7 @@ def S2_storerhnew_pci : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8), "memh($Rx32++#$Ii:circ($Mu2)) = $Nt8.new", -tc_9d5941c7, TypeST>, Enc_91b9fe, AddrModeRel { +tc_d5c0729a, TypeST>, Enc_91b9fe, AddrModeRel { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{12-11} = 0b01; @@ -21121,7 +21211,7 @@ def S2_storerhnew_pcr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8), "memh($Rx32++I:circ($Mu2)) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel { +tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel { let Inst{7-0} = 0b00000010; let Inst{12-11} = 0b01; let Inst{31-21} = 0b10101001101; @@ -21140,7 +21230,7 @@ def S2_storerhnew_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8), "memh($Rx32++#$Ii) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_e26546, AddrModeRel { +tc_c79a189f, TypeST>, Enc_e26546, AddrModeRel { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{13-11} = 0b001; @@ -21161,7 +21251,7 @@ def S2_storerhnew_pr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8), "memh($Rx32++$Mu2) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_8dbe85 { +tc_c79a189f, TypeST>, Enc_8dbe85 { let Inst{7-0} = 0b00000000; let Inst{12-11} = 0b01; let Inst{31-21} = 0b10101101101; @@ -21178,7 +21268,7 @@ def S2_storerhnew_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Nt8), "memh($Rs32) = $Nt8.new", -tc_f7dd9c9f, TypeMAPPING> { +tc_be9602ff, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 1; @@ -21187,7 +21277,7 @@ def S2_storerhnewgp : HInst< (outs), (ins u31_1Imm:$Ii, IntRegs:$Nt8), "memh(gp+#$Ii) = $Nt8.new", -tc_ff9ee76e, TypeV2LDST>, Enc_bc03e5, AddrModeRel { +tc_5bf126a6, TypeV2LDST>, Enc_bc03e5, AddrModeRel { let Inst{12-11} = 0b01; let Inst{24-21} = 0b0101; let Inst{31-27} = 0b01001; @@ -21209,7 +21299,7 @@ def S2_storeri_io : HInst< (outs), (ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Rt32), "memw($Rs32+#$Ii) = $Rt32", -tc_05b6c987, TypeST>, Enc_143445, AddrModeRel, PostInc_BaseImm { +tc_30b9bb4a, TypeST>, Enc_143445, AddrModeRel, PostInc_BaseImm { let Inst{24-21} = 0b1100; let Inst{31-27} = 0b10100; let addrMode = BaseImmOffset; @@ -21230,9 +21320,10 @@ def S2_storeri_pbr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memw($Rx32++$Mu2:brev) = $Rt32", -tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel { +tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel { let Inst{7-0} = 0b00000000; let Inst{31-21} = 0b10101111100; +let addrMode = PostInc; let accessSize = WordAccess; let mayStore = 1; let BaseOpcode = "S2_storeri_pbr"; @@ -21243,7 +21334,7 @@ def S2_storeri_pci : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32), "memw($Rx32++#$Ii:circ($Mu2)) = $Rt32", -tc_9fdb5406, TypeST>, Enc_79b8c8, AddrModeRel { +tc_e86aa961, TypeST>, Enc_79b8c8, AddrModeRel { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{31-21} = 0b10101001100; @@ -21259,7 +21350,7 @@ def S2_storeri_pcr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memw($Rx32++I:circ($Mu2)) = $Rt32", -tc_f86c328a, TypeST>, Enc_d5c73f, AddrModeRel { +tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel { let Inst{7-0} = 0b00000010; let Inst{31-21} = 0b10101001100; let addrMode = PostInc; @@ -21274,7 +21365,7 @@ def S2_storeri_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32), "memw($Rx32++#$Ii) = $Rt32", -tc_f86c328a, TypeST>, Enc_db40cd, AddrModeRel, PostInc_BaseImm { +tc_da97ee82, TypeST>, Enc_db40cd, AddrModeRel, PostInc_BaseImm { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; @@ -21292,7 +21383,7 @@ def S2_storeri_pr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32), "memw($Rx32++$Mu2) = $Rt32", -tc_f86c328a, TypeST>, Enc_d5c73f { +tc_da97ee82, TypeST>, Enc_d5c73f { let Inst{7-0} = 0b00000000; let Inst{31-21} = 0b10101101100; let addrMode = PostInc; @@ -21305,7 +21396,7 @@ def S2_storeri_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memw($Rs32) = $Rt32", -tc_05b6c987, TypeMAPPING> { +tc_30b9bb4a, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -21313,7 +21404,7 @@ def S2_storerigp : HInst< (outs), (ins u30_2Imm:$Ii, IntRegs:$Rt32), "memw(gp+#$Ii) = $Rt32", -tc_a788683e, TypeV2LDST>, Enc_541f26, AddrModeRel { +tc_0371abea, TypeV2LDST>, Enc_541f26, AddrModeRel { let Inst{24-21} = 0b0100; let Inst{31-27} = 0b01001; let accessSize = WordAccess; @@ -21331,7 +21422,7 @@ def S2_storerinew_io : HInst< (outs), (ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Nt8), "memw($Rs32+#$Ii) = $Nt8.new", -tc_f7dd9c9f, TypeST>, Enc_690862, AddrModeRel { +tc_be9602ff, TypeST>, Enc_690862, AddrModeRel { let Inst{12-11} = 0b10; let Inst{24-21} = 0b1101; let Inst{31-27} = 0b10100; @@ -21356,10 +21447,11 @@ def S2_storerinew_pbr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8), "memw($Rx32++$Mu2:brev) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel { +tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel { let Inst{7-0} = 0b00000000; let Inst{12-11} = 0b10; let Inst{31-21} = 0b10101111101; +let addrMode = PostInc; let accessSize = WordAccess; let isNVStore = 1; let isNewValue = 1; @@ -21373,7 +21465,7 @@ def S2_storerinew_pci : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8), "memw($Rx32++#$Ii:circ($Mu2)) = $Nt8.new", -tc_9d5941c7, TypeST>, Enc_3f97c8, AddrModeRel { +tc_d5c0729a, TypeST>, Enc_3f97c8, AddrModeRel { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{12-11} = 0b10; @@ -21393,7 +21485,7 @@ def S2_storerinew_pcr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8), "memw($Rx32++I:circ($Mu2)) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_8dbe85, AddrModeRel { +tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel { let Inst{7-0} = 0b00000010; let Inst{12-11} = 0b10; let Inst{31-21} = 0b10101001101; @@ -21412,7 +21504,7 @@ def S2_storerinew_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8), "memw($Rx32++#$Ii) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_223005, AddrModeRel { +tc_c79a189f, TypeST>, Enc_223005, AddrModeRel { let Inst{2-0} = 0b000; let Inst{7-7} = 0b0; let Inst{13-11} = 0b010; @@ -21432,7 +21524,7 @@ def S2_storerinew_pr : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8), "memw($Rx32++$Mu2) = $Nt8.new", -tc_e7d02c66, TypeST>, Enc_8dbe85 { +tc_c79a189f, TypeST>, Enc_8dbe85 { let Inst{7-0} = 0b00000000; let Inst{12-11} = 0b10; let Inst{31-21} = 0b10101101101; @@ -21449,7 +21541,7 @@ def S2_storerinew_zomap : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Nt8), "memw($Rs32) = $Nt8.new", -tc_f7dd9c9f, TypeMAPPING> { +tc_be9602ff, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 1; @@ -21458,7 +21550,7 @@ def S2_storerinewgp : HInst< (outs), (ins u30_2Imm:$Ii, IntRegs:$Nt8), "memw(gp+#$Ii) = $Nt8.new", -tc_ff9ee76e, TypeV2LDST>, Enc_78cbf0, AddrModeRel { +tc_5bf126a6, TypeV2LDST>, Enc_78cbf0, AddrModeRel { let Inst{12-11} = 0b10; let Inst{24-21} = 0b0101; let Inst{31-27} = 0b01001; @@ -21480,7 +21572,7 @@ def S2_storew_locked : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "memw_locked($Rs32,$Pd4) = $Rt32", -tc_1372bca1, TypeST>, Enc_c2b48e { +tc_5abb5e3f, TypeST>, Enc_c2b48e { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10100000101; @@ -21493,7 +21585,7 @@ def S2_svsathb : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = vsathb($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_5e2823 { +tc_0ae0825c, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10001100100; let hasNewValue = 1; @@ -21504,7 +21596,7 @@ def S2_svsathub : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = vsathub($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_5e2823 { +tc_0ae0825c, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000010; let Inst{31-21} = 0b10001100100; let hasNewValue = 1; @@ -21515,7 +21607,7 @@ def S2_tableidxb : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II), "$Rx32 = tableidxb($Rs32,#$Ii,#$II):raw", -tc_87735c3b, TypeS_2op>, Enc_cd82bc { +tc_bfec0f01, TypeS_2op>, Enc_cd82bc { let Inst{31-22} = 0b1000011100; let hasNewValue = 1; let opNewValue = 0; @@ -21526,7 +21618,7 @@ def S2_tableidxb_goodsyntax : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II), "$Rx32 = tableidxb($Rs32,#$Ii,#$II)", -tc_87735c3b, TypeS_2op> { +tc_bfec0f01, TypeS_2op> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -21537,7 +21629,7 @@ def S2_tableidxd : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II), "$Rx32 = tableidxd($Rs32,#$Ii,#$II):raw", -tc_87735c3b, TypeS_2op>, Enc_cd82bc { +tc_bfec0f01, TypeS_2op>, Enc_cd82bc { let Inst{31-22} = 0b1000011111; let hasNewValue = 1; let opNewValue = 0; @@ -21548,7 +21640,7 @@ def S2_tableidxd_goodsyntax : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II), "$Rx32 = tableidxd($Rs32,#$Ii,#$II)", -tc_87735c3b, TypeS_2op> { +tc_bfec0f01, TypeS_2op> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -21558,7 +21650,7 @@ def S2_tableidxh : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II), "$Rx32 = tableidxh($Rs32,#$Ii,#$II):raw", -tc_87735c3b, TypeS_2op>, Enc_cd82bc { +tc_bfec0f01, TypeS_2op>, Enc_cd82bc { let Inst{31-22} = 0b1000011101; let hasNewValue = 1; let opNewValue = 0; @@ -21569,7 +21661,7 @@ def S2_tableidxh_goodsyntax : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II), "$Rx32 = tableidxh($Rs32,#$Ii,#$II)", -tc_87735c3b, TypeS_2op> { +tc_bfec0f01, TypeS_2op> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -21579,7 +21671,7 @@ def S2_tableidxw : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II), "$Rx32 = tableidxw($Rs32,#$Ii,#$II):raw", -tc_87735c3b, TypeS_2op>, Enc_cd82bc { +tc_bfec0f01, TypeS_2op>, Enc_cd82bc { let Inst{31-22} = 0b1000011110; let hasNewValue = 1; let opNewValue = 0; @@ -21590,7 +21682,7 @@ def S2_tableidxw_goodsyntax : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II), "$Rx32 = tableidxw($Rs32,#$Ii,#$II)", -tc_87735c3b, TypeS_2op> { +tc_bfec0f01, TypeS_2op> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -21600,7 +21692,7 @@ def S2_togglebit_i : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = togglebit($Rs32,#$Ii)", -tc_540fdfbc, TypeS_2op>, Enc_a05677 { +tc_946df596, TypeS_2op>, Enc_a05677 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100110; @@ -21611,7 +21703,7 @@ def S2_togglebit_r : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = togglebit($Rs32,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_5ab2be { +tc_946df596, TypeS_3op>, Enc_5ab2be { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110100; @@ -21622,7 +21714,7 @@ def S2_tstbit_i : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Pd4 = tstbit($Rs32,#$Ii)", -tc_7a830544, TypeS_2op>, Enc_83ee64 { +tc_643b4717, TypeS_2op>, Enc_83ee64 { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10000101000; @@ -21631,7 +21723,7 @@ def S2_tstbit_r : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = tstbit($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111000; @@ -21640,7 +21732,7 @@ def S2_valignib : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, u3_0Imm:$Ii), "$Rdd32 = valignb($Rtt32,$Rss32,#$Ii)", -tc_f8eeed7a, TypeS_3op>, Enc_729ff7 { +tc_b4b5c03a, TypeS_3op>, Enc_729ff7 { let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000000000; } @@ -21648,7 +21740,7 @@ def S2_valignrb : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, PredRegs:$Pu4), "$Rdd32 = valignb($Rtt32,$Rss32,$Pu4)", -tc_f8eeed7a, TypeS_3op>, Enc_8c6530 { +tc_b4b5c03a, TypeS_3op>, Enc_8c6530 { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000010000; @@ -21657,7 +21749,7 @@ def S2_vcnegh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = vcnegh($Rss32,$Rt32)", -tc_b44c6e2a, TypeS_3op>, Enc_927852 { +tc_779080bf, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011110; @@ -21668,7 +21760,7 @@ def S2_vcrotate : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rdd32 = vcrotate($Rss32,$Rt32)", -tc_2b6f77c6, TypeS_3op>, Enc_927852 { +tc_002cb246, TypeS_3op>, Enc_927852 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000011110; @@ -21679,7 +21771,7 @@ def S2_vrcnegh : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32), "$Rxx32 += vrcnegh($Rss32,$Rt32)", -tc_e913dc32, TypeS_3op>, Enc_1aa186 { +tc_d773585a, TypeS_3op>, Enc_1aa186 { let Inst{7-5} = 0b111; let Inst{13-13} = 0b1; let Inst{31-21} = 0b11001011001; @@ -21690,7 +21782,7 @@ def S2_vrndpackwh : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = vrndwh($Rss32)", -tc_d088982c, TypeS_2op>, Enc_90cd8b { +tc_14b5c689, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10001000100; let hasNewValue = 1; @@ -21701,7 +21793,7 @@ def S2_vrndpackwhs : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = vrndwh($Rss32):sat", -tc_c2f7d806, TypeS_2op>, Enc_90cd8b { +tc_cf8126ae, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10001000100; let hasNewValue = 1; @@ -21713,7 +21805,7 @@ def S2_vsathb : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = vsathb($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_90cd8b { +tc_0ae0825c, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10001000000; let hasNewValue = 1; @@ -21724,7 +21816,7 @@ def S2_vsathb_nopack : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = vsathb($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_b9c5fb { +tc_0ae0825c, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000111; let Inst{31-21} = 0b10000000000; let Defs = [USR_OVF]; @@ -21733,7 +21825,7 @@ def S2_vsathub : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = vsathub($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_90cd8b { +tc_0ae0825c, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10001000000; let hasNewValue = 1; @@ -21744,7 +21836,7 @@ def S2_vsathub_nopack : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = vsathub($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_b9c5fb { +tc_0ae0825c, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10000000000; let Defs = [USR_OVF]; @@ -21753,7 +21845,7 @@ def S2_vsatwh : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = vsatwh($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_90cd8b { +tc_0ae0825c, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000010; let Inst{31-21} = 0b10001000000; let hasNewValue = 1; @@ -21764,7 +21856,7 @@ def S2_vsatwh_nopack : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = vsatwh($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_b9c5fb { +tc_0ae0825c, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10000000000; let Defs = [USR_OVF]; @@ -21773,7 +21865,7 @@ def S2_vsatwuh : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = vsatwuh($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_90cd8b { +tc_0ae0825c, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10001000000; let hasNewValue = 1; @@ -21784,7 +21876,7 @@ def S2_vsatwuh_nopack : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32), "$Rdd32 = vsatwuh($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_b9c5fb { +tc_0ae0825c, TypeS_2op>, Enc_b9c5fb { let Inst{13-5} = 0b000000101; let Inst{31-21} = 0b10000000000; let Defs = [USR_OVF]; @@ -21793,7 +21885,7 @@ def S2_vsplatrb : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = vsplatb($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_5e2823 { +tc_0ae0825c, TypeS_2op>, Enc_5e2823 { let Inst{13-5} = 0b000000111; let Inst{31-21} = 0b10001100010; let hasNewValue = 1; @@ -21805,7 +21897,7 @@ def S2_vsplatrh : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = vsplath($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_3a3d62 { +tc_0ae0825c, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000010; let Inst{31-21} = 0b10000100010; let isReMaterializable = 1; @@ -21815,7 +21907,7 @@ def S2_vspliceib : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, u3_0Imm:$Ii), "$Rdd32 = vspliceb($Rss32,$Rtt32,#$Ii)", -tc_f8eeed7a, TypeS_3op>, Enc_d50cd3 { +tc_b4b5c03a, TypeS_3op>, Enc_d50cd3 { let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000000100; } @@ -21823,7 +21915,7 @@ def S2_vsplicerb : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Pu4), "$Rdd32 = vspliceb($Rss32,$Rtt32,$Pu4)", -tc_f8eeed7a, TypeS_3op>, Enc_dbd70c { +tc_b4b5c03a, TypeS_3op>, Enc_dbd70c { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000010100; @@ -21832,7 +21924,7 @@ def S2_vsxtbh : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = vsxtbh($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_3a3d62 { +tc_0ae0825c, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10000100000; let isReMaterializable = 1; @@ -21842,7 +21934,7 @@ def S2_vsxthw : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = vsxthw($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_3a3d62 { +tc_0ae0825c, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10000100000; let isReMaterializable = 1; @@ -21852,7 +21944,7 @@ def S2_vtrunehb : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = vtrunehb($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_90cd8b { +tc_0ae0825c, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000010; let Inst{31-21} = 0b10001000100; let hasNewValue = 1; @@ -21862,7 +21954,7 @@ def S2_vtrunewh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vtrunewh($Rss32,$Rtt32)", -tc_540fdfbc, TypeS_3op>, Enc_a56825 { +tc_946df596, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001100; @@ -21871,7 +21963,7 @@ def S2_vtrunohb : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = vtrunohb($Rss32)", -tc_cde8b071, TypeS_2op>, Enc_90cd8b { +tc_0ae0825c, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10001000100; let hasNewValue = 1; @@ -21881,7 +21973,7 @@ def S2_vtrunowh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vtrunowh($Rss32,$Rtt32)", -tc_540fdfbc, TypeS_3op>, Enc_a56825 { +tc_946df596, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001100; @@ -21890,7 +21982,7 @@ def S2_vzxtbh : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = vzxtbh($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_3a3d62 { +tc_0ae0825c, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000010; let Inst{31-21} = 0b10000100000; let isReMaterializable = 1; @@ -21900,7 +21992,7 @@ def S2_vzxthw : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = vzxthw($Rs32)", -tc_cde8b071, TypeS_2op>, Enc_3a3d62 { +tc_0ae0825c, TypeS_2op>, Enc_3a3d62 { let Inst{13-5} = 0b000000110; let Inst{31-21} = 0b10000100000; let isReMaterializable = 1; @@ -21910,7 +22002,7 @@ def S4_addaddi : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Ru32, s32_0Imm:$Ii), "$Rd32 = add($Rs32,add($Ru32,#$Ii))", -tc_c74f796f, TypeALU64>, Enc_8b8d61 { +tc_f675fee8, TypeALU64>, Enc_8b8d61 { let Inst{31-23} = 0b110110110; let hasNewValue = 1; let opNewValue = 0; @@ -21925,7 +22017,7 @@ def S4_addi_asl_ri : HInst< (outs IntRegs:$Rx32), (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II), "$Rx32 = add(#$Ii,asl($Rx32in,#$II))", -tc_c74f796f, TypeALU64>, Enc_c31910 { +tc_f675fee8, TypeALU64>, Enc_c31910 { let Inst{2-0} = 0b100; let Inst{4-4} = 0b0; let Inst{31-24} = 0b11011110; @@ -21943,7 +22035,7 @@ def S4_addi_lsr_ri : HInst< (outs IntRegs:$Rx32), (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II), "$Rx32 = add(#$Ii,lsr($Rx32in,#$II))", -tc_c74f796f, TypeALU64>, Enc_c31910 { +tc_f675fee8, TypeALU64>, Enc_c31910 { let Inst{2-0} = 0b100; let Inst{4-4} = 0b1; let Inst{31-24} = 0b11011110; @@ -21961,7 +22053,7 @@ def S4_andi_asl_ri : HInst< (outs IntRegs:$Rx32), (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II), "$Rx32 = and(#$Ii,asl($Rx32in,#$II))", -tc_84df2cd3, TypeALU64>, Enc_c31910 { +tc_f429765c, TypeALU64>, Enc_c31910 { let Inst{2-0} = 0b000; let Inst{4-4} = 0b0; let Inst{31-24} = 0b11011110; @@ -21979,7 +22071,7 @@ def S4_andi_lsr_ri : HInst< (outs IntRegs:$Rx32), (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II), "$Rx32 = and(#$Ii,lsr($Rx32in,#$II))", -tc_84df2cd3, TypeALU64>, Enc_c31910 { +tc_f429765c, TypeALU64>, Enc_c31910 { let Inst{2-0} = 0b000; let Inst{4-4} = 0b1; let Inst{31-24} = 0b11011110; @@ -21997,7 +22089,7 @@ def S4_clbaddi : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s6_0Imm:$Ii), "$Rd32 = add(clb($Rs32),#$Ii)", -tc_2b6f77c6, TypeS_2op>, Enc_9fae8a { +tc_002cb246, TypeS_2op>, Enc_9fae8a { let Inst{7-5} = 0b000; let Inst{31-21} = 0b10001100001; let hasNewValue = 1; @@ -22008,7 +22100,7 @@ def S4_clbpaddi : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, s6_0Imm:$Ii), "$Rd32 = add(clb($Rss32),#$Ii)", -tc_2b6f77c6, TypeS_2op>, Enc_a1640c { +tc_002cb246, TypeS_2op>, Enc_a1640c { let Inst{7-5} = 0b010; let Inst{31-21} = 0b10001000011; let hasNewValue = 1; @@ -22019,7 +22111,7 @@ def S4_clbpnorm : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = normamt($Rss32)", -tc_d088982c, TypeS_2op>, Enc_90cd8b { +tc_14b5c689, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000000; let Inst{31-21} = 0b10001000011; let hasNewValue = 1; @@ -22030,7 +22122,7 @@ def S4_extract : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II), "$Rd32 = extract($Rs32,#$Ii,#$II)", -tc_c74f796f, TypeS_2op>, Enc_b388cf { +tc_f675fee8, TypeS_2op>, Enc_b388cf { let Inst{13-13} = 0b0; let Inst{31-23} = 0b100011011; let hasNewValue = 1; @@ -22041,7 +22133,7 @@ def S4_extract_rp : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, DoubleRegs:$Rtt32), "$Rd32 = extract($Rs32,$Rtt32)", -tc_2b6f77c6, TypeS_3op>, Enc_e07374 { +tc_002cb246, TypeS_3op>, Enc_e07374 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11001001000; @@ -22053,7 +22145,7 @@ def S4_extractp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II), "$Rdd32 = extract($Rss32,#$Ii,#$II)", -tc_c74f796f, TypeS_2op>, Enc_b84c4c { +tc_f675fee8, TypeS_2op>, Enc_b84c4c { let Inst{31-24} = 0b10001010; let prefersSlot3 = 1; } @@ -22061,7 +22153,7 @@ def S4_extractp_rp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = extract($Rss32,$Rtt32)", -tc_2b6f77c6, TypeS_3op>, Enc_a56825 { +tc_002cb246, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001110; @@ -22071,7 +22163,7 @@ def S4_lsli : HInst< (outs IntRegs:$Rd32), (ins s6_0Imm:$Ii, IntRegs:$Rt32), "$Rd32 = lsl(#$Ii,$Rt32)", -tc_540fdfbc, TypeS_3op>, Enc_fef969 { +tc_946df596, TypeS_3op>, Enc_fef969 { let Inst{7-6} = 0b11; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000110100; @@ -22082,7 +22174,7 @@ def S4_ntstbit_i : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Pd4 = !tstbit($Rs32,#$Ii)", -tc_7a830544, TypeS_2op>, Enc_83ee64 { +tc_643b4717, TypeS_2op>, Enc_83ee64 { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10000101001; @@ -22091,7 +22183,7 @@ def S4_ntstbit_r : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Pd4 = !tstbit($Rs32,$Rt32)", -tc_1e856f58, TypeS_3op>, Enc_c2b48e { +tc_85d5d03f, TypeS_3op>, Enc_c2b48e { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000111001; @@ -22100,7 +22192,7 @@ def S4_or_andi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii), "$Rx32 |= and($Rs32,#$Ii)", -tc_84df2cd3, TypeALU64>, Enc_b0e9d8 { +tc_f429765c, TypeALU64>, Enc_b0e9d8 { let Inst{31-22} = 0b1101101000; let hasNewValue = 1; let opNewValue = 0; @@ -22117,7 +22209,7 @@ def S4_or_andix : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Ru32, IntRegs:$Rx32in, s32_0Imm:$Ii), "$Rx32 = or($Ru32,and($Rx32in,#$Ii))", -tc_84df2cd3, TypeALU64>, Enc_b4e6cf { +tc_f429765c, TypeALU64>, Enc_b4e6cf { let Inst{31-22} = 0b1101101001; let hasNewValue = 1; let opNewValue = 0; @@ -22133,7 +22225,7 @@ def S4_or_ori : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii), "$Rx32 |= or($Rs32,#$Ii)", -tc_84df2cd3, TypeALU64>, Enc_b0e9d8 { +tc_f429765c, TypeALU64>, Enc_b0e9d8 { let Inst{31-22} = 0b1101101010; let hasNewValue = 1; let opNewValue = 0; @@ -22150,7 +22242,7 @@ def S4_ori_asl_ri : HInst< (outs IntRegs:$Rx32), (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II), "$Rx32 = or(#$Ii,asl($Rx32in,#$II))", -tc_84df2cd3, TypeALU64>, Enc_c31910 { +tc_f429765c, TypeALU64>, Enc_c31910 { let Inst{2-0} = 0b010; let Inst{4-4} = 0b0; let Inst{31-24} = 0b11011110; @@ -22168,7 +22260,7 @@ def S4_ori_lsr_ri : HInst< (outs IntRegs:$Rx32), (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II), "$Rx32 = or(#$Ii,lsr($Rx32in,#$II))", -tc_84df2cd3, TypeALU64>, Enc_c31910 { +tc_f429765c, TypeALU64>, Enc_c31910 { let Inst{2-0} = 0b010; let Inst{4-4} = 0b1; let Inst{31-24} = 0b11011110; @@ -22186,7 +22278,7 @@ def S4_parity : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = parity($Rs32,$Rt32)", -tc_2b6f77c6, TypeALU64>, Enc_5ab2be { +tc_002cb246, TypeALU64>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101111; @@ -22198,7 +22290,7 @@ def S4_pstorerbf_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memb(#$Ii) = $Rt32", -tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; @@ -22223,7 +22315,7 @@ def S4_pstorerbf_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel { +tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110101000; let isPredicated = 1; let isPredicatedFalse = 1; @@ -22239,7 +22331,7 @@ def S4_pstorerbfnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memb(#$Ii) = $Rt32", -tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -22265,7 +22357,7 @@ def S4_pstorerbfnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memb($Rs32+#$Ii) = $Rt32", -tc_f86c328a, TypeV2LDST>, Enc_da8d43, AddrModeRel { +tc_da97ee82, TypeV2LDST>, Enc_da8d43, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000110000; let isPredicated = 1; @@ -22288,7 +22380,7 @@ def S4_pstorerbfnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel { +tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110111000; let isPredicated = 1; let isPredicatedFalse = 1; @@ -22305,7 +22397,7 @@ def S4_pstorerbfnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pv4.new) memb($Rs32) = $Rt32", -tc_f86c328a, TypeMAPPING> { +tc_da97ee82, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -22313,7 +22405,7 @@ def S4_pstorerbnewf_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memb(#$Ii) = $Nt8.new", -tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel { +tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-11} = 0b000; @@ -22341,7 +22433,7 @@ def S4_pstorerbnewf_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel { +tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b00; let Inst{31-21} = 0b00110101101; let isPredicated = 1; @@ -22361,7 +22453,7 @@ def S4_pstorerbnewfnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memb(#$Ii) = $Nt8.new", -tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel { +tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-11} = 0b100; @@ -22390,7 +22482,7 @@ def S4_pstorerbnewfnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memb($Rs32+#$Ii) = $Nt8.new", -tc_e7d02c66, TypeV2LDST>, Enc_585242, AddrModeRel { +tc_c79a189f, TypeV2LDST>, Enc_585242, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b00; let Inst{31-21} = 0b01000110101; @@ -22417,7 +22509,7 @@ def S4_pstorerbnewfnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel { +tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b00; let Inst{31-21} = 0b00110111101; let isPredicated = 1; @@ -22438,7 +22530,7 @@ def S4_pstorerbnewfnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if (!$Pv4.new) memb($Rs32) = $Nt8.new", -tc_e7d02c66, TypeMAPPING> { +tc_c79a189f, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -22447,7 +22539,7 @@ def S4_pstorerbnewt_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memb(#$Ii) = $Nt8.new", -tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel { +tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-11} = 0b000; @@ -22474,7 +22566,7 @@ def S4_pstorerbnewt_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel { +tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b00; let Inst{31-21} = 0b00110100101; let isPredicated = 1; @@ -22493,7 +22585,7 @@ def S4_pstorerbnewtnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memb(#$Ii) = $Nt8.new", -tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel { +tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-11} = 0b100; @@ -22521,7 +22613,7 @@ def S4_pstorerbnewtnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memb($Rs32+#$Ii) = $Nt8.new", -tc_e7d02c66, TypeV2LDST>, Enc_585242, AddrModeRel { +tc_c79a189f, TypeV2LDST>, Enc_585242, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b00; let Inst{31-21} = 0b01000010101; @@ -22547,7 +22639,7 @@ def S4_pstorerbnewtnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel { +tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b00; let Inst{31-21} = 0b00110110101; let isPredicated = 1; @@ -22567,7 +22659,7 @@ def S4_pstorerbnewtnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if ($Pv4.new) memb($Rs32) = $Nt8.new", -tc_e7d02c66, TypeMAPPING> { +tc_c79a189f, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -22576,7 +22668,7 @@ def S4_pstorerbt_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memb(#$Ii) = $Rt32", -tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; @@ -22600,7 +22692,7 @@ def S4_pstorerbt_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel { +tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110100000; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -22615,7 +22707,7 @@ def S4_pstorerbtnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memb(#$Ii) = $Rt32", -tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -22640,7 +22732,7 @@ def S4_pstorerbtnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memb($Rs32+#$Ii) = $Rt32", -tc_f86c328a, TypeV2LDST>, Enc_da8d43, AddrModeRel { +tc_da97ee82, TypeV2LDST>, Enc_da8d43, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000010000; let isPredicated = 1; @@ -22662,7 +22754,7 @@ def S4_pstorerbtnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel { +tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110110000; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -22678,7 +22770,7 @@ def S4_pstorerbtnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pv4.new) memb($Rs32) = $Rt32", -tc_f86c328a, TypeMAPPING> { +tc_da97ee82, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -22686,7 +22778,7 @@ def S4_pstorerdf_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32), "if (!$Pv4) memd(#$Ii) = $Rtt32", -tc_238d91d2, TypeST>, Enc_50b5ac, AddrModeRel { +tc_362c6592, TypeST>, Enc_50b5ac, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; @@ -22710,7 +22802,7 @@ def S4_pstorerdf_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32), "if (!$Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32", -tc_5274e61a, TypeST>, Enc_1a9974, AddrModeRel { +tc_3962fa26, TypeST>, Enc_1a9974, AddrModeRel { let Inst{31-21} = 0b00110101110; let isPredicated = 1; let isPredicatedFalse = 1; @@ -22725,7 +22817,7 @@ def S4_pstorerdfnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32), "if (!$Pv4.new) memd(#$Ii) = $Rtt32", -tc_66888ded, TypeST>, Enc_50b5ac, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_50b5ac, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -22750,7 +22842,7 @@ def S4_pstorerdfnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32), "if (!$Pv4.new) memd($Rs32+#$Ii) = $Rtt32", -tc_f86c328a, TypeV2LDST>, Enc_57a33e, AddrModeRel { +tc_da97ee82, TypeV2LDST>, Enc_57a33e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000110110; let isPredicated = 1; @@ -22772,7 +22864,7 @@ def S4_pstorerdfnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32), "if (!$Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32", -tc_3e07fb90, TypeST>, Enc_1a9974, AddrModeRel { +tc_40116ca8, TypeST>, Enc_1a9974, AddrModeRel { let Inst{31-21} = 0b00110111110; let isPredicated = 1; let isPredicatedFalse = 1; @@ -22788,7 +22880,7 @@ def S4_pstorerdfnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32), "if (!$Pv4.new) memd($Rs32) = $Rtt32", -tc_f86c328a, TypeMAPPING> { +tc_da97ee82, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -22796,7 +22888,7 @@ def S4_pstorerdt_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32), "if ($Pv4) memd(#$Ii) = $Rtt32", -tc_238d91d2, TypeST>, Enc_50b5ac, AddrModeRel { +tc_362c6592, TypeST>, Enc_50b5ac, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; @@ -22819,7 +22911,7 @@ def S4_pstorerdt_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32), "if ($Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32", -tc_5274e61a, TypeST>, Enc_1a9974, AddrModeRel { +tc_3962fa26, TypeST>, Enc_1a9974, AddrModeRel { let Inst{31-21} = 0b00110100110; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -22833,7 +22925,7 @@ def S4_pstorerdtnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32), "if ($Pv4.new) memd(#$Ii) = $Rtt32", -tc_66888ded, TypeST>, Enc_50b5ac, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_50b5ac, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -22857,7 +22949,7 @@ def S4_pstorerdtnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32), "if ($Pv4.new) memd($Rs32+#$Ii) = $Rtt32", -tc_f86c328a, TypeV2LDST>, Enc_57a33e, AddrModeRel { +tc_da97ee82, TypeV2LDST>, Enc_57a33e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000010110; let isPredicated = 1; @@ -22878,7 +22970,7 @@ def S4_pstorerdtnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32), "if ($Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32", -tc_3e07fb90, TypeST>, Enc_1a9974, AddrModeRel { +tc_40116ca8, TypeST>, Enc_1a9974, AddrModeRel { let Inst{31-21} = 0b00110110110; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -22893,7 +22985,7 @@ def S4_pstorerdtnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32), "if ($Pv4.new) memd($Rs32) = $Rtt32", -tc_f86c328a, TypeMAPPING> { +tc_da97ee82, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -22901,7 +22993,7 @@ def S4_pstorerff_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memh(#$Ii) = $Rt32.h", -tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; @@ -22925,7 +23017,7 @@ def S4_pstorerff_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h", -tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel { +tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110101011; let isPredicated = 1; let isPredicatedFalse = 1; @@ -22940,7 +23032,7 @@ def S4_pstorerffnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memh(#$Ii) = $Rt32.h", -tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -22965,7 +23057,7 @@ def S4_pstorerffnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32.h", -tc_f86c328a, TypeV2LDST>, Enc_e8c45e, AddrModeRel { +tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000110011; let isPredicated = 1; @@ -22987,7 +23079,7 @@ def S4_pstorerffnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h", -tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel { +tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110111011; let isPredicated = 1; let isPredicatedFalse = 1; @@ -23003,7 +23095,7 @@ def S4_pstorerffnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pv4.new) memh($Rs32) = $Rt32.h", -tc_f86c328a, TypeMAPPING> { +tc_da97ee82, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -23011,7 +23103,7 @@ def S4_pstorerft_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memh(#$Ii) = $Rt32.h", -tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; @@ -23034,7 +23126,7 @@ def S4_pstorerft_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h", -tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel { +tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110100011; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -23048,7 +23140,7 @@ def S4_pstorerftnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memh(#$Ii) = $Rt32.h", -tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -23072,7 +23164,7 @@ def S4_pstorerftnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32.h", -tc_f86c328a, TypeV2LDST>, Enc_e8c45e, AddrModeRel { +tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000010011; let isPredicated = 1; @@ -23093,7 +23185,7 @@ def S4_pstorerftnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h", -tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel { +tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110110011; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -23108,7 +23200,7 @@ def S4_pstorerftnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pv4.new) memh($Rs32) = $Rt32.h", -tc_f86c328a, TypeMAPPING> { +tc_da97ee82, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -23116,7 +23208,7 @@ def S4_pstorerhf_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memh(#$Ii) = $Rt32", -tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; @@ -23141,7 +23233,7 @@ def S4_pstorerhf_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel { +tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110101010; let isPredicated = 1; let isPredicatedFalse = 1; @@ -23157,7 +23249,7 @@ def S4_pstorerhfnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memh(#$Ii) = $Rt32", -tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -23183,7 +23275,7 @@ def S4_pstorerhfnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32", -tc_f86c328a, TypeV2LDST>, Enc_e8c45e, AddrModeRel { +tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000110010; let isPredicated = 1; @@ -23206,7 +23298,7 @@ def S4_pstorerhfnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel { +tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110111010; let isPredicated = 1; let isPredicatedFalse = 1; @@ -23223,7 +23315,7 @@ def S4_pstorerhfnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pv4.new) memh($Rs32) = $Rt32", -tc_f86c328a, TypeMAPPING> { +tc_da97ee82, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -23231,7 +23323,7 @@ def S4_pstorerhnewf_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memh(#$Ii) = $Nt8.new", -tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel { +tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-11} = 0b001; @@ -23259,7 +23351,7 @@ def S4_pstorerhnewf_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel { +tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b01; let Inst{31-21} = 0b00110101101; let isPredicated = 1; @@ -23279,7 +23371,7 @@ def S4_pstorerhnewfnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memh(#$Ii) = $Nt8.new", -tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel { +tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-11} = 0b101; @@ -23308,7 +23400,7 @@ def S4_pstorerhnewfnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memh($Rs32+#$Ii) = $Nt8.new", -tc_e7d02c66, TypeV2LDST>, Enc_f44229, AddrModeRel { +tc_c79a189f, TypeV2LDST>, Enc_f44229, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b01; let Inst{31-21} = 0b01000110101; @@ -23335,7 +23427,7 @@ def S4_pstorerhnewfnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel { +tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b01; let Inst{31-21} = 0b00110111101; let isPredicated = 1; @@ -23356,7 +23448,7 @@ def S4_pstorerhnewfnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if (!$Pv4.new) memh($Rs32) = $Nt8.new", -tc_e7d02c66, TypeMAPPING> { +tc_c79a189f, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -23365,7 +23457,7 @@ def S4_pstorerhnewt_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memh(#$Ii) = $Nt8.new", -tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel { +tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-11} = 0b001; @@ -23392,7 +23484,7 @@ def S4_pstorerhnewt_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel { +tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b01; let Inst{31-21} = 0b00110100101; let isPredicated = 1; @@ -23411,7 +23503,7 @@ def S4_pstorerhnewtnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memh(#$Ii) = $Nt8.new", -tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel { +tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-11} = 0b101; @@ -23439,7 +23531,7 @@ def S4_pstorerhnewtnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memh($Rs32+#$Ii) = $Nt8.new", -tc_e7d02c66, TypeV2LDST>, Enc_f44229, AddrModeRel { +tc_c79a189f, TypeV2LDST>, Enc_f44229, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b01; let Inst{31-21} = 0b01000010101; @@ -23465,7 +23557,7 @@ def S4_pstorerhnewtnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel { +tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b01; let Inst{31-21} = 0b00110110101; let isPredicated = 1; @@ -23485,7 +23577,7 @@ def S4_pstorerhnewtnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if ($Pv4.new) memh($Rs32) = $Nt8.new", -tc_e7d02c66, TypeMAPPING> { +tc_c79a189f, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -23494,7 +23586,7 @@ def S4_pstorerht_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memh(#$Ii) = $Rt32", -tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; @@ -23518,7 +23610,7 @@ def S4_pstorerht_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel { +tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110100010; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -23533,7 +23625,7 @@ def S4_pstorerhtnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memh(#$Ii) = $Rt32", -tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -23558,7 +23650,7 @@ def S4_pstorerhtnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32", -tc_f86c328a, TypeV2LDST>, Enc_e8c45e, AddrModeRel { +tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000010010; let isPredicated = 1; @@ -23580,7 +23672,7 @@ def S4_pstorerhtnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel { +tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110110010; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -23596,7 +23688,7 @@ def S4_pstorerhtnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pv4.new) memh($Rs32) = $Rt32", -tc_f86c328a, TypeMAPPING> { +tc_da97ee82, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -23604,7 +23696,7 @@ def S4_pstorerif_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memw(#$Ii) = $Rt32", -tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; @@ -23629,7 +23721,7 @@ def S4_pstorerif_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel { +tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110101100; let isPredicated = 1; let isPredicatedFalse = 1; @@ -23645,7 +23737,7 @@ def S4_pstorerifnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memw(#$Ii) = $Rt32", -tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -23671,7 +23763,7 @@ def S4_pstorerifnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memw($Rs32+#$Ii) = $Rt32", -tc_f86c328a, TypeV2LDST>, Enc_397f23, AddrModeRel { +tc_da97ee82, TypeV2LDST>, Enc_397f23, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000110100; let isPredicated = 1; @@ -23694,7 +23786,7 @@ def S4_pstorerifnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel { +tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110111100; let isPredicated = 1; let isPredicatedFalse = 1; @@ -23711,7 +23803,7 @@ def S4_pstorerifnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if (!$Pv4.new) memw($Rs32) = $Rt32", -tc_f86c328a, TypeMAPPING> { +tc_da97ee82, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -23719,7 +23811,7 @@ def S4_pstorerinewf_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memw(#$Ii) = $Nt8.new", -tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel { +tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-11} = 0b010; @@ -23747,7 +23839,7 @@ def S4_pstorerinewf_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel { +tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b10; let Inst{31-21} = 0b00110101101; let isPredicated = 1; @@ -23767,7 +23859,7 @@ def S4_pstorerinewfnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memw(#$Ii) = $Nt8.new", -tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel { +tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b1; let Inst{7-7} = 0b1; let Inst{13-11} = 0b110; @@ -23796,7 +23888,7 @@ def S4_pstorerinewfnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memw($Rs32+#$Ii) = $Nt8.new", -tc_e7d02c66, TypeV2LDST>, Enc_8dbdfe, AddrModeRel { +tc_c79a189f, TypeV2LDST>, Enc_8dbdfe, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b10; let Inst{31-21} = 0b01000110101; @@ -23823,7 +23915,7 @@ def S4_pstorerinewfnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel { +tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b10; let Inst{31-21} = 0b00110111101; let isPredicated = 1; @@ -23844,7 +23936,7 @@ def S4_pstorerinewfnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if (!$Pv4.new) memw($Rs32) = $Nt8.new", -tc_e7d02c66, TypeMAPPING> { +tc_c79a189f, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -23853,7 +23945,7 @@ def S4_pstorerinewt_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memw(#$Ii) = $Nt8.new", -tc_6ac37025, TypeST>, Enc_44215c, AddrModeRel { +tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-11} = 0b010; @@ -23880,7 +23972,7 @@ def S4_pstorerinewt_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_adb14c66, TypeST>, Enc_47ee5e, AddrModeRel { +tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b10; let Inst{31-21} = 0b00110100101; let isPredicated = 1; @@ -23899,7 +23991,7 @@ def S4_pstorerinewtnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memw(#$Ii) = $Nt8.new", -tc_53bdb2f6, TypeST>, Enc_44215c, AddrModeRel { +tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-11} = 0b110; @@ -23927,7 +24019,7 @@ def S4_pstorerinewtnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memw($Rs32+#$Ii) = $Nt8.new", -tc_e7d02c66, TypeV2LDST>, Enc_8dbdfe, AddrModeRel { +tc_c79a189f, TypeV2LDST>, Enc_8dbdfe, AddrModeRel { let Inst{2-2} = 0b0; let Inst{12-11} = 0b10; let Inst{31-21} = 0b01000010101; @@ -23953,7 +24045,7 @@ def S4_pstorerinewtnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_e421e012, TypeST>, Enc_47ee5e, AddrModeRel { +tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel { let Inst{4-3} = 0b10; let Inst{31-21} = 0b00110110101; let isPredicated = 1; @@ -23973,7 +24065,7 @@ def S4_pstorerinewtnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8), "if ($Pv4.new) memw($Rs32) = $Nt8.new", -tc_e7d02c66, TypeMAPPING> { +tc_c79a189f, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; let opNewValue = 2; @@ -23982,7 +24074,7 @@ def S4_pstorerit_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memw(#$Ii) = $Rt32", -tc_238d91d2, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b0; @@ -24006,7 +24098,7 @@ def S4_pstorerit_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_5274e61a, TypeST>, Enc_6339d5, AddrModeRel { +tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110100100; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -24021,7 +24113,7 @@ def S4_pstoreritnew_abs : HInst< (outs), (ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memw(#$Ii) = $Rt32", -tc_66888ded, TypeST>, Enc_1cf4ca, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel { let Inst{2-2} = 0b0; let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; @@ -24046,7 +24138,7 @@ def S4_pstoreritnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memw($Rs32+#$Ii) = $Rt32", -tc_f86c328a, TypeV2LDST>, Enc_397f23, AddrModeRel { +tc_da97ee82, TypeV2LDST>, Enc_397f23, AddrModeRel { let Inst{2-2} = 0b0; let Inst{31-21} = 0b01000010100; let isPredicated = 1; @@ -24068,7 +24160,7 @@ def S4_pstoreritnew_rr : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_3e07fb90, TypeST>, Enc_6339d5, AddrModeRel { +tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel { let Inst{31-21} = 0b00110110100; let isPredicated = 1; let addrMode = BaseRegOffset; @@ -24084,7 +24176,7 @@ def S4_pstoreritnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32), "if ($Pv4.new) memw($Rs32) = $Rt32", -tc_f86c328a, TypeMAPPING> { +tc_da97ee82, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24092,7 +24184,7 @@ def S4_stored_locked : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, DoubleRegs:$Rtt32), "memd_locked($Rs32,$Pd4) = $Rtt32", -tc_1372bca1, TypeST>, Enc_d7dc10 { +tc_5abb5e3f, TypeST>, Enc_d7dc10 { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10100000111; @@ -24105,7 +24197,7 @@ def S4_storeirb_io : HInst< (outs), (ins IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II), "memb($Rs32+#$Ii) = #$II", -tc_05b6c987, TypeST>, Enc_8203bb, PredNewRel { +tc_b83e6d73, TypeST>, Enc_8203bb, PredNewRel { let Inst{31-21} = 0b00111100000; let addrMode = BaseImmOffset; let accessSize = ByteAccess; @@ -24124,7 +24216,7 @@ def S4_storeirb_zomap : HInst< (outs), (ins IntRegs:$Rs32, s8_0Imm:$II), "memb($Rs32) = #$II", -tc_05b6c987, TypeMAPPING> { +tc_b83e6d73, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24132,7 +24224,7 @@ def S4_storeirbf_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II), "if (!$Pv4) memb($Rs32+#$Ii) = #$II", -tc_8b15472a, TypeST>, Enc_d7a65e, PredNewRel { +tc_0b2be201, TypeST>, Enc_d7a65e, PredNewRel { let Inst{31-21} = 0b00111000100; let isPredicated = 1; let isPredicatedFalse = 1; @@ -24152,7 +24244,7 @@ def S4_storeirbf_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if (!$Pv4) memb($Rs32) = #$II", -tc_8b15472a, TypeMAPPING> { +tc_0b2be201, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24160,7 +24252,7 @@ def S4_storeirbfnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II), "if (!$Pv4.new) memb($Rs32+#$Ii) = #$II", -tc_f86c328a, TypeST>, Enc_d7a65e, PredNewRel { +tc_c4f596e3, TypeST>, Enc_d7a65e, PredNewRel { let Inst{31-21} = 0b00111001100; let isPredicated = 1; let isPredicatedFalse = 1; @@ -24181,7 +24273,7 @@ def S4_storeirbfnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if (!$Pv4.new) memb($Rs32) = #$II", -tc_f86c328a, TypeMAPPING> { +tc_c4f596e3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24189,7 +24281,7 @@ def S4_storeirbt_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II), "if ($Pv4) memb($Rs32+#$Ii) = #$II", -tc_8b15472a, TypeST>, Enc_d7a65e, PredNewRel { +tc_0b2be201, TypeST>, Enc_d7a65e, PredNewRel { let Inst{31-21} = 0b00111000000; let isPredicated = 1; let addrMode = BaseImmOffset; @@ -24208,7 +24300,7 @@ def S4_storeirbt_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if ($Pv4) memb($Rs32) = #$II", -tc_8b15472a, TypeMAPPING> { +tc_0b2be201, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24216,7 +24308,7 @@ def S4_storeirbtnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II), "if ($Pv4.new) memb($Rs32+#$Ii) = #$II", -tc_f86c328a, TypeST>, Enc_d7a65e, PredNewRel { +tc_c4f596e3, TypeST>, Enc_d7a65e, PredNewRel { let Inst{31-21} = 0b00111001000; let isPredicated = 1; let addrMode = BaseImmOffset; @@ -24236,7 +24328,7 @@ def S4_storeirbtnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if ($Pv4.new) memb($Rs32) = #$II", -tc_f86c328a, TypeMAPPING> { +tc_c4f596e3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24244,7 +24336,7 @@ def S4_storeirh_io : HInst< (outs), (ins IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II), "memh($Rs32+#$Ii) = #$II", -tc_05b6c987, TypeST>, Enc_a803e0, PredNewRel { +tc_b83e6d73, TypeST>, Enc_a803e0, PredNewRel { let Inst{31-21} = 0b00111100001; let addrMode = BaseImmOffset; let accessSize = HalfWordAccess; @@ -24263,7 +24355,7 @@ def S4_storeirh_zomap : HInst< (outs), (ins IntRegs:$Rs32, s8_0Imm:$II), "memh($Rs32) = #$II", -tc_05b6c987, TypeMAPPING> { +tc_b83e6d73, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24271,7 +24363,7 @@ def S4_storeirhf_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II), "if (!$Pv4) memh($Rs32+#$Ii) = #$II", -tc_8b15472a, TypeST>, Enc_f20719, PredNewRel { +tc_0b2be201, TypeST>, Enc_f20719, PredNewRel { let Inst{31-21} = 0b00111000101; let isPredicated = 1; let isPredicatedFalse = 1; @@ -24291,7 +24383,7 @@ def S4_storeirhf_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if (!$Pv4) memh($Rs32) = #$II", -tc_8b15472a, TypeMAPPING> { +tc_0b2be201, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24299,7 +24391,7 @@ def S4_storeirhfnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II), "if (!$Pv4.new) memh($Rs32+#$Ii) = #$II", -tc_f86c328a, TypeST>, Enc_f20719, PredNewRel { +tc_c4f596e3, TypeST>, Enc_f20719, PredNewRel { let Inst{31-21} = 0b00111001101; let isPredicated = 1; let isPredicatedFalse = 1; @@ -24320,7 +24412,7 @@ def S4_storeirhfnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if (!$Pv4.new) memh($Rs32) = #$II", -tc_f86c328a, TypeMAPPING> { +tc_c4f596e3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24328,7 +24420,7 @@ def S4_storeirht_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II), "if ($Pv4) memh($Rs32+#$Ii) = #$II", -tc_8b15472a, TypeST>, Enc_f20719, PredNewRel { +tc_0b2be201, TypeST>, Enc_f20719, PredNewRel { let Inst{31-21} = 0b00111000001; let isPredicated = 1; let addrMode = BaseImmOffset; @@ -24347,7 +24439,7 @@ def S4_storeirht_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if ($Pv4) memh($Rs32) = #$II", -tc_8b15472a, TypeMAPPING> { +tc_0b2be201, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24355,7 +24447,7 @@ def S4_storeirhtnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II), "if ($Pv4.new) memh($Rs32+#$Ii) = #$II", -tc_f86c328a, TypeST>, Enc_f20719, PredNewRel { +tc_c4f596e3, TypeST>, Enc_f20719, PredNewRel { let Inst{31-21} = 0b00111001001; let isPredicated = 1; let addrMode = BaseImmOffset; @@ -24375,7 +24467,7 @@ def S4_storeirhtnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if ($Pv4.new) memh($Rs32) = #$II", -tc_f86c328a, TypeMAPPING> { +tc_c4f596e3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24383,7 +24475,7 @@ def S4_storeiri_io : HInst< (outs), (ins IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II), "memw($Rs32+#$Ii) = #$II", -tc_05b6c987, TypeST>, Enc_f37377, PredNewRel { +tc_b83e6d73, TypeST>, Enc_f37377, PredNewRel { let Inst{31-21} = 0b00111100010; let addrMode = BaseImmOffset; let accessSize = WordAccess; @@ -24402,7 +24494,7 @@ def S4_storeiri_zomap : HInst< (outs), (ins IntRegs:$Rs32, s8_0Imm:$II), "memw($Rs32) = #$II", -tc_05b6c987, TypeMAPPING> { +tc_b83e6d73, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24410,7 +24502,7 @@ def S4_storeirif_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II), "if (!$Pv4) memw($Rs32+#$Ii) = #$II", -tc_8b15472a, TypeST>, Enc_5ccba9, PredNewRel { +tc_0b2be201, TypeST>, Enc_5ccba9, PredNewRel { let Inst{31-21} = 0b00111000110; let isPredicated = 1; let isPredicatedFalse = 1; @@ -24430,7 +24522,7 @@ def S4_storeirif_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if (!$Pv4) memw($Rs32) = #$II", -tc_8b15472a, TypeMAPPING> { +tc_0b2be201, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24438,7 +24530,7 @@ def S4_storeirifnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II), "if (!$Pv4.new) memw($Rs32+#$Ii) = #$II", -tc_f86c328a, TypeST>, Enc_5ccba9, PredNewRel { +tc_c4f596e3, TypeST>, Enc_5ccba9, PredNewRel { let Inst{31-21} = 0b00111001110; let isPredicated = 1; let isPredicatedFalse = 1; @@ -24459,7 +24551,7 @@ def S4_storeirifnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if (!$Pv4.new) memw($Rs32) = #$II", -tc_f86c328a, TypeMAPPING> { +tc_c4f596e3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24467,7 +24559,7 @@ def S4_storeirit_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II), "if ($Pv4) memw($Rs32+#$Ii) = #$II", -tc_8b15472a, TypeST>, Enc_5ccba9, PredNewRel { +tc_0b2be201, TypeST>, Enc_5ccba9, PredNewRel { let Inst{31-21} = 0b00111000010; let isPredicated = 1; let addrMode = BaseImmOffset; @@ -24486,7 +24578,7 @@ def S4_storeirit_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if ($Pv4) memw($Rs32) = #$II", -tc_8b15472a, TypeMAPPING> { +tc_0b2be201, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24494,7 +24586,7 @@ def S4_storeiritnew_io : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II), "if ($Pv4.new) memw($Rs32+#$Ii) = #$II", -tc_f86c328a, TypeST>, Enc_5ccba9, PredNewRel { +tc_c4f596e3, TypeST>, Enc_5ccba9, PredNewRel { let Inst{31-21} = 0b00111001010; let isPredicated = 1; let addrMode = BaseImmOffset; @@ -24514,7 +24606,7 @@ def S4_storeiritnew_zomap : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II), "if ($Pv4.new) memw($Rs32) = #$II", -tc_f86c328a, TypeMAPPING> { +tc_c4f596e3, TypeMAPPING> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -24522,7 +24614,7 @@ def S4_storerb_ap : HInst< (outs IntRegs:$Re32), (ins u32_0Imm:$II, IntRegs:$Rt32), "memb($Re32=#$II) = $Rt32", -tc_66888ded, TypeST>, Enc_8bcba4, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_8bcba4, AddrModeRel { let Inst{7-6} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10101011000; @@ -24543,7 +24635,7 @@ def S4_storerb_rr : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "memb($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_d9709180, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl { +tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl { let Inst{6-5} = 0b00; let Inst{31-21} = 0b00111011000; let addrMode = BaseRegOffset; @@ -24559,7 +24651,7 @@ def S4_storerb_ur : HInst< (outs), (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32), "memb($Ru32<<#$Ii+#$II) = $Rt32", -tc_0dc560de, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl { +tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl { let Inst{7-7} = 0b1; let Inst{31-21} = 0b10101101000; let addrMode = BaseLongOffset; @@ -24581,7 +24673,7 @@ def S4_storerbnew_ap : HInst< (outs IntRegs:$Re32), (ins u32_0Imm:$II, IntRegs:$Nt8), "memb($Re32=#$II) = $Nt8.new", -tc_53bdb2f6, TypeST>, Enc_724154, AddrModeRel { +tc_d2e63d61, TypeST>, Enc_724154, AddrModeRel { let Inst{7-6} = 0b10; let Inst{13-11} = 0b000; let Inst{31-21} = 0b10101011101; @@ -24605,7 +24697,7 @@ def S4_storerbnew_rr : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "memb($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_b166348b, TypeST>, Enc_c6220b, AddrModeRel { +tc_67435e81, TypeST>, Enc_c6220b, AddrModeRel { let Inst{6-3} = 0b0000; let Inst{31-21} = 0b00111011101; let addrMode = BaseRegOffset; @@ -24624,7 +24716,7 @@ def S4_storerbnew_ur : HInst< (outs), (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8), "memb($Ru32<<#$Ii+#$II) = $Nt8.new", -tc_a8acdac0, TypeST>, Enc_7eb485, AddrModeRel { +tc_fcc3ddf9, TypeST>, Enc_7eb485, AddrModeRel { let Inst{7-7} = 0b1; let Inst{12-11} = 0b00; let Inst{31-21} = 0b10101101101; @@ -24649,7 +24741,7 @@ def S4_storerd_ap : HInst< (outs IntRegs:$Re32), (ins u32_0Imm:$II, DoubleRegs:$Rtt32), "memd($Re32=#$II) = $Rtt32", -tc_66888ded, TypeST>, Enc_c7a204 { +tc_da4a37ed, TypeST>, Enc_c7a204 { let Inst{7-6} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10101011110; @@ -24669,7 +24761,7 @@ def S4_storerd_rr : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32), "memd($Rs32+$Ru32<<#$Ii) = $Rtt32", -tc_d9709180, TypeST>, Enc_55355c, AddrModeRel, ImmRegShl { +tc_5aee39f7, TypeST>, Enc_55355c, AddrModeRel, ImmRegShl { let Inst{6-5} = 0b00; let Inst{31-21} = 0b00111011110; let addrMode = BaseRegOffset; @@ -24684,7 +24776,7 @@ def S4_storerd_ur : HInst< (outs), (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, DoubleRegs:$Rtt32), "memd($Ru32<<#$Ii+#$II) = $Rtt32", -tc_0dc560de, TypeST>, Enc_f79415, AddrModeRel, ImmRegShl { +tc_14b272fa, TypeST>, Enc_f79415, AddrModeRel, ImmRegShl { let Inst{7-7} = 0b1; let Inst{31-21} = 0b10101101110; let addrMode = BaseLongOffset; @@ -24705,7 +24797,7 @@ def S4_storerf_ap : HInst< (outs IntRegs:$Re32), (ins u32_0Imm:$II, IntRegs:$Rt32), "memh($Re32=#$II) = $Rt32.h", -tc_66888ded, TypeST>, Enc_8bcba4 { +tc_da4a37ed, TypeST>, Enc_8bcba4 { let Inst{7-6} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10101011011; @@ -24725,7 +24817,7 @@ def S4_storerf_rr : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "memh($Rs32+$Ru32<<#$Ii) = $Rt32.h", -tc_d9709180, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl { +tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl { let Inst{6-5} = 0b00; let Inst{31-21} = 0b00111011011; let addrMode = BaseRegOffset; @@ -24740,7 +24832,7 @@ def S4_storerf_ur : HInst< (outs), (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32), "memh($Ru32<<#$Ii+#$II) = $Rt32.h", -tc_0dc560de, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl { +tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl { let Inst{7-7} = 0b1; let Inst{31-21} = 0b10101101011; let addrMode = BaseLongOffset; @@ -24761,7 +24853,7 @@ def S4_storerh_ap : HInst< (outs IntRegs:$Re32), (ins u32_0Imm:$II, IntRegs:$Rt32), "memh($Re32=#$II) = $Rt32", -tc_66888ded, TypeST>, Enc_8bcba4, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_8bcba4, AddrModeRel { let Inst{7-6} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10101011010; @@ -24782,7 +24874,7 @@ def S4_storerh_rr : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "memh($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_d9709180, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl { +tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl { let Inst{6-5} = 0b00; let Inst{31-21} = 0b00111011010; let addrMode = BaseRegOffset; @@ -24798,7 +24890,7 @@ def S4_storerh_ur : HInst< (outs), (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32), "memh($Ru32<<#$Ii+#$II) = $Rt32", -tc_0dc560de, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl { +tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl { let Inst{7-7} = 0b1; let Inst{31-21} = 0b10101101010; let addrMode = BaseLongOffset; @@ -24820,7 +24912,7 @@ def S4_storerhnew_ap : HInst< (outs IntRegs:$Re32), (ins u32_0Imm:$II, IntRegs:$Nt8), "memh($Re32=#$II) = $Nt8.new", -tc_53bdb2f6, TypeST>, Enc_724154, AddrModeRel { +tc_d2e63d61, TypeST>, Enc_724154, AddrModeRel { let Inst{7-6} = 0b10; let Inst{13-11} = 0b001; let Inst{31-21} = 0b10101011101; @@ -24844,7 +24936,7 @@ def S4_storerhnew_rr : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "memh($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_b166348b, TypeST>, Enc_c6220b, AddrModeRel { +tc_67435e81, TypeST>, Enc_c6220b, AddrModeRel { let Inst{6-3} = 0b0001; let Inst{31-21} = 0b00111011101; let addrMode = BaseRegOffset; @@ -24863,7 +24955,7 @@ def S4_storerhnew_ur : HInst< (outs), (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8), "memh($Ru32<<#$Ii+#$II) = $Nt8.new", -tc_a8acdac0, TypeST>, Enc_7eb485, AddrModeRel { +tc_fcc3ddf9, TypeST>, Enc_7eb485, AddrModeRel { let Inst{7-7} = 0b1; let Inst{12-11} = 0b01; let Inst{31-21} = 0b10101101101; @@ -24888,7 +24980,7 @@ def S4_storeri_ap : HInst< (outs IntRegs:$Re32), (ins u32_0Imm:$II, IntRegs:$Rt32), "memw($Re32=#$II) = $Rt32", -tc_66888ded, TypeST>, Enc_8bcba4, AddrModeRel { +tc_da4a37ed, TypeST>, Enc_8bcba4, AddrModeRel { let Inst{7-6} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10101011100; @@ -24909,7 +25001,7 @@ def S4_storeri_rr : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32), "memw($Rs32+$Ru32<<#$Ii) = $Rt32", -tc_d9709180, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl { +tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl { let Inst{6-5} = 0b00; let Inst{31-21} = 0b00111011100; let addrMode = BaseRegOffset; @@ -24925,7 +25017,7 @@ def S4_storeri_ur : HInst< (outs), (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32), "memw($Ru32<<#$Ii+#$II) = $Rt32", -tc_0dc560de, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl { +tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl { let Inst{7-7} = 0b1; let Inst{31-21} = 0b10101101100; let addrMode = BaseLongOffset; @@ -24947,7 +25039,7 @@ def S4_storerinew_ap : HInst< (outs IntRegs:$Re32), (ins u32_0Imm:$II, IntRegs:$Nt8), "memw($Re32=#$II) = $Nt8.new", -tc_53bdb2f6, TypeST>, Enc_724154, AddrModeRel { +tc_d2e63d61, TypeST>, Enc_724154, AddrModeRel { let Inst{7-6} = 0b10; let Inst{13-11} = 0b010; let Inst{31-21} = 0b10101011101; @@ -24971,7 +25063,7 @@ def S4_storerinew_rr : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8), "memw($Rs32+$Ru32<<#$Ii) = $Nt8.new", -tc_b166348b, TypeST>, Enc_c6220b, AddrModeRel { +tc_67435e81, TypeST>, Enc_c6220b, AddrModeRel { let Inst{6-3} = 0b0010; let Inst{31-21} = 0b00111011101; let addrMode = BaseRegOffset; @@ -24990,7 +25082,7 @@ def S4_storerinew_ur : HInst< (outs), (ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8), "memw($Ru32<<#$Ii+#$II) = $Nt8.new", -tc_a8acdac0, TypeST>, Enc_7eb485, AddrModeRel { +tc_fcc3ddf9, TypeST>, Enc_7eb485, AddrModeRel { let Inst{7-7} = 0b1; let Inst{12-11} = 0b10; let Inst{31-21} = 0b10101101101; @@ -25015,7 +25107,7 @@ def S4_subaddi : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Ru32), "$Rd32 = add($Rs32,sub(#$Ii,$Ru32))", -tc_c74f796f, TypeALU64>, Enc_8b8d61 { +tc_f675fee8, TypeALU64>, Enc_8b8d61 { let Inst{31-23} = 0b110110111; let hasNewValue = 1; let opNewValue = 0; @@ -25030,7 +25122,7 @@ def S4_subi_asl_ri : HInst< (outs IntRegs:$Rx32), (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II), "$Rx32 = sub(#$Ii,asl($Rx32in,#$II))", -tc_c74f796f, TypeALU64>, Enc_c31910 { +tc_f675fee8, TypeALU64>, Enc_c31910 { let Inst{2-0} = 0b110; let Inst{4-4} = 0b0; let Inst{31-24} = 0b11011110; @@ -25048,7 +25140,7 @@ def S4_subi_lsr_ri : HInst< (outs IntRegs:$Rx32), (ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II), "$Rx32 = sub(#$Ii,lsr($Rx32in,#$II))", -tc_c74f796f, TypeALU64>, Enc_c31910 { +tc_f675fee8, TypeALU64>, Enc_c31910 { let Inst{2-0} = 0b110; let Inst{4-4} = 0b1; let Inst{31-24} = 0b11011110; @@ -25066,7 +25158,7 @@ def S4_vrcrotate : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii), "$Rdd32 = vrcrotate($Rss32,$Rt32,#$Ii)", -tc_b9c0b731, TypeS_3op>, Enc_645d54 { +tc_13bfbcf9, TypeS_3op>, Enc_645d54 { let Inst{7-6} = 0b11; let Inst{31-21} = 0b11000011110; let prefersSlot3 = 1; @@ -25075,7 +25167,7 @@ def S4_vrcrotate_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii), "$Rxx32 += vrcrotate($Rss32,$Rt32,#$Ii)", -tc_60571023, TypeS_3op>, Enc_b72622 { +tc_9debc299, TypeS_3op>, Enc_b72622 { let Inst{7-6} = 0b00; let Inst{31-21} = 0b11001011101; let prefersSlot3 = 1; @@ -25085,7 +25177,7 @@ def S4_vxaddsubh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vxaddsubh($Rss32,$Rtt32):sat", -tc_b44c6e2a, TypeS_3op>, Enc_a56825 { +tc_779080bf, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001010; @@ -25096,7 +25188,7 @@ def S4_vxaddsubhr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vxaddsubh($Rss32,$Rtt32):rnd:>>1:sat", -tc_2b6f77c6, TypeS_3op>, Enc_a56825 { +tc_002cb246, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001110; @@ -25107,7 +25199,7 @@ def S4_vxaddsubw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vxaddsubw($Rss32,$Rtt32):sat", -tc_b44c6e2a, TypeS_3op>, Enc_a56825 { +tc_779080bf, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001010; @@ -25118,7 +25210,7 @@ def S4_vxsubaddh : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vxsubaddh($Rss32,$Rtt32):sat", -tc_b44c6e2a, TypeS_3op>, Enc_a56825 { +tc_779080bf, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001010; @@ -25129,7 +25221,7 @@ def S4_vxsubaddhr : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vxsubaddh($Rss32,$Rtt32):rnd:>>1:sat", -tc_2b6f77c6, TypeS_3op>, Enc_a56825 { +tc_002cb246, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001110; @@ -25140,7 +25232,7 @@ def S4_vxsubaddw : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vxsubaddw($Rss32,$Rtt32):sat", -tc_b44c6e2a, TypeS_3op>, Enc_a56825 { +tc_779080bf, TypeS_3op>, Enc_a56825 { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001010; @@ -25151,7 +25243,7 @@ def S5_asrhub_rnd_sat : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, u4_0Imm:$Ii), "$Rd32 = vasrhub($Rss32,#$Ii):raw", -tc_2b6f77c6, TypeS_2op>, Enc_11a146 { +tc_002cb246, TypeS_2op>, Enc_11a146 { let Inst{7-5} = 0b100; let Inst{13-12} = 0b00; let Inst{31-21} = 0b10001000011; @@ -25164,7 +25256,7 @@ def S5_asrhub_rnd_sat_goodsyntax : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, u4_0Imm:$Ii), "$Rd32 = vasrhub($Rss32,#$Ii):rnd:sat", -tc_2b6f77c6, TypeS_2op> { +tc_002cb246, TypeS_2op> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -25173,7 +25265,7 @@ def S5_asrhub_sat : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32, u4_0Imm:$Ii), "$Rd32 = vasrhub($Rss32,#$Ii):sat", -tc_2b6f77c6, TypeS_2op>, Enc_11a146 { +tc_002cb246, TypeS_2op>, Enc_11a146 { let Inst{7-5} = 0b101; let Inst{13-12} = 0b00; let Inst{31-21} = 0b10001000011; @@ -25186,7 +25278,7 @@ def S5_popcountp : HInst< (outs IntRegs:$Rd32), (ins DoubleRegs:$Rss32), "$Rd32 = popcount($Rss32)", -tc_00afc57e, TypeS_2op>, Enc_90cd8b { +tc_703e822c, TypeS_2op>, Enc_90cd8b { let Inst{13-5} = 0b000000011; let Inst{31-21} = 0b10001000011; let hasNewValue = 1; @@ -25197,7 +25289,7 @@ def S5_vasrhrnd : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u4_0Imm:$Ii), "$Rdd32 = vasrh($Rss32,#$Ii):raw", -tc_2b6f77c6, TypeS_2op>, Enc_12b6e9 { +tc_002cb246, TypeS_2op>, Enc_12b6e9 { let Inst{7-5} = 0b000; let Inst{13-12} = 0b00; let Inst{31-21} = 0b10000000001; @@ -25207,14 +25299,14 @@ def S5_vasrhrnd_goodsyntax : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u4_0Imm:$Ii), "$Rdd32 = vasrh($Rss32,#$Ii):rnd", -tc_2b6f77c6, TypeS_2op> { +tc_002cb246, TypeS_2op> { let isPseudo = 1; } def S6_allocframe_to_raw : HInst< (outs), (ins u11_3Imm:$Ii), "allocframe(#$Ii)", -tc_e216a5db, TypeMAPPING>, Requires<[HasV65]> { +tc_b44ecf75, TypeMAPPING>, Requires<[HasV65]> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -25222,7 +25314,7 @@ def S6_rol_i_p : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rdd32 = rol($Rss32,#$Ii)", -tc_55050d58, TypeS_2op>, Enc_5eac98, Requires<[HasV60]> { +tc_1fc97744, TypeS_2op>, Enc_5eac98, Requires<[HasV60]> { let Inst{7-5} = 0b011; let Inst{31-21} = 0b10000000000; } @@ -25230,7 +25322,7 @@ def S6_rol_i_p_acc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 += rol($Rss32,#$Ii)", -tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> { +tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> { let Inst{7-5} = 0b111; let Inst{31-21} = 0b10000010000; let prefersSlot3 = 1; @@ -25240,7 +25332,7 @@ def S6_rol_i_p_and : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 &= rol($Rss32,#$Ii)", -tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> { +tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> { let Inst{7-5} = 0b011; let Inst{31-21} = 0b10000010010; let prefersSlot3 = 1; @@ -25250,7 +25342,7 @@ def S6_rol_i_p_nac : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 -= rol($Rss32,#$Ii)", -tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> { +tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> { let Inst{7-5} = 0b011; let Inst{31-21} = 0b10000010000; let prefersSlot3 = 1; @@ -25260,7 +25352,7 @@ def S6_rol_i_p_or : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 |= rol($Rss32,#$Ii)", -tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> { +tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> { let Inst{7-5} = 0b111; let Inst{31-21} = 0b10000010010; let prefersSlot3 = 1; @@ -25270,7 +25362,7 @@ def S6_rol_i_p_xacc : HInst< (outs DoubleRegs:$Rxx32), (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii), "$Rxx32 ^= rol($Rss32,#$Ii)", -tc_41d5298e, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> { +tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> { let Inst{7-5} = 0b011; let Inst{31-21} = 0b10000010100; let prefersSlot3 = 1; @@ -25280,7 +25372,7 @@ def S6_rol_i_r : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, u5_0Imm:$Ii), "$Rd32 = rol($Rs32,#$Ii)", -tc_55050d58, TypeS_2op>, Enc_a05677, Requires<[HasV60]> { +tc_1fc97744, TypeS_2op>, Enc_a05677, Requires<[HasV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001100000; @@ -25291,7 +25383,7 @@ def S6_rol_i_r_acc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 += rol($Rs32,#$Ii)", -tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> { +tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110000; @@ -25304,7 +25396,7 @@ def S6_rol_i_r_and : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 &= rol($Rs32,#$Ii)", -tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> { +tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110010; @@ -25317,7 +25409,7 @@ def S6_rol_i_r_nac : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 -= rol($Rs32,#$Ii)", -tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> { +tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110000; @@ -25330,7 +25422,7 @@ def S6_rol_i_r_or : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 |= rol($Rs32,#$Ii)", -tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> { +tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110010; @@ -25343,7 +25435,7 @@ def S6_rol_i_r_xacc : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii), "$Rx32 ^= rol($Rs32,#$Ii)", -tc_41d5298e, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> { +tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10001110100; @@ -25356,7 +25448,7 @@ def S6_vsplatrbp : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = vsplatb($Rs32)", -tc_be706f30, TypeS_2op>, Enc_3a3d62, Requires<[HasV62]> { +tc_a1c00888, TypeS_2op>, Enc_3a3d62, Requires<[HasV62]> { let Inst{13-5} = 0b000000100; let Inst{31-21} = 0b10000100010; } @@ -25364,7 +25456,7 @@ def S6_vtrunehb_ppp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vtrunehb($Rss32,$Rtt32)", -tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62]> { +tc_1fc97744, TypeS_3op>, Enc_a56825, Requires<[HasV62]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001100; @@ -25373,7 +25465,7 @@ def S6_vtrunohb_ppp : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = vtrunohb($Rss32,$Rtt32)", -tc_55050d58, TypeS_3op>, Enc_a56825, Requires<[HasV62]> { +tc_1fc97744, TypeS_3op>, Enc_a56825, Requires<[HasV62]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001100; @@ -25382,7 +25474,7 @@ def SA1_addi : HInst< (outs GeneralSubRegs:$Rx16), (ins IntRegs:$Rx16in, s32_0Imm:$Ii), "$Rx16 = add($Rx16in,#$Ii)", -tc_609d2efe, TypeSUBINSN>, Enc_93af4c { +tc_0a705168, TypeSUBINSN>, Enc_93af4c { let Inst{12-11} = 0b00; let hasNewValue = 1; let opNewValue = 0; @@ -25399,7 +25491,7 @@ def SA1_addrx : HInst< (outs GeneralSubRegs:$Rx16), (ins IntRegs:$Rx16in, GeneralSubRegs:$Rs16), "$Rx16 = add($Rx16in,$Rs16)", -tc_609d2efe, TypeSUBINSN>, Enc_0527db { +tc_0a705168, TypeSUBINSN>, Enc_0527db { let Inst{12-8} = 0b11000; let hasNewValue = 1; let opNewValue = 0; @@ -25411,7 +25503,7 @@ def SA1_addsp : HInst< (outs GeneralSubRegs:$Rd16), (ins u6_2Imm:$Ii), "$Rd16 = add(r29,#$Ii)", -tc_a904d137, TypeSUBINSN>, Enc_2df31d { +tc_9fc3dae0, TypeSUBINSN>, Enc_2df31d { let Inst{12-10} = 0b011; let hasNewValue = 1; let opNewValue = 0; @@ -25423,7 +25515,7 @@ def SA1_and1 : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16), "$Rd16 = and($Rs16,#1)", -tc_a904d137, TypeSUBINSN>, Enc_97d666 { +tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 { let Inst{12-8} = 0b10010; let hasNewValue = 1; let opNewValue = 0; @@ -25434,7 +25526,7 @@ def SA1_clrf : HInst< (outs GeneralSubRegs:$Rd16), (ins), "if (!p0) $Rd16 = #0", -tc_1b82a277, TypeSUBINSN>, Enc_1f5ba6 { +tc_a1123dda, TypeSUBINSN>, Enc_1f5ba6 { let Inst{12-4} = 0b110100111; let isPredicated = 1; let isPredicatedFalse = 1; @@ -25448,7 +25540,7 @@ def SA1_clrfnew : HInst< (outs GeneralSubRegs:$Rd16), (ins), "if (!p0.new) $Rd16 = #0", -tc_e9c822f7, TypeSUBINSN>, Enc_1f5ba6 { +tc_8b3e402a, TypeSUBINSN>, Enc_1f5ba6 { let Inst{12-4} = 0b110100101; let isPredicated = 1; let isPredicatedFalse = 1; @@ -25463,7 +25555,7 @@ def SA1_clrt : HInst< (outs GeneralSubRegs:$Rd16), (ins), "if (p0) $Rd16 = #0", -tc_1b82a277, TypeSUBINSN>, Enc_1f5ba6 { +tc_a1123dda, TypeSUBINSN>, Enc_1f5ba6 { let Inst{12-4} = 0b110100110; let isPredicated = 1; let hasNewValue = 1; @@ -25476,7 +25568,7 @@ def SA1_clrtnew : HInst< (outs GeneralSubRegs:$Rd16), (ins), "if (p0.new) $Rd16 = #0", -tc_e9c822f7, TypeSUBINSN>, Enc_1f5ba6 { +tc_8b3e402a, TypeSUBINSN>, Enc_1f5ba6 { let Inst{12-4} = 0b110100100; let isPredicated = 1; let hasNewValue = 1; @@ -25490,7 +25582,7 @@ def SA1_cmpeqi : HInst< (outs), (ins GeneralSubRegs:$Rs16, u2_0Imm:$Ii), "p0 = cmp.eq($Rs16,#$Ii)", -tc_90f3e30c, TypeSUBINSN>, Enc_63eaeb { +tc_5b7c0967, TypeSUBINSN>, Enc_63eaeb { let Inst{3-2} = 0b00; let Inst{12-8} = 0b11001; let AsmVariantName = "NonParsable"; @@ -25501,7 +25593,7 @@ def SA1_combine0i : HInst< (outs GeneralDoubleLow8Regs:$Rdd8), (ins u2_0Imm:$Ii), "$Rdd8 = combine(#0,#$Ii)", -tc_a904d137, TypeSUBINSN>, Enc_ed48be { +tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be { let Inst{4-3} = 0b00; let Inst{12-7} = 0b111000; let hasNewValue = 1; @@ -25513,7 +25605,7 @@ def SA1_combine1i : HInst< (outs GeneralDoubleLow8Regs:$Rdd8), (ins u2_0Imm:$Ii), "$Rdd8 = combine(#1,#$Ii)", -tc_a904d137, TypeSUBINSN>, Enc_ed48be { +tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be { let Inst{4-3} = 0b01; let Inst{12-7} = 0b111000; let hasNewValue = 1; @@ -25525,7 +25617,7 @@ def SA1_combine2i : HInst< (outs GeneralDoubleLow8Regs:$Rdd8), (ins u2_0Imm:$Ii), "$Rdd8 = combine(#2,#$Ii)", -tc_a904d137, TypeSUBINSN>, Enc_ed48be { +tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be { let Inst{4-3} = 0b10; let Inst{12-7} = 0b111000; let hasNewValue = 1; @@ -25537,7 +25629,7 @@ def SA1_combine3i : HInst< (outs GeneralDoubleLow8Regs:$Rdd8), (ins u2_0Imm:$Ii), "$Rdd8 = combine(#3,#$Ii)", -tc_a904d137, TypeSUBINSN>, Enc_ed48be { +tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be { let Inst{4-3} = 0b11; let Inst{12-7} = 0b111000; let hasNewValue = 1; @@ -25549,7 +25641,7 @@ def SA1_combinerz : HInst< (outs GeneralDoubleLow8Regs:$Rdd8), (ins GeneralSubRegs:$Rs16), "$Rdd8 = combine($Rs16,#0)", -tc_a904d137, TypeSUBINSN>, Enc_399e12 { +tc_9fc3dae0, TypeSUBINSN>, Enc_399e12 { let Inst{3-3} = 0b1; let Inst{12-8} = 0b11101; let hasNewValue = 1; @@ -25561,7 +25653,7 @@ def SA1_combinezr : HInst< (outs GeneralDoubleLow8Regs:$Rdd8), (ins GeneralSubRegs:$Rs16), "$Rdd8 = combine(#0,$Rs16)", -tc_a904d137, TypeSUBINSN>, Enc_399e12 { +tc_9fc3dae0, TypeSUBINSN>, Enc_399e12 { let Inst{3-3} = 0b0; let Inst{12-8} = 0b11101; let hasNewValue = 1; @@ -25573,7 +25665,7 @@ def SA1_dec : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16, n1Const:$n1), "$Rd16 = add($Rs16,#$n1)", -tc_609d2efe, TypeSUBINSN>, Enc_ee5ed0 { +tc_0a705168, TypeSUBINSN>, Enc_ee5ed0 { let Inst{12-8} = 0b10011; let hasNewValue = 1; let opNewValue = 0; @@ -25584,7 +25676,7 @@ def SA1_inc : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16), "$Rd16 = add($Rs16,#1)", -tc_a904d137, TypeSUBINSN>, Enc_97d666 { +tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 { let Inst{12-8} = 0b10001; let hasNewValue = 1; let opNewValue = 0; @@ -25595,7 +25687,7 @@ def SA1_seti : HInst< (outs GeneralSubRegs:$Rd16), (ins u32_0Imm:$Ii), "$Rd16 = #$Ii", -tc_a904d137, TypeSUBINSN>, Enc_e39bb2 { +tc_9fc3dae0, TypeSUBINSN>, Enc_e39bb2 { let Inst{12-10} = 0b010; let hasNewValue = 1; let opNewValue = 0; @@ -25611,7 +25703,7 @@ def SA1_setin1 : HInst< (outs GeneralSubRegs:$Rd16), (ins n1Const:$n1), "$Rd16 = #$n1", -tc_a904d137, TypeSUBINSN>, Enc_7a0ea6 { +tc_9fc3dae0, TypeSUBINSN>, Enc_7a0ea6 { let Inst{12-4} = 0b110100000; let hasNewValue = 1; let opNewValue = 0; @@ -25622,7 +25714,7 @@ def SA1_sxtb : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16), "$Rd16 = sxtb($Rs16)", -tc_a904d137, TypeSUBINSN>, Enc_97d666 { +tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 { let Inst{12-8} = 0b10101; let hasNewValue = 1; let opNewValue = 0; @@ -25633,7 +25725,7 @@ def SA1_sxth : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16), "$Rd16 = sxth($Rs16)", -tc_a904d137, TypeSUBINSN>, Enc_97d666 { +tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 { let Inst{12-8} = 0b10100; let hasNewValue = 1; let opNewValue = 0; @@ -25644,7 +25736,7 @@ def SA1_tfr : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16), "$Rd16 = $Rs16", -tc_a904d137, TypeSUBINSN>, Enc_97d666 { +tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 { let Inst{12-8} = 0b10000; let hasNewValue = 1; let opNewValue = 0; @@ -25655,7 +25747,7 @@ def SA1_zxtb : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16), "$Rd16 = and($Rs16,#255)", -tc_a904d137, TypeSUBINSN>, Enc_97d666 { +tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 { let Inst{12-8} = 0b10111; let hasNewValue = 1; let opNewValue = 0; @@ -25666,7 +25758,7 @@ def SA1_zxth : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16), "$Rd16 = zxth($Rs16)", -tc_a904d137, TypeSUBINSN>, Enc_97d666 { +tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 { let Inst{12-8} = 0b10110; let hasNewValue = 1; let opNewValue = 0; @@ -25677,7 +25769,7 @@ def SL1_loadri_io : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii), "$Rd16 = memw($Rs16+#$Ii)", -tc_7f881c76, TypeSUBINSN>, Enc_53dca9 { +tc_17e0d2cd, TypeSUBINSN>, Enc_53dca9 { let Inst{12-12} = 0b0; let hasNewValue = 1; let opNewValue = 0; @@ -25691,7 +25783,7 @@ def SL1_loadrub_io : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii), "$Rd16 = memub($Rs16+#$Ii)", -tc_7f881c76, TypeSUBINSN>, Enc_c175d0 { +tc_17e0d2cd, TypeSUBINSN>, Enc_c175d0 { let Inst{12-12} = 0b1; let hasNewValue = 1; let opNewValue = 0; @@ -25705,7 +25797,7 @@ def SL2_deallocframe : HInst< (outs), (ins), "deallocframe", -tc_36c68ad1, TypeSUBINSN>, Enc_e3b0c4 { +tc_39dfefe8, TypeSUBINSN>, Enc_e3b0c4 { let Inst{12-0} = 0b1111100000000; let accessSize = DoubleWordAccess; let AsmVariantName = "NonParsable"; @@ -25718,7 +25810,7 @@ def SL2_jumpr31 : HInst< (outs), (ins), "jumpr r31", -tc_2a160009, TypeSUBINSN>, Enc_e3b0c4 { +tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 { let Inst{12-0} = 0b1111111000000; let isTerminator = 1; let isIndirectBranch = 1; @@ -25733,7 +25825,7 @@ def SL2_jumpr31_f : HInst< (outs), (ins), "if (!p0) jumpr r31", -tc_2a160009, TypeSUBINSN>, Enc_e3b0c4 { +tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 { let Inst{12-0} = 0b1111111000101; let isPredicated = 1; let isPredicatedFalse = 1; @@ -25751,7 +25843,7 @@ def SL2_jumpr31_fnew : HInst< (outs), (ins), "if (!p0.new) jumpr:nt r31", -tc_2a160009, TypeSUBINSN>, Enc_e3b0c4 { +tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 { let Inst{12-0} = 0b1111111000111; let isPredicated = 1; let isPredicatedFalse = 1; @@ -25770,7 +25862,7 @@ def SL2_jumpr31_t : HInst< (outs), (ins), "if (p0) jumpr r31", -tc_2a160009, TypeSUBINSN>, Enc_e3b0c4 { +tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 { let Inst{12-0} = 0b1111111000100; let isPredicated = 1; let isTerminator = 1; @@ -25787,7 +25879,7 @@ def SL2_jumpr31_tnew : HInst< (outs), (ins), "if (p0.new) jumpr:nt r31", -tc_2a160009, TypeSUBINSN>, Enc_e3b0c4 { +tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 { let Inst{12-0} = 0b1111111000110; let isPredicated = 1; let isTerminator = 1; @@ -25805,7 +25897,7 @@ def SL2_loadrb_io : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16, u3_0Imm:$Ii), "$Rd16 = memb($Rs16+#$Ii)", -tc_7f881c76, TypeSUBINSN>, Enc_2fbf3c { +tc_17e0d2cd, TypeSUBINSN>, Enc_2fbf3c { let Inst{12-11} = 0b10; let hasNewValue = 1; let opNewValue = 0; @@ -25819,7 +25911,7 @@ def SL2_loadrd_sp : HInst< (outs GeneralDoubleLow8Regs:$Rdd8), (ins u5_3Imm:$Ii), "$Rdd8 = memd(r29+#$Ii)", -tc_9c98e8af, TypeSUBINSN>, Enc_86a14b { +tc_c4db48cb, TypeSUBINSN>, Enc_86a14b { let Inst{12-8} = 0b11110; let hasNewValue = 1; let opNewValue = 0; @@ -25834,7 +25926,7 @@ def SL2_loadrh_io : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii), "$Rd16 = memh($Rs16+#$Ii)", -tc_7f881c76, TypeSUBINSN>, Enc_2bae10 { +tc_17e0d2cd, TypeSUBINSN>, Enc_2bae10 { let Inst{12-11} = 0b00; let hasNewValue = 1; let opNewValue = 0; @@ -25848,7 +25940,7 @@ def SL2_loadri_sp : HInst< (outs GeneralSubRegs:$Rd16), (ins u5_2Imm:$Ii), "$Rd16 = memw(r29+#$Ii)", -tc_9c98e8af, TypeSUBINSN>, Enc_51635c { +tc_c4db48cb, TypeSUBINSN>, Enc_51635c { let Inst{12-9} = 0b1110; let hasNewValue = 1; let opNewValue = 0; @@ -25863,7 +25955,7 @@ def SL2_loadruh_io : HInst< (outs GeneralSubRegs:$Rd16), (ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii), "$Rd16 = memuh($Rs16+#$Ii)", -tc_7f881c76, TypeSUBINSN>, Enc_2bae10 { +tc_17e0d2cd, TypeSUBINSN>, Enc_2bae10 { let Inst{12-11} = 0b01; let hasNewValue = 1; let opNewValue = 0; @@ -25877,7 +25969,7 @@ def SL2_return : HInst< (outs), (ins), "dealloc_return", -tc_fcab4871, TypeSUBINSN>, Enc_e3b0c4 { +tc_36153880, TypeSUBINSN>, Enc_e3b0c4 { let Inst{12-0} = 0b1111101000000; let isTerminator = 1; let isIndirectBranch = 1; @@ -25895,7 +25987,7 @@ def SL2_return_f : HInst< (outs), (ins), "if (!p0) dealloc_return", -tc_fcab4871, TypeSUBINSN>, Enc_e3b0c4 { +tc_36153880, TypeSUBINSN>, Enc_e3b0c4 { let Inst{12-0} = 0b1111101000101; let isPredicated = 1; let isPredicatedFalse = 1; @@ -25916,7 +26008,7 @@ def SL2_return_fnew : HInst< (outs), (ins), "if (!p0.new) dealloc_return:nt", -tc_fcab4871, TypeSUBINSN>, Enc_e3b0c4 { +tc_36153880, TypeSUBINSN>, Enc_e3b0c4 { let Inst{12-0} = 0b1111101000111; let isPredicated = 1; let isPredicatedFalse = 1; @@ -25938,7 +26030,7 @@ def SL2_return_t : HInst< (outs), (ins), "if (p0) dealloc_return", -tc_fcab4871, TypeSUBINSN>, Enc_e3b0c4 { +tc_36153880, TypeSUBINSN>, Enc_e3b0c4 { let Inst{12-0} = 0b1111101000100; let isPredicated = 1; let isTerminator = 1; @@ -25958,7 +26050,7 @@ def SL2_return_tnew : HInst< (outs), (ins), "if (p0.new) dealloc_return:nt", -tc_fcab4871, TypeSUBINSN>, Enc_e3b0c4 { +tc_36153880, TypeSUBINSN>, Enc_e3b0c4 { let Inst{12-0} = 0b1111101000110; let isPredicated = 1; let isTerminator = 1; @@ -25979,7 +26071,7 @@ def SS1_storeb_io : HInst< (outs), (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii, GeneralSubRegs:$Rt16), "memb($Rs16+#$Ii) = $Rt16", -tc_05b6c987, TypeSUBINSN>, Enc_b38ffc { +tc_30b9bb4a, TypeSUBINSN>, Enc_b38ffc { let Inst{12-12} = 0b1; let addrMode = BaseImmOffset; let accessSize = ByteAccess; @@ -25991,7 +26083,7 @@ def SS1_storew_io : HInst< (outs), (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii, GeneralSubRegs:$Rt16), "memw($Rs16+#$Ii) = $Rt16", -tc_05b6c987, TypeSUBINSN>, Enc_f55a0c { +tc_30b9bb4a, TypeSUBINSN>, Enc_f55a0c { let Inst{12-12} = 0b0; let addrMode = BaseImmOffset; let accessSize = WordAccess; @@ -26003,7 +26095,7 @@ def SS2_allocframe : HInst< (outs), (ins u5_3Imm:$Ii), "allocframe(#$Ii)", -tc_0fc1ae07, TypeSUBINSN>, Enc_6f70ca { +tc_49a8207d, TypeSUBINSN>, Enc_6f70ca { let Inst{3-0} = 0b0000; let Inst{12-9} = 0b1110; let addrMode = BaseImmOffset; @@ -26018,7 +26110,7 @@ def SS2_storebi0 : HInst< (outs), (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii), "memb($Rs16+#$Ii) = #0", -tc_57288781, TypeSUBINSN>, Enc_84d359 { +tc_89e94ad3, TypeSUBINSN>, Enc_84d359 { let Inst{12-8} = 0b10010; let addrMode = BaseImmOffset; let accessSize = ByteAccess; @@ -26030,7 +26122,7 @@ def SS2_storebi1 : HInst< (outs), (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii), "memb($Rs16+#$Ii) = #1", -tc_57288781, TypeSUBINSN>, Enc_84d359 { +tc_89e94ad3, TypeSUBINSN>, Enc_84d359 { let Inst{12-8} = 0b10011; let addrMode = BaseImmOffset; let accessSize = ByteAccess; @@ -26042,7 +26134,7 @@ def SS2_stored_sp : HInst< (outs), (ins s6_3Imm:$Ii, GeneralDoubleLow8Regs:$Rtt8), "memd(r29+#$Ii) = $Rtt8", -tc_a788683e, TypeSUBINSN>, Enc_b8309d { +tc_0371abea, TypeSUBINSN>, Enc_b8309d { let Inst{12-9} = 0b0101; let addrMode = BaseImmOffset; let accessSize = DoubleWordAccess; @@ -26055,7 +26147,7 @@ def SS2_storeh_io : HInst< (outs), (ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii, GeneralSubRegs:$Rt16), "memh($Rs16+#$Ii) = $Rt16", -tc_05b6c987, TypeSUBINSN>, Enc_625deb { +tc_30b9bb4a, TypeSUBINSN>, Enc_625deb { let Inst{12-11} = 0b00; let addrMode = BaseImmOffset; let accessSize = HalfWordAccess; @@ -26067,7 +26159,7 @@ def SS2_storew_sp : HInst< (outs), (ins u5_2Imm:$Ii, GeneralSubRegs:$Rt16), "memw(r29+#$Ii) = $Rt16", -tc_a788683e, TypeSUBINSN>, Enc_87c142 { +tc_0371abea, TypeSUBINSN>, Enc_87c142 { let Inst{12-9} = 0b0100; let addrMode = BaseImmOffset; let accessSize = WordAccess; @@ -26080,7 +26172,7 @@ def SS2_storewi0 : HInst< (outs), (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii), "memw($Rs16+#$Ii) = #0", -tc_57288781, TypeSUBINSN>, Enc_a6ce9c { +tc_89e94ad3, TypeSUBINSN>, Enc_a6ce9c { let Inst{12-8} = 0b10000; let addrMode = BaseImmOffset; let accessSize = WordAccess; @@ -26092,7 +26184,7 @@ def SS2_storewi1 : HInst< (outs), (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii), "memw($Rs16+#$Ii) = #1", -tc_57288781, TypeSUBINSN>, Enc_a6ce9c { +tc_89e94ad3, TypeSUBINSN>, Enc_a6ce9c { let Inst{12-8} = 0b10001; let addrMode = BaseImmOffset; let accessSize = WordAccess; @@ -26230,7 +26322,7 @@ def V6_extractw : HInst< (outs IntRegs:$Rd32), (ins HvxVR:$Vu32, IntRegs:$Rs32), "$Rd32 = vextract($Vu32,$Rs32)", -tc_9777e6bf, TypeLD>, Enc_50e578, Requires<[UseHVXV60]> { +tc_540c3da3, TypeLD>, Enc_50e578, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10010010000; @@ -26451,7 +26543,7 @@ def V6_lvsplatb : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32), "$Vd32.b = vsplat($Rt32)", -tc_6b78cf13, TypeCVI_VX>, Enc_a5ed8a, Requires<[UseHVXV62]> { +tc_c4edf264, TypeCVI_VX>, Enc_a5ed8a, Requires<[UseHVXV62]> { let Inst{13-5} = 0b000000010; let Inst{31-21} = 0b00011001110; let hasNewValue = 1; @@ -26462,7 +26554,7 @@ def V6_lvsplath : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32), "$Vd32.h = vsplat($Rt32)", -tc_6b78cf13, TypeCVI_VX>, Enc_a5ed8a, Requires<[UseHVXV62]> { +tc_c4edf264, TypeCVI_VX>, Enc_a5ed8a, Requires<[UseHVXV62]> { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b00011001110; let hasNewValue = 1; @@ -26473,7 +26565,7 @@ def V6_lvsplatw : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32), "$Vd32 = vsplat($Rt32)", -tc_6b78cf13, TypeCVI_VX_LATE>, Enc_a5ed8a, Requires<[UseHVXV60]> { +tc_c4edf264, TypeCVI_VX_LATE>, Enc_a5ed8a, Requires<[UseHVXV60]> { let Inst{13-5} = 0b000000001; let Inst{31-21} = 0b00011001101; let hasNewValue = 1; @@ -26484,7 +26576,7 @@ def V6_pred_and : HInst< (outs HvxQR:$Qd4), (ins HvxQR:$Qs4, HvxQR:$Qt4), "$Qd4 = and($Qs4,$Qt4)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000000; let Inst{13-10} = 0b0000; let Inst{21-16} = 0b000011; @@ -26497,7 +26589,7 @@ def V6_pred_and_n : HInst< (outs HvxQR:$Qd4), (ins HvxQR:$Qs4, HvxQR:$Qt4), "$Qd4 = and($Qs4,!$Qt4)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000101; let Inst{13-10} = 0b0000; let Inst{21-16} = 0b000011; @@ -26510,7 +26602,7 @@ def V6_pred_not : HInst< (outs HvxQR:$Qd4), (ins HvxQR:$Qs4), "$Qd4 = not($Qs4)", -tc_71337255, TypeCVI_VA>, Enc_bfbf03, Requires<[UseHVXV60]> { +tc_0ec46cf9, TypeCVI_VA>, Enc_bfbf03, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000010; let Inst{13-10} = 0b0000; let Inst{31-16} = 0b0001111000000011; @@ -26522,7 +26614,7 @@ def V6_pred_or : HInst< (outs HvxQR:$Qd4), (ins HvxQR:$Qs4, HvxQR:$Qt4), "$Qd4 = or($Qs4,$Qt4)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000001; let Inst{13-10} = 0b0000; let Inst{21-16} = 0b000011; @@ -26535,7 +26627,7 @@ def V6_pred_or_n : HInst< (outs HvxQR:$Qd4), (ins HvxQR:$Qs4, HvxQR:$Qt4), "$Qd4 = or($Qs4,!$Qt4)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000100; let Inst{13-10} = 0b0000; let Inst{21-16} = 0b000011; @@ -26548,7 +26640,7 @@ def V6_pred_scalar2 : HInst< (outs HvxQR:$Qd4), (ins IntRegs:$Rt32), "$Qd4 = vsetq($Rt32)", -tc_4105d6b5, TypeCVI_VP>, Enc_7222b7, Requires<[UseHVXV60]> { +tc_5bf8afbb, TypeCVI_VP>, Enc_7222b7, Requires<[UseHVXV60]> { let Inst{13-2} = 0b000000010001; let Inst{31-21} = 0b00011001101; let hasNewValue = 1; @@ -26559,7 +26651,7 @@ def V6_pred_scalar2v2 : HInst< (outs HvxQR:$Qd4), (ins IntRegs:$Rt32), "$Qd4 = vsetq2($Rt32)", -tc_4105d6b5, TypeCVI_VP>, Enc_7222b7, Requires<[UseHVXV62]> { +tc_5bf8afbb, TypeCVI_VP>, Enc_7222b7, Requires<[UseHVXV62]> { let Inst{13-2} = 0b000000010011; let Inst{31-21} = 0b00011001101; let hasNewValue = 1; @@ -26570,7 +26662,7 @@ def V6_pred_xor : HInst< (outs HvxQR:$Qd4), (ins HvxQR:$Qs4, HvxQR:$Qt4), "$Qd4 = xor($Qs4,$Qt4)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000011; let Inst{13-10} = 0b0000; let Inst{21-16} = 0b000011; @@ -26583,7 +26675,7 @@ def V6_shuffeqh : HInst< (outs HvxQR:$Qd4), (ins HvxQR:$Qs4, HvxQR:$Qt4), "$Qd4.b = vshuffe($Qs4.h,$Qt4.h)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV62]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV62]> { let Inst{7-2} = 0b000110; let Inst{13-10} = 0b0000; let Inst{21-16} = 0b000011; @@ -26596,7 +26688,7 @@ def V6_shuffeqw : HInst< (outs HvxQR:$Qd4), (ins HvxQR:$Qs4, HvxQR:$Qt4), "$Qd4.h = vshuffe($Qs4.w,$Qt4.w)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV62]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_134437, Requires<[UseHVXV62]> { let Inst{7-2} = 0b000111; let Inst{13-10} = 0b0000; let Inst{21-16} = 0b000011; @@ -26746,7 +26838,7 @@ def V6_vL32Ub_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32 = vmemu($Rt32+#$Ii)", -tc_35e92f8e, TypeCVI_VM_VP_LDU>, Enc_f3f408, Requires<[UseHVXV60]> { +tc_a7e6707d, TypeCVI_VM_VP_LDU>, Enc_f3f408, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000000; @@ -26763,7 +26855,7 @@ def V6_vL32Ub_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32 = vmemu($Rx32++#$Ii)", -tc_4fd8566e, TypeCVI_VM_VP_LDU>, Enc_a255dc, Requires<[UseHVXV60]> { +tc_3c56e5ce, TypeCVI_VM_VP_LDU>, Enc_a255dc, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -26782,7 +26874,7 @@ def V6_vL32Ub_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Vd32 = vmemu($Rx32++$Mu2)", -tc_4fd8566e, TypeCVI_VM_VP_LDU>, Enc_2ebe3b, Requires<[UseHVXV60]> { +tc_3c56e5ce, TypeCVI_VM_VP_LDU>, Enc_2ebe3b, Requires<[UseHVXV60]> { let Inst{12-5} = 0b00000111; let Inst{31-21} = 0b00101011000; let hasNewValue = 1; @@ -26799,7 +26891,7 @@ def V6_vL32b_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32 = vmem($Rt32+#$Ii)", -tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000000; @@ -26819,7 +26911,7 @@ def V6_vL32b_cur_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.cur = vmem($Rt32+#$Ii)", -tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b001; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000000; @@ -26839,7 +26931,7 @@ def V6_vL32b_cur_npred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii)", -tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b101; let Inst{31-21} = 0b00101000100; let isPredicated = 1; @@ -26859,7 +26951,7 @@ def V6_vL32b_cur_npred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii)", -tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001100; @@ -26881,7 +26973,7 @@ def V6_vL32b_cur_npred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2)", -tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000101; let Inst{31-21} = 0b00101011100; let isPredicated = 1; @@ -26902,7 +26994,7 @@ def V6_vL32b_cur_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.cur = vmem($Rx32++#$Ii)", -tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b001; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -26923,7 +27015,7 @@ def V6_vL32b_cur_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Vd32.cur = vmem($Rx32++$Mu2)", -tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { let Inst{12-5} = 0b00000001; let Inst{31-21} = 0b00101011000; let hasNewValue = 1; @@ -26943,7 +27035,7 @@ def V6_vL32b_cur_pred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii)", -tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b100; let Inst{31-21} = 0b00101000100; let isPredicated = 1; @@ -26962,7 +27054,7 @@ def V6_vL32b_cur_pred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii)", -tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001100; @@ -26983,7 +27075,7 @@ def V6_vL32b_cur_pred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2)", -tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000100; let Inst{31-21} = 0b00101011100; let isPredicated = 1; @@ -27003,7 +27095,7 @@ def V6_vL32b_npred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii)", -tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b011; let Inst{31-21} = 0b00101000100; let isPredicated = 1; @@ -27022,7 +27114,7 @@ def V6_vL32b_npred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii)", -tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001100; @@ -27043,7 +27135,7 @@ def V6_vL32b_npred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2)", -tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000011; let Inst{31-21} = 0b00101011100; let isPredicated = 1; @@ -27063,7 +27155,7 @@ def V6_vL32b_nt_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32 = vmem($Rt32+#$Ii):nt", -tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000010; @@ -27084,7 +27176,7 @@ def V6_vL32b_nt_cur_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.cur = vmem($Rt32+#$Ii):nt", -tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b001; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000010; @@ -27105,7 +27197,7 @@ def V6_vL32b_nt_cur_npred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt", -tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b101; let Inst{31-21} = 0b00101000110; let isPredicated = 1; @@ -27126,7 +27218,7 @@ def V6_vL32b_nt_cur_npred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt", -tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001110; @@ -27149,7 +27241,7 @@ def V6_vL32b_nt_cur_npred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt", -tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000101; let Inst{31-21} = 0b00101011110; let isPredicated = 1; @@ -27171,7 +27263,7 @@ def V6_vL32b_nt_cur_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.cur = vmem($Rx32++#$Ii):nt", -tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b001; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001010; @@ -27193,7 +27285,7 @@ def V6_vL32b_nt_cur_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Vd32.cur = vmem($Rx32++$Mu2):nt", -tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { let Inst{12-5} = 0b00000001; let Inst{31-21} = 0b00101011010; let hasNewValue = 1; @@ -27214,7 +27306,7 @@ def V6_vL32b_nt_cur_pred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt", -tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b100; let Inst{31-21} = 0b00101000110; let isPredicated = 1; @@ -27234,7 +27326,7 @@ def V6_vL32b_nt_cur_pred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt", -tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001110; @@ -27256,7 +27348,7 @@ def V6_vL32b_nt_cur_pred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt", -tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000100; let Inst{31-21} = 0b00101011110; let isPredicated = 1; @@ -27277,7 +27369,7 @@ def V6_vL32b_nt_npred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii):nt", -tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b011; let Inst{31-21} = 0b00101000110; let isPredicated = 1; @@ -27297,7 +27389,7 @@ def V6_vL32b_nt_npred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii):nt", -tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001110; @@ -27319,7 +27411,7 @@ def V6_vL32b_nt_npred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2):nt", -tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000011; let Inst{31-21} = 0b00101011110; let isPredicated = 1; @@ -27340,7 +27432,7 @@ def V6_vL32b_nt_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32 = vmem($Rx32++#$Ii):nt", -tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001010; @@ -27362,7 +27454,7 @@ def V6_vL32b_nt_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Vd32 = vmem($Rx32++$Mu2):nt", -tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b00101011010; let hasNewValue = 1; @@ -27383,7 +27475,7 @@ def V6_vL32b_nt_pred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if ($Pv4) $Vd32 = vmem($Rt32+#$Ii):nt", -tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b010; let Inst{31-21} = 0b00101000110; let isPredicated = 1; @@ -27402,7 +27494,7 @@ def V6_vL32b_nt_pred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if ($Pv4) $Vd32 = vmem($Rx32++#$Ii):nt", -tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001110; @@ -27423,7 +27515,7 @@ def V6_vL32b_nt_pred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if ($Pv4) $Vd32 = vmem($Rx32++$Mu2):nt", -tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000010; let Inst{31-21} = 0b00101011110; let isPredicated = 1; @@ -27443,7 +27535,7 @@ def V6_vL32b_nt_tmp_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.tmp = vmem($Rt32+#$Ii):nt", -tc_77a4c701, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b010; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000010; @@ -27463,7 +27555,7 @@ def V6_vL32b_nt_tmp_npred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt", -tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_3904b926, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b111; let Inst{31-21} = 0b00101000110; let isPredicated = 1; @@ -27483,7 +27575,7 @@ def V6_vL32b_nt_tmp_npred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt", -tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001110; @@ -27505,7 +27597,7 @@ def V6_vL32b_nt_tmp_npred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt", -tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000111; let Inst{31-21} = 0b00101011110; let isPredicated = 1; @@ -27526,7 +27618,7 @@ def V6_vL32b_nt_tmp_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.tmp = vmem($Rx32++#$Ii):nt", -tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b010; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001010; @@ -27547,7 +27639,7 @@ def V6_vL32b_nt_tmp_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Vd32.tmp = vmem($Rx32++$Mu2):nt", -tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { +tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { let Inst{12-5} = 0b00000010; let Inst{31-21} = 0b00101011010; let hasNewValue = 1; @@ -27567,7 +27659,7 @@ def V6_vL32b_nt_tmp_pred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt", -tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_3904b926, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b110; let Inst{31-21} = 0b00101000110; let isPredicated = 1; @@ -27586,7 +27678,7 @@ def V6_vL32b_nt_tmp_pred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt", -tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001110; @@ -27607,7 +27699,7 @@ def V6_vL32b_nt_tmp_pred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt", -tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000110; let Inst{31-21} = 0b00101011110; let isPredicated = 1; @@ -27627,7 +27719,7 @@ def V6_vL32b_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32 = vmem($Rx32++#$Ii)", -tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -27648,7 +27740,7 @@ def V6_vL32b_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Vd32 = vmem($Rx32++$Mu2)", -tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b00101011000; let hasNewValue = 1; @@ -27668,7 +27760,7 @@ def V6_vL32b_pred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if ($Pv4) $Vd32 = vmem($Rt32+#$Ii)", -tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_abe8c3b2, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b010; let Inst{31-21} = 0b00101000100; let isPredicated = 1; @@ -27686,7 +27778,7 @@ def V6_vL32b_pred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if ($Pv4) $Vd32 = vmem($Rx32++#$Ii)", -tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001100; @@ -27706,7 +27798,7 @@ def V6_vL32b_pred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if ($Pv4) $Vd32 = vmem($Rx32++$Mu2)", -tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_453fe68d, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000010; let Inst{31-21} = 0b00101011100; let isPredicated = 1; @@ -27725,7 +27817,7 @@ def V6_vL32b_tmp_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.tmp = vmem($Rt32+#$Ii)", -tc_77a4c701, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b010; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000000; @@ -27744,7 +27836,7 @@ def V6_vL32b_tmp_npred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)", -tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_3904b926, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b111; let Inst{31-21} = 0b00101000100; let isPredicated = 1; @@ -27763,7 +27855,7 @@ def V6_vL32b_tmp_npred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)", -tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001100; @@ -27784,7 +27876,7 @@ def V6_vL32b_tmp_npred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)", -tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000111; let Inst{31-21} = 0b00101011100; let isPredicated = 1; @@ -27804,7 +27896,7 @@ def V6_vL32b_tmp_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.tmp = vmem($Rx32++#$Ii)", -tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { let Inst{7-5} = 0b010; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -27824,7 +27916,7 @@ def V6_vL32b_tmp_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "$Vd32.tmp = vmem($Rx32++$Mu2)", -tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { +tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[UseHVXV60]>, PredRel { let Inst{12-5} = 0b00000010; let Inst{31-21} = 0b00101011000; let hasNewValue = 1; @@ -27843,7 +27935,7 @@ def V6_vL32b_tmp_pred_ai : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), "if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)", -tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { +tc_3904b926, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b110; let Inst{31-21} = 0b00101000100; let isPredicated = 1; @@ -27861,7 +27953,7 @@ def V6_vL32b_tmp_pred_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), "if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)", -tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { +tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[UseHVXV62]>, PredRel { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001100; @@ -27881,7 +27973,7 @@ def V6_vL32b_tmp_pred_ppu : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), "if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)", -tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { +tc_b9db8205, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[UseHVXV62]>, PredRel { let Inst{10-5} = 0b000110; let Inst{31-21} = 0b00101011100; let isPredicated = 1; @@ -27900,7 +27992,7 @@ def V6_vS32Ub_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "vmemu($Rt32+#$Ii) = $Vs32", -tc_354299ad, TypeCVI_VM_STU>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { +tc_f21e8abb, TypeCVI_VM_STU>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b111; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000001; @@ -27915,7 +28007,7 @@ def V6_vS32Ub_npred_ai : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "if (!$Pv4) vmemu($Rt32+#$Ii) = $Vs32", -tc_d642eff3, TypeCVI_VM_STU>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { +tc_131f1c81, TypeCVI_VM_STU>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b111; let Inst{31-21} = 0b00101000101; let isPredicated = 1; @@ -27930,7 +28022,7 @@ def V6_vS32Ub_npred_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "if (!$Pv4) vmemu($Rx32++#$Ii) = $Vs32", -tc_6fd9ad30, TypeCVI_VM_STU>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { +tc_c7039829, TypeCVI_VM_STU>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001101; @@ -27947,7 +28039,7 @@ def V6_vS32Ub_npred_ppu : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "if (!$Pv4) vmemu($Rx32++$Mu2) = $Vs32", -tc_6fd9ad30, TypeCVI_VM_STU>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { +tc_c7039829, TypeCVI_VM_STU>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { let Inst{10-5} = 0b000111; let Inst{31-21} = 0b00101011101; let isPredicated = 1; @@ -27963,7 +28055,7 @@ def V6_vS32Ub_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "vmemu($Rx32++#$Ii) = $Vs32", -tc_7fa82b08, TypeCVI_VM_STU>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { +tc_e2d2e9e5, TypeCVI_VM_STU>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b111; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001001; @@ -27979,7 +28071,7 @@ def V6_vS32Ub_ppu : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "vmemu($Rx32++$Mu2) = $Vs32", -tc_7fa82b08, TypeCVI_VM_STU>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel { +tc_e2d2e9e5, TypeCVI_VM_STU>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel { let Inst{12-5} = 0b00000111; let Inst{31-21} = 0b00101011001; let addrMode = PostInc; @@ -27994,7 +28086,7 @@ def V6_vS32Ub_pred_ai : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "if ($Pv4) vmemu($Rt32+#$Ii) = $Vs32", -tc_d642eff3, TypeCVI_VM_STU>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { +tc_131f1c81, TypeCVI_VM_STU>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b110; let Inst{31-21} = 0b00101000101; let isPredicated = 1; @@ -28008,7 +28100,7 @@ def V6_vS32Ub_pred_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "if ($Pv4) vmemu($Rx32++#$Ii) = $Vs32", -tc_6fd9ad30, TypeCVI_VM_STU>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { +tc_c7039829, TypeCVI_VM_STU>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001101; @@ -28024,7 +28116,7 @@ def V6_vS32Ub_pred_ppu : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "if ($Pv4) vmemu($Rx32++$Mu2) = $Vs32", -tc_6fd9ad30, TypeCVI_VM_STU>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { +tc_c7039829, TypeCVI_VM_STU>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { let Inst{10-5} = 0b000110; let Inst{31-21} = 0b00101011101; let isPredicated = 1; @@ -28039,7 +28131,7 @@ def V6_vS32b_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rt32+#$Ii) = $Vs32", -tc_e3748cdf, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { +tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000001; @@ -28055,7 +28147,7 @@ def V6_vS32b_new_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8), "vmem($Rt32+#$Ii) = $Os8.new", -tc_1b93bdc6, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel { +tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b00100; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000001; @@ -28074,7 +28166,7 @@ def V6_vS32b_new_npred_ai : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8), "if (!$Pv4) vmem($Rt32+#$Ii) = $Os8.new", -tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel { +tc_7177e272, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b01101; let Inst{31-21} = 0b00101000101; let isPredicated = 1; @@ -28093,7 +28185,7 @@ def V6_vS32b_new_npred_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8), "if (!$Pv4) vmem($Rx32++#$Ii) = $Os8.new", -tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel { +tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b01101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001101; @@ -28114,7 +28206,7 @@ def V6_vS32b_new_npred_ppu : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8), "if (!$Pv4) vmem($Rx32++$Mu2) = $Os8.new", -tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel { +tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel { let Inst{10-3} = 0b00001101; let Inst{31-21} = 0b00101011101; let isPredicated = 1; @@ -28134,7 +28226,7 @@ def V6_vS32b_new_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8), "vmem($Rx32++#$Ii) = $Os8.new", -tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel { +tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b00100; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001001; @@ -28154,7 +28246,7 @@ def V6_vS32b_new_ppu : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8), "vmem($Rx32++$Mu2) = $Os8.new", -tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[UseHVXV60]>, NewValueRel { +tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[UseHVXV60]>, NewValueRel { let Inst{12-3} = 0b0000000100; let Inst{31-21} = 0b00101011001; let addrMode = PostInc; @@ -28173,7 +28265,7 @@ def V6_vS32b_new_pred_ai : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8), "if ($Pv4) vmem($Rt32+#$Ii) = $Os8.new", -tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel { +tc_7177e272, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b01000; let Inst{31-21} = 0b00101000101; let isPredicated = 1; @@ -28191,7 +28283,7 @@ def V6_vS32b_new_pred_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8), "if ($Pv4) vmem($Rx32++#$Ii) = $Os8.new", -tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel { +tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b01000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001101; @@ -28211,7 +28303,7 @@ def V6_vS32b_new_pred_ppu : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8), "if ($Pv4) vmem($Rx32++$Mu2) = $Os8.new", -tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel { +tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel { let Inst{10-3} = 0b00001000; let Inst{31-21} = 0b00101011101; let isPredicated = 1; @@ -28230,7 +28322,7 @@ def V6_vS32b_npred_ai : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "if (!$Pv4) vmem($Rt32+#$Ii) = $Vs32", -tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { +tc_a02a10a8, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b001; let Inst{31-21} = 0b00101000101; let isPredicated = 1; @@ -28246,7 +28338,7 @@ def V6_vS32b_npred_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "if (!$Pv4) vmem($Rx32++#$Ii) = $Vs32", -tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { +tc_54a0dc47, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001101; @@ -28264,7 +28356,7 @@ def V6_vS32b_npred_ppu : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "if (!$Pv4) vmem($Rx32++$Mu2) = $Vs32", -tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { +tc_54a0dc47, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { let Inst{10-5} = 0b000001; let Inst{31-21} = 0b00101011101; let isPredicated = 1; @@ -28281,7 +28373,7 @@ def V6_vS32b_nqpred_ai : HInst< (outs), (ins HvxQR:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "if (!$Qv4) vmem($Rt32+#$Ii) = $Vs32", -tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> { +tc_447d9895, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{31-21} = 0b00101000100; let addrMode = BaseImmOffset; @@ -28293,7 +28385,7 @@ def V6_vS32b_nqpred_pi : HInst< (outs IntRegs:$Rx32), (ins HvxQR:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "if (!$Qv4) vmem($Rx32++#$Ii) = $Vs32", -tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> { +tc_191381c1, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001100; @@ -28307,7 +28399,7 @@ def V6_vS32b_nqpred_ppu : HInst< (outs IntRegs:$Rx32), (ins HvxQR:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "if (!$Qv4) vmem($Rx32++$Mu2) = $Vs32", -tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> { +tc_191381c1, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> { let Inst{10-5} = 0b000001; let Inst{31-21} = 0b00101011100; let addrMode = PostInc; @@ -28320,7 +28412,7 @@ def V6_vS32b_nt_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rt32+#$Ii):nt = $Vs32", -tc_e3748cdf, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { +tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000011; @@ -28337,7 +28429,7 @@ def V6_vS32b_nt_new_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8), "vmem($Rt32+#$Ii):nt = $Os8.new", -tc_1b93bdc6, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel { +tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b00100; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000011; @@ -28357,7 +28449,7 @@ def V6_vS32b_nt_new_npred_ai : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8), "if (!$Pv4) vmem($Rt32+#$Ii):nt = $Os8.new", -tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel { +tc_7177e272, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b01111; let Inst{31-21} = 0b00101000111; let isPredicated = 1; @@ -28377,7 +28469,7 @@ def V6_vS32b_nt_new_npred_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8), "if (!$Pv4) vmem($Rx32++#$Ii):nt = $Os8.new", -tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel { +tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b01111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001111; @@ -28399,7 +28491,7 @@ def V6_vS32b_nt_new_npred_ppu : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8), "if (!$Pv4) vmem($Rx32++$Mu2):nt = $Os8.new", -tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel { +tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel { let Inst{10-3} = 0b00001111; let Inst{31-21} = 0b00101011111; let isPredicated = 1; @@ -28420,7 +28512,7 @@ def V6_vS32b_nt_new_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8), "vmem($Rx32++#$Ii):nt = $Os8.new", -tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel { +tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b00100; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001011; @@ -28441,7 +28533,7 @@ def V6_vS32b_nt_new_ppu : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8), "vmem($Rx32++$Mu2):nt = $Os8.new", -tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[UseHVXV60]>, NewValueRel { +tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[UseHVXV60]>, NewValueRel { let Inst{12-3} = 0b0000000100; let Inst{31-21} = 0b00101011011; let addrMode = PostInc; @@ -28461,7 +28553,7 @@ def V6_vS32b_nt_new_pred_ai : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8), "if ($Pv4) vmem($Rt32+#$Ii):nt = $Os8.new", -tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel { +tc_7177e272, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b01010; let Inst{31-21} = 0b00101000111; let isPredicated = 1; @@ -28480,7 +28572,7 @@ def V6_vS32b_nt_new_pred_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8), "if ($Pv4) vmem($Rx32++#$Ii):nt = $Os8.new", -tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel { +tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-3} = 0b01010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001111; @@ -28501,7 +28593,7 @@ def V6_vS32b_nt_new_pred_ppu : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Os8), "if ($Pv4) vmem($Rx32++$Mu2):nt = $Os8.new", -tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel { +tc_e99d4c2e, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[UseHVXV60]>, NewValueRel { let Inst{10-3} = 0b00001010; let Inst{31-21} = 0b00101011111; let isPredicated = 1; @@ -28521,7 +28613,7 @@ def V6_vS32b_nt_npred_ai : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "if (!$Pv4) vmem($Rt32+#$Ii):nt = $Vs32", -tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { +tc_a02a10a8, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b001; let Inst{31-21} = 0b00101000111; let isPredicated = 1; @@ -28538,7 +28630,7 @@ def V6_vS32b_nt_npred_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "if (!$Pv4) vmem($Rx32++#$Ii):nt = $Vs32", -tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { +tc_54a0dc47, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001111; @@ -28557,7 +28649,7 @@ def V6_vS32b_nt_npred_ppu : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "if (!$Pv4) vmem($Rx32++$Mu2):nt = $Vs32", -tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { +tc_54a0dc47, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { let Inst{10-5} = 0b000001; let Inst{31-21} = 0b00101011111; let isPredicated = 1; @@ -28575,7 +28667,7 @@ def V6_vS32b_nt_nqpred_ai : HInst< (outs), (ins HvxQR:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "if (!$Qv4) vmem($Rt32+#$Ii):nt = $Vs32", -tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> { +tc_447d9895, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{31-21} = 0b00101000110; let addrMode = BaseImmOffset; @@ -28588,7 +28680,7 @@ def V6_vS32b_nt_nqpred_pi : HInst< (outs IntRegs:$Rx32), (ins HvxQR:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "if (!$Qv4) vmem($Rx32++#$Ii):nt = $Vs32", -tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> { +tc_191381c1, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001110; @@ -28603,7 +28695,7 @@ def V6_vS32b_nt_nqpred_ppu : HInst< (outs IntRegs:$Rx32), (ins HvxQR:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "if (!$Qv4) vmem($Rx32++$Mu2):nt = $Vs32", -tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> { +tc_191381c1, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> { let Inst{10-5} = 0b000001; let Inst{31-21} = 0b00101011110; let addrMode = PostInc; @@ -28617,7 +28709,7 @@ def V6_vS32b_nt_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rx32++#$Ii):nt = $Vs32", -tc_a4c9df3b, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { +tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001011; @@ -28635,7 +28727,7 @@ def V6_vS32b_nt_ppu : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "vmem($Rx32++$Mu2):nt = $Vs32", -tc_a4c9df3b, TypeCVI_VM_ST>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel { +tc_3e2aaafc, TypeCVI_VM_ST>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b00101011011; let addrMode = PostInc; @@ -28652,7 +28744,7 @@ def V6_vS32b_nt_pred_ai : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "if ($Pv4) vmem($Rt32+#$Ii):nt = $Vs32", -tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { +tc_a02a10a8, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b000; let Inst{31-21} = 0b00101000111; let isPredicated = 1; @@ -28668,7 +28760,7 @@ def V6_vS32b_nt_pred_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "if ($Pv4) vmem($Rx32++#$Ii):nt = $Vs32", -tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { +tc_54a0dc47, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001111; @@ -28686,7 +28778,7 @@ def V6_vS32b_nt_pred_ppu : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "if ($Pv4) vmem($Rx32++$Mu2):nt = $Vs32", -tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { +tc_54a0dc47, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { let Inst{10-5} = 0b000000; let Inst{31-21} = 0b00101011111; let isPredicated = 1; @@ -28703,7 +28795,7 @@ def V6_vS32b_nt_qpred_ai : HInst< (outs), (ins HvxQR:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "if ($Qv4) vmem($Rt32+#$Ii):nt = $Vs32", -tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> { +tc_447d9895, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{31-21} = 0b00101000110; let addrMode = BaseImmOffset; @@ -28716,7 +28808,7 @@ def V6_vS32b_nt_qpred_pi : HInst< (outs IntRegs:$Rx32), (ins HvxQR:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "if ($Qv4) vmem($Rx32++#$Ii):nt = $Vs32", -tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> { +tc_191381c1, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001110; @@ -28731,7 +28823,7 @@ def V6_vS32b_nt_qpred_ppu : HInst< (outs IntRegs:$Rx32), (ins HvxQR:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "if ($Qv4) vmem($Rx32++$Mu2):nt = $Vs32", -tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> { +tc_191381c1, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> { let Inst{10-5} = 0b000000; let Inst{31-21} = 0b00101011110; let addrMode = PostInc; @@ -28745,7 +28837,7 @@ def V6_vS32b_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rx32++#$Ii) = $Vs32", -tc_a4c9df3b, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { +tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001001; @@ -28762,7 +28854,7 @@ def V6_vS32b_ppu : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "vmem($Rx32++$Mu2) = $Vs32", -tc_a4c9df3b, TypeCVI_VM_ST>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel { +tc_3e2aaafc, TypeCVI_VM_ST>, Enc_d15d19, Requires<[UseHVXV60]>, NewValueRel { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b00101011001; let addrMode = PostInc; @@ -28777,7 +28869,7 @@ def V6_vS32b_pred_ai : HInst< (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "if ($Pv4) vmem($Rt32+#$Ii) = $Vs32", -tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { +tc_a02a10a8, TypeCVI_VM_ST>, Enc_27b757, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b000; let Inst{31-21} = 0b00101000101; let isPredicated = 1; @@ -28792,7 +28884,7 @@ def V6_vS32b_pred_pi : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "if ($Pv4) vmem($Rx32++#$Ii) = $Vs32", -tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { +tc_54a0dc47, TypeCVI_VM_ST>, Enc_865390, Requires<[UseHVXV60]>, NewValueRel { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001101; @@ -28809,7 +28901,7 @@ def V6_vS32b_pred_ppu : HInst< (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "if ($Pv4) vmem($Rx32++$Mu2) = $Vs32", -tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { +tc_54a0dc47, TypeCVI_VM_ST>, Enc_1ef990, Requires<[UseHVXV60]>, NewValueRel { let Inst{10-5} = 0b000000; let Inst{31-21} = 0b00101011101; let isPredicated = 1; @@ -28825,7 +28917,7 @@ def V6_vS32b_qpred_ai : HInst< (outs), (ins HvxQR:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "if ($Qv4) vmem($Rt32+#$Ii) = $Vs32", -tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> { +tc_447d9895, TypeCVI_VM_ST>, Enc_2ea740, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{31-21} = 0b00101000100; let addrMode = BaseImmOffset; @@ -28837,7 +28929,7 @@ def V6_vS32b_qpred_pi : HInst< (outs IntRegs:$Rx32), (ins HvxQR:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "if ($Qv4) vmem($Rx32++#$Ii) = $Vs32", -tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> { +tc_191381c1, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00101001100; @@ -28851,7 +28943,7 @@ def V6_vS32b_qpred_ppu : HInst< (outs IntRegs:$Rx32), (ins HvxQR:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, HvxVR:$Vs32), "if ($Qv4) vmem($Rx32++$Mu2) = $Vs32", -tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> { +tc_191381c1, TypeCVI_VM_ST>, Enc_4dff07, Requires<[UseHVXV60]> { let Inst{10-5} = 0b000000; let Inst{31-21} = 0b00101011100; let addrMode = PostInc; @@ -28864,7 +28956,7 @@ def V6_vS32b_srls_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "vmem($Rt32+#$Ii):scatter_release", -tc_29841470, TypeCVI_SCATTER_NEW_RST>, Enc_ff3442, Requires<[UseHVXV65]> { +tc_3ce09744, TypeCVI_SCATTER_NEW_RST>, Enc_ff3442, Requires<[UseHVXV65]> { let Inst{7-0} = 0b00101000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000001; @@ -28878,7 +28970,7 @@ def V6_vS32b_srls_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "vmem($Rx32++#$Ii):scatter_release", -tc_5c03dc63, TypeCVI_SCATTER_NEW_RST>, Enc_6c9ee0, Requires<[UseHVXV65]> { +tc_20a4bbec, TypeCVI_SCATTER_NEW_RST>, Enc_6c9ee0, Requires<[UseHVXV65]> { let Inst{7-0} = 0b00101000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001001; @@ -28893,7 +28985,7 @@ def V6_vS32b_srls_ppu : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2), "vmem($Rx32++$Mu2):scatter_release", -tc_5c03dc63, TypeCVI_SCATTER_NEW_RST>, Enc_44661f, Requires<[UseHVXV65]> { +tc_20a4bbec, TypeCVI_SCATTER_NEW_RST>, Enc_44661f, Requires<[UseHVXV65]> { let Inst{12-0} = 0b0000000101000; let Inst{31-21} = 0b00101011001; let addrMode = PostInc; @@ -28907,7 +28999,7 @@ def V6_vabsb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.b = vabs($Vu32.b)", -tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV65]> { +tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV65]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000001; @@ -28930,7 +29022,7 @@ def V6_vabsb_sat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.b = vabs($Vu32.b):sat", -tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV65]> { +tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV65]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000001; @@ -28953,7 +29045,7 @@ def V6_vabsdiffh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vabsdiff($Vu32.h,$Vv32.h)", -tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100110; @@ -28976,7 +29068,7 @@ def V6_vabsdiffub : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vabsdiff($Vu32.ub,$Vv32.ub)", -tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100110; @@ -28999,7 +29091,7 @@ def V6_vabsdiffuh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vabsdiff($Vu32.uh,$Vv32.uh)", -tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100110; @@ -29022,7 +29114,7 @@ def V6_vabsdiffw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uw = vabsdiff($Vu32.w,$Vv32.w)", -tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100110; @@ -29045,7 +29137,7 @@ def V6_vabsh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.h = vabs($Vu32.h)", -tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000000; @@ -29068,7 +29160,7 @@ def V6_vabsh_sat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.h = vabs($Vu32.h):sat", -tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000000; @@ -29091,7 +29183,7 @@ def V6_vabsub_alt : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.ub = vabs($Vu32.b)", -tc_71337255, TypeMAPPING>, Requires<[UseHVXV65]> { +tc_0ec46cf9, TypeMAPPING>, Requires<[UseHVXV65]> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -29102,7 +29194,7 @@ def V6_vabsuh_alt : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.uh = vabs($Vu32.h)", -tc_71337255, TypeMAPPING>, Requires<[UseHVXV65]> { +tc_0ec46cf9, TypeMAPPING>, Requires<[UseHVXV65]> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -29113,7 +29205,7 @@ def V6_vabsuw_alt : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.uw = vabs($Vu32.w)", -tc_71337255, TypeMAPPING>, Requires<[UseHVXV65]> { +tc_0ec46cf9, TypeMAPPING>, Requires<[UseHVXV65]> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -29124,7 +29216,7 @@ def V6_vabsw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.w = vabs($Vu32.w)", -tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000000; @@ -29147,7 +29239,7 @@ def V6_vabsw_sat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.w = vabs($Vu32.w):sat", -tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000000; @@ -29170,7 +29262,7 @@ def V6_vaddb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vadd($Vu32.b,$Vv32.b)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111101; @@ -29193,7 +29285,7 @@ def V6_vaddb_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.b = vadd($Vuu32.b,$Vvv32.b)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100011; @@ -29216,7 +29308,7 @@ def V6_vaddbnq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if (!$Qv4) $Vx32.b += $Vu32.b", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000001; @@ -29244,7 +29336,7 @@ def V6_vaddbq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if ($Qv4) $Vx32.b += $Vu32.b", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000001; @@ -29272,7 +29364,7 @@ def V6_vaddbsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vadd($Vu32.b,$Vv32.b):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111000; @@ -29295,7 +29387,7 @@ def V6_vaddbsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.b = vadd($Vuu32.b,$Vvv32.b):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011110101; @@ -29318,7 +29410,7 @@ def V6_vaddcarry : HInst< (outs HvxVR:$Vd32, HvxQR:$Qx4), (ins HvxVR:$Vu32, HvxVR:$Vv32, HvxQR:$Qx4in), "$Vd32.w = vadd($Vu32.w,$Vv32.w,$Qx4):carry", -tc_5a9fc4ec, TypeCVI_VA>, Enc_b43b67, Requires<[UseHVXV62]> { +tc_7e6a3e89, TypeCVI_VA>, Enc_b43b67, Requires<[UseHVXV62]> { let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100101; @@ -29327,11 +29419,37 @@ let opNewValue = 0; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Qx4 = $Qx4in"; } +def V6_vaddcarryo : HInst< +(outs HvxVR:$Vd32, HvxQR:$Qe4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.w,$Qe4 = vadd($Vu32.w,$Vv32.w):carry", +tc_e35c1e93, TypeCOPROC_VX>, Enc_c1d806, Requires<[UseHVXV66]> { +let Inst{7-7} = 0b0; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011101101; +let hasNewValue = 1; +let opNewValue = 0; +let hasNewValue2 = 1; +let opNewValue2 = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vaddcarrysat : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32, HvxQR:$Qs4), +"$Vd32.w = vadd($Vu32.w,$Vv32.w,$Qs4):carry:sat", +tc_257f6f7c, TypeCVI_VA>, Enc_e0820b, Requires<[UseHVXV66]> { +let Inst{7-7} = 0b0; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011101100; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vaddclbh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vadd(vclb($Vu32.h),$Vv32.h)", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011111000; @@ -29343,7 +29461,7 @@ def V6_vaddclbw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vadd(vclb($Vu32.w),$Vv32.w)", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011111000; @@ -29355,7 +29473,7 @@ def V6_vaddh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vadd($Vu32.h,$Vv32.h)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111101; @@ -29378,7 +29496,7 @@ def V6_vaddh_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.h = vadd($Vuu32.h,$Vvv32.h)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100011; @@ -29401,7 +29519,7 @@ def V6_vaddhnq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if (!$Qv4) $Vx32.h += $Vu32.h", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000001; @@ -29429,7 +29547,7 @@ def V6_vaddhq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if ($Qv4) $Vx32.h += $Vu32.h", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000001; @@ -29457,7 +29575,7 @@ def V6_vaddhsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vadd($Vu32.h,$Vv32.h):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100010; @@ -29480,7 +29598,7 @@ def V6_vaddhsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.h = vadd($Vuu32.h,$Vvv32.h):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100100; @@ -29503,7 +29621,7 @@ def V6_vaddhw : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.w = vadd($Vu32.h,$Vv32.h)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100101; @@ -29515,7 +29633,7 @@ def V6_vaddhw_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vxx32.w += vadd($Vu32.h,$Vv32.h)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100001; @@ -29553,7 +29671,7 @@ def V6_vaddubh : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.h = vadd($Vu32.ub,$Vv32.ub)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100101; @@ -29565,7 +29683,7 @@ def V6_vaddubh_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vxx32.h += vadd($Vu32.ub,$Vv32.ub)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100010; @@ -29603,7 +29721,7 @@ def V6_vaddubsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vadd($Vu32.ub,$Vv32.ub):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100010; @@ -29626,7 +29744,7 @@ def V6_vaddubsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.ub = vadd($Vuu32.ub,$Vvv32.ub):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100011; @@ -29649,7 +29767,7 @@ def V6_vaddububb_sat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vadd($Vu32.ub,$Vv32.b):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011110101; @@ -29661,7 +29779,7 @@ def V6_vadduhsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vadd($Vu32.uh,$Vv32.uh):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100010; @@ -29684,7 +29802,7 @@ def V6_vadduhsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.uh = vadd($Vuu32.uh,$Vvv32.uh):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100100; @@ -29707,7 +29825,7 @@ def V6_vadduhw : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.w = vadd($Vu32.uh,$Vv32.uh)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100101; @@ -29719,7 +29837,7 @@ def V6_vadduhw_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vxx32.w += vadd($Vu32.uh,$Vv32.uh)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100010; @@ -29757,7 +29875,7 @@ def V6_vadduwsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uw = vadd($Vu32.uw,$Vv32.uw):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111011; @@ -29780,7 +29898,7 @@ def V6_vadduwsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.uw = vadd($Vuu32.uw,$Vvv32.uw):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011110101; @@ -29803,7 +29921,7 @@ def V6_vaddw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vadd($Vu32.w,$Vv32.w)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100010; @@ -29826,7 +29944,7 @@ def V6_vaddw_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.w = vadd($Vuu32.w,$Vvv32.w)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100011; @@ -29849,7 +29967,7 @@ def V6_vaddwnq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if (!$Qv4) $Vx32.w += $Vu32.w", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000001; @@ -29877,7 +29995,7 @@ def V6_vaddwq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if ($Qv4) $Vx32.w += $Vu32.w", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000001; @@ -29905,7 +30023,7 @@ def V6_vaddwsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vadd($Vu32.w,$Vv32.w):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100010; @@ -29928,7 +30046,7 @@ def V6_vaddwsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.w = vadd($Vuu32.w,$Vvv32.w):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100100; @@ -29951,7 +30069,7 @@ def V6_valignb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32 = valign($Vu32,$Vv32,$Rt8)", -tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> { +tc_56e64202, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011011; @@ -29963,7 +30081,7 @@ def V6_valignbi : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii), "$Vd32 = valign($Vu32,$Vv32,#$Ii)", -tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV60]> { +tc_56e64202, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV60]> { let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011110001; let hasNewValue = 1; @@ -29974,7 +30092,7 @@ def V6_vand : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32 = vand($Vu32,$Vv32)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100001; @@ -29986,7 +30104,7 @@ def V6_vandnqrt : HInst< (outs HvxVR:$Vd32), (ins HvxQR:$Qu4, IntRegs:$Rt32), "$Vd32 = vand(!$Qu4,$Rt32)", -tc_e231aa4f, TypeCVI_VX>, Enc_7b7ba8, Requires<[UseHVXV62]> { +tc_ac4046bc, TypeCVI_VX>, Enc_7b7ba8, Requires<[UseHVXV62]> { let Inst{7-5} = 0b101; let Inst{13-10} = 0b0001; let Inst{31-21} = 0b00011001101; @@ -29998,7 +30116,7 @@ def V6_vandnqrt_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxQR:$Qu4, IntRegs:$Rt32), "$Vx32 |= vand(!$Qu4,$Rt32)", -tc_9311da3f, TypeCVI_VX>, Enc_895bd9, Requires<[UseHVXV62]> { +tc_2e8f5f6e, TypeCVI_VX>, Enc_895bd9, Requires<[UseHVXV62]> { let Inst{7-5} = 0b011; let Inst{13-10} = 0b1001; let Inst{31-21} = 0b00011001011; @@ -30036,7 +30154,7 @@ def V6_vandqrt : HInst< (outs HvxVR:$Vd32), (ins HvxQR:$Qu4, IntRegs:$Rt32), "$Vd32 = vand($Qu4,$Rt32)", -tc_e231aa4f, TypeCVI_VX_LATE>, Enc_7b7ba8, Requires<[UseHVXV60]> { +tc_ac4046bc, TypeCVI_VX_LATE>, Enc_7b7ba8, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-10} = 0b0000; let Inst{31-21} = 0b00011001101; @@ -30048,7 +30166,7 @@ def V6_vandqrt_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxQR:$Qu4, IntRegs:$Rt32), "$Vx32 |= vand($Qu4,$Rt32)", -tc_9311da3f, TypeCVI_VX_LATE>, Enc_895bd9, Requires<[UseHVXV60]> { +tc_2e8f5f6e, TypeCVI_VX_LATE>, Enc_895bd9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-10} = 0b1000; let Inst{31-21} = 0b00011001011; @@ -30086,7 +30204,7 @@ def V6_vandvnqv : HInst< (outs HvxVR:$Vd32), (ins HvxQR:$Qv4, HvxVR:$Vu32), "$Vd32 = vand(!$Qv4,$Vu32)", -tc_bbaf280e, TypeCVI_VA>, Enc_c4dc92, Requires<[UseHVXV62]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_c4dc92, Requires<[UseHVXV62]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000011; @@ -30099,7 +30217,7 @@ def V6_vandvqv : HInst< (outs HvxVR:$Vd32), (ins HvxQR:$Qv4, HvxVR:$Vu32), "$Vd32 = vand($Qv4,$Vu32)", -tc_bbaf280e, TypeCVI_VA>, Enc_c4dc92, Requires<[UseHVXV62]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_c4dc92, Requires<[UseHVXV62]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000011; @@ -30112,7 +30230,7 @@ def V6_vandvrt : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Qd4 = vand($Vu32,$Rt32)", -tc_e231aa4f, TypeCVI_VX_LATE>, Enc_0f8bab, Requires<[UseHVXV60]> { +tc_ac4046bc, TypeCVI_VX_LATE>, Enc_0f8bab, Requires<[UseHVXV60]> { let Inst{7-2} = 0b010010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001101; @@ -30124,7 +30242,7 @@ def V6_vandvrt_acc : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, IntRegs:$Rt32), "$Qx4 |= vand($Vu32,$Rt32)", -tc_9311da3f, TypeCVI_VX_LATE>, Enc_adf111, Requires<[UseHVXV60]> { +tc_2e8f5f6e, TypeCVI_VX_LATE>, Enc_adf111, Requires<[UseHVXV60]> { let Inst{7-2} = 0b100000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001011; @@ -30158,7 +30276,7 @@ def V6_vaslh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.h = vasl($Vu32.h,$Rt32)", -tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001100; @@ -30170,7 +30288,7 @@ def V6_vaslh_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.h += vasl($Vu32.h,$Rt32)", -tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV65]> { +tc_309dbb4f, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV65]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001101; @@ -30208,7 +30326,7 @@ def V6_vaslhv : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vasl($Vu32.h,$Vv32.h)", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111101; @@ -30231,7 +30349,7 @@ def V6_vaslw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vasl($Vu32.w,$Rt32)", -tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001011; @@ -30243,7 +30361,7 @@ def V6_vaslw_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vasl($Vu32.w,$Rt32)", -tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_309dbb4f, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001011; @@ -30281,7 +30399,7 @@ def V6_vaslwv : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vasl($Vu32.w,$Vv32.w)", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111101; @@ -30300,11 +30418,36 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vasr_into : HInst< +(outs HvxWR:$Vxx32), +(ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Vxx32.w = vasrinto($Vu32.w,$Vv32.w)", +tc_df80eeb0, TypeCVI_VP_VS>, Enc_3fc427, Requires<[UseHVXV66]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011010101; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vxx32 = $Vxx32in"; +} +def V6_vasr_into_alt : HInst< +(outs HvxWR:$Vxx32), +(ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Vxx32 = vasrinto($Vu32,$Vv32)", +PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> { +let hasNewValue = 1; +let opNewValue = 0; +let isPseudo = 1; +let isCodeGenOnly = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vxx32 = $Vxx32in"; +} def V6_vasrh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.h = vasr($Vu32.h,$Rt32)", -tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001011; @@ -30316,7 +30459,7 @@ def V6_vasrh_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.h += vasr($Vu32.h,$Rt32)", -tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV65]> { +tc_309dbb4f, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV65]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001100; @@ -30354,7 +30497,7 @@ def V6_vasrhbrndsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-24} = 0b00011011; @@ -30366,7 +30509,7 @@ def V6_vasrhbrndsat_alt : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32 = vasrhb($Vu32,$Vv32,$Rt8):rnd:sat", -tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> { +tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -30376,7 +30519,7 @@ def V6_vasrhbsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011000; @@ -30388,7 +30531,7 @@ def V6_vasrhubrndsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011011; @@ -30400,7 +30543,7 @@ def V6_vasrhubrndsat_alt : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):rnd:sat", -tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> { +tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -30410,7 +30553,7 @@ def V6_vasrhubsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011011; @@ -30422,7 +30565,7 @@ def V6_vasrhubsat_alt : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):sat", -tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> { +tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -30432,7 +30575,7 @@ def V6_vasrhv : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vasr($Vu32.h,$Vv32.h)", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111101; @@ -30455,7 +30598,7 @@ def V6_vasruhubrndsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.ub = vasr($Vu32.uh,$Vv32.uh,$Rt8):rnd:sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011000; @@ -30467,7 +30610,7 @@ def V6_vasruhubsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.ub = vasr($Vu32.uh,$Vv32.uh,$Rt8):sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-24} = 0b00011000; @@ -30479,7 +30622,7 @@ def V6_vasruwuhrndsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.uh = vasr($Vu32.uw,$Vv32.uw,$Rt8):rnd:sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011000; @@ -30491,7 +30634,7 @@ def V6_vasruwuhsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.uh = vasr($Vu32.uw,$Vv32.uw,$Rt8):sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV65]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b1; let Inst{31-24} = 0b00011000; @@ -30503,7 +30646,7 @@ def V6_vasrw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vasr($Vu32.w,$Rt32)", -tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001011; @@ -30515,7 +30658,7 @@ def V6_vasrw_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vasr($Vu32.w,$Rt32)", -tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_309dbb4f, TypeCVI_VS>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001011; @@ -30553,7 +30696,7 @@ def V6_vasrwh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8)", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011011; @@ -30565,7 +30708,7 @@ def V6_vasrwh_alt : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32 = vasrwh($Vu32,$Vv32,$Rt8)", -tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> { +tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -30575,7 +30718,7 @@ def V6_vasrwhrndsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011011; @@ -30587,7 +30730,7 @@ def V6_vasrwhrndsat_alt : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):rnd:sat", -tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> { +tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -30597,7 +30740,7 @@ def V6_vasrwhsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011011; @@ -30609,7 +30752,7 @@ def V6_vasrwhsat_alt : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):sat", -tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> { +tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -30619,7 +30762,7 @@ def V6_vasrwuhrndsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV62]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011000; @@ -30631,7 +30774,7 @@ def V6_vasrwuhsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):sat", -tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { +tc_16ff9ef8, TypeCVI_VS>, Enc_a30110, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011011; @@ -30643,7 +30786,7 @@ def V6_vasrwuhsat_alt : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32 = vasrwuh($Vu32,$Vv32,$Rt8):sat", -tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60]> { +tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -30653,7 +30796,7 @@ def V6_vasrwv : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vasr($Vu32.w,$Vv32.w)", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111101; @@ -30676,7 +30819,7 @@ def V6_vassign : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32 = $Vu32", -tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b1; let Inst{31-16} = 0b0001111000000011; @@ -30698,7 +30841,7 @@ def V6_vavgb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vavg($Vu32.b,$Vv32.b)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011111000; @@ -30721,7 +30864,7 @@ def V6_vavgbrnd : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vavg($Vu32.b,$Vv32.b):rnd", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011111000; @@ -30744,7 +30887,7 @@ def V6_vavgh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vavg($Vu32.h,$Vv32.h)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100110; @@ -30767,7 +30910,7 @@ def V6_vavghrnd : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vavg($Vu32.h,$Vv32.h):rnd", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100111; @@ -30790,7 +30933,7 @@ def V6_vavgub : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vavg($Vu32.ub,$Vv32.ub)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100110; @@ -30813,7 +30956,7 @@ def V6_vavgubrnd : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vavg($Vu32.ub,$Vv32.ub):rnd", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100111; @@ -30836,7 +30979,7 @@ def V6_vavguh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vavg($Vu32.uh,$Vv32.uh)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100110; @@ -30859,7 +31002,7 @@ def V6_vavguhrnd : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vavg($Vu32.uh,$Vv32.uh):rnd", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100111; @@ -30882,7 +31025,7 @@ def V6_vavguw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uw = vavg($Vu32.uw,$Vv32.uw)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011111000; @@ -30905,7 +31048,7 @@ def V6_vavguwrnd : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uw = vavg($Vu32.uw,$Vv32.uw):rnd", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011111000; @@ -30928,7 +31071,7 @@ def V6_vavgw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vavg($Vu32.w,$Vv32.w)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100110; @@ -30951,7 +31094,7 @@ def V6_vavgwrnd : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vavg($Vu32.w,$Vv32.w):rnd", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100111; @@ -30974,7 +31117,7 @@ def V6_vccombine : HInst< (outs HvxWR:$Vdd32), (ins PredRegs:$Ps4, HvxVR:$Vu32, HvxVR:$Vv32), "if ($Ps4) $Vdd32 = vcombine($Vu32,$Vv32)", -tc_2171ebae, TypeCVI_VA_DV>, Enc_8c2412, Requires<[UseHVXV60]> { +tc_af25efd9, TypeCVI_VA_DV>, Enc_8c2412, Requires<[UseHVXV60]> { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011010011; @@ -30987,7 +31130,7 @@ def V6_vcl0h : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.uh = vcl0($Vu32.uh)", -tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000010; @@ -31010,7 +31153,7 @@ def V6_vcl0w : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.uw = vcl0($Vu32.uw)", -tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000010; @@ -31033,7 +31176,7 @@ def V6_vcmov : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Ps4, HvxVR:$Vu32), "if ($Ps4) $Vd32 = $Vu32", -tc_b06ab583, TypeCVI_VA>, Enc_770858, Requires<[UseHVXV60]> { +tc_3aacf4a8, TypeCVI_VA>, Enc_770858, Requires<[UseHVXV60]> { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001101000000000; @@ -31046,7 +31189,7 @@ def V6_vcombine : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32 = vcombine($Vu32,$Vv32)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111010; @@ -31070,7 +31213,7 @@ def V6_vdd0 : HInst< (outs HvxWR:$Vdd32), (ins), "$Vdd32 = #0", -tc_8a6eb39a, TypeMAPPING>, Requires<[UseHVXV65]> { +tc_718b5c53, TypeMAPPING>, Requires<[UseHVXV65]> { let hasNewValue = 1; let opNewValue = 0; let isPseudo = 1; @@ -31081,7 +31224,7 @@ def V6_vdeal : HInst< (outs HvxVR:$Vy32, HvxVR:$Vx32), (ins HvxVR:$Vy32in, HvxVR:$Vx32in, IntRegs:$Rt32), "vdeal($Vy32,$Vx32,$Rt32)", -tc_5c120602, TypeCVI_VP_VS>, Enc_989021, Requires<[UseHVXV60]> { +tc_561aaa58, TypeCVI_VP_VS>, Enc_989021, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001111; @@ -31096,7 +31239,7 @@ def V6_vdealb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.b = vdeal($Vu32.b)", -tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_946013d8, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000000; @@ -31108,7 +31251,7 @@ def V6_vdealb4w : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vdeale($Vu32.b,$Vv32.b)", -tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { +tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111001; @@ -31142,7 +31285,7 @@ def V6_vdealh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.h = vdeal($Vu32.h)", -tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_946013d8, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000000; @@ -31165,7 +31308,7 @@ def V6_vdealvdd : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vdd32 = vdeal($Vu32,$Vv32,$Rt8)", -tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> { +tc_87adc037, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b1; let Inst{31-24} = 0b00011011; @@ -31177,7 +31320,7 @@ def V6_vdelta : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32 = vdelta($Vu32,$Vv32)", -tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { +tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111001; @@ -31189,7 +31332,7 @@ def V6_vdmpybus : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.h = vdmpy($Vu32.ub,$Rt32.b)", -tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001000; @@ -31201,7 +31344,7 @@ def V6_vdmpybus_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.h += vdmpy($Vu32.ub,$Rt32.b)", -tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001000; @@ -31239,7 +31382,7 @@ def V6_vdmpybus_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vdd32.h = vdmpy($Vuu32.ub,$Rt32.b)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001000; @@ -31251,7 +31394,7 @@ def V6_vdmpybus_dv_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vxx32.h += vdmpy($Vuu32.ub,$Rt32.b)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001000; @@ -31289,7 +31432,7 @@ def V6_vdmpyhb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vdmpy($Vu32.h,$Rt32.b)", -tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001000; @@ -31301,7 +31444,7 @@ def V6_vdmpyhb_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vdmpy($Vu32.h,$Rt32.b)", -tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001000; @@ -31339,7 +31482,7 @@ def V6_vdmpyhb_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vdd32.w = vdmpy($Vuu32.h,$Rt32.b)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -31351,7 +31494,7 @@ def V6_vdmpyhb_dv_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vxx32.w += vdmpy($Vuu32.h,$Rt32.b)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -31389,7 +31532,7 @@ def V6_vdmpyhisat : HInst< (outs HvxVR:$Vd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vd32.w = vdmpy($Vuu32.h,$Rt32.h):sat", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -31401,7 +31544,7 @@ def V6_vdmpyhisat_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vx32.w += vdmpy($Vuu32.h,$Rt32.h):sat", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_cc857d, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_cc857d, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -31439,7 +31582,7 @@ def V6_vdmpyhsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vdmpy($Vu32.h,$Rt32.h):sat", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -31451,7 +31594,7 @@ def V6_vdmpyhsat_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vdmpy($Vu32.h,$Rt32.h):sat", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -31489,7 +31632,7 @@ def V6_vdmpyhsuisat : HInst< (outs HvxVR:$Vd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vd32.w = vdmpy($Vuu32.h,$Rt32.uh,#1):sat", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -31501,7 +31644,7 @@ def V6_vdmpyhsuisat_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vx32.w += vdmpy($Vuu32.h,$Rt32.uh,#1):sat", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_cc857d, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_cc857d, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -31539,7 +31682,7 @@ def V6_vdmpyhsusat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vdmpy($Vu32.h,$Rt32.uh):sat", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -31551,7 +31694,7 @@ def V6_vdmpyhsusat_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vdmpy($Vu32.h,$Rt32.uh):sat", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -31589,7 +31732,7 @@ def V6_vdmpyhvsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vdmpy($Vu32.h,$Vv32.h):sat", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100000; @@ -31601,7 +31744,7 @@ def V6_vdmpyhvsat_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.w += vdmpy($Vu32.h,$Vv32.h):sat", -tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -31639,7 +31782,7 @@ def V6_vdsaduh : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vdd32.uw = vdsad($Vuu32.uh,$Rt32.uh)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001000; @@ -31651,7 +31794,7 @@ def V6_vdsaduh_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vxx32.uw += vdsad($Vuu32.uh,$Rt32.uh)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001011; @@ -31689,7 +31832,7 @@ def V6_veqb : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Qd4 = vcmp.eq($Vu32.b,$Vv32.b)", -tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111100; @@ -31701,7 +31844,7 @@ def V6_veqb_and : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 &= vcmp.eq($Vu32.b,$Vv32.b)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31712,7 +31855,7 @@ def V6_veqb_or : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 |= vcmp.eq($Vu32.b,$Vv32.b)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b010000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31724,7 +31867,7 @@ def V6_veqb_xor : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 ^= vcmp.eq($Vu32.b,$Vv32.b)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b100000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31735,7 +31878,7 @@ def V6_veqh : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Qd4 = vcmp.eq($Vu32.h,$Vv32.h)", -tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111100; @@ -31747,7 +31890,7 @@ def V6_veqh_and : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 &= vcmp.eq($Vu32.h,$Vv32.h)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31758,7 +31901,7 @@ def V6_veqh_or : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 |= vcmp.eq($Vu32.h,$Vv32.h)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b010001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31770,7 +31913,7 @@ def V6_veqh_xor : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 ^= vcmp.eq($Vu32.h,$Vv32.h)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b100001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31781,7 +31924,7 @@ def V6_veqw : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Qd4 = vcmp.eq($Vu32.w,$Vv32.w)", -tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111100; @@ -31793,7 +31936,7 @@ def V6_veqw_and : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 &= vcmp.eq($Vu32.w,$Vv32.w)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31804,7 +31947,7 @@ def V6_veqw_or : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 |= vcmp.eq($Vu32.w,$Vv32.w)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b010010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31816,7 +31959,7 @@ def V6_veqw_xor : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 ^= vcmp.eq($Vu32.w,$Vv32.w)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b100010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31827,7 +31970,7 @@ def V6_vgathermh : HInst< (outs), (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32), "vtmp.h = vgather($Rt32,$Mu2,$Vv32.h).h", -tc_66bb62ea, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> { +tc_e8797b98, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> { let Inst{12-5} = 0b00001000; let Inst{31-21} = 0b00101111000; let hasNewValue = 1; @@ -31843,7 +31986,7 @@ def V6_vgathermhq : HInst< (outs), (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32), "if ($Qs4) vtmp.h = vgather($Rt32,$Mu2,$Vv32.h).h", -tc_63e3d94c, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> { +tc_05ac6f98, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> { let Inst{12-7} = 0b001010; let Inst{31-21} = 0b00101111000; let hasNewValue = 1; @@ -31859,7 +32002,7 @@ def V6_vgathermhw : HInst< (outs), (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32), "vtmp.h = vgather($Rt32,$Mu2,$Vvv32.w).h", -tc_bfe309d5, TypeCVI_GATHER>, Enc_28dcbb, Requires<[UseHVXV65]> { +tc_05058f6f, TypeCVI_GATHER>, Enc_28dcbb, Requires<[UseHVXV65]> { let Inst{12-5} = 0b00010000; let Inst{31-21} = 0b00101111000; let hasNewValue = 1; @@ -31875,7 +32018,7 @@ def V6_vgathermhwq : HInst< (outs), (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32), "if ($Qs4) vtmp.h = vgather($Rt32,$Mu2,$Vvv32.w).h", -tc_98733e9d, TypeCVI_GATHER>, Enc_4e4a80, Requires<[UseHVXV65]> { +tc_fd7610da, TypeCVI_GATHER>, Enc_4e4a80, Requires<[UseHVXV65]> { let Inst{12-7} = 0b001100; let Inst{31-21} = 0b00101111000; let hasNewValue = 1; @@ -31891,7 +32034,7 @@ def V6_vgathermw : HInst< (outs), (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32), "vtmp.w = vgather($Rt32,$Mu2,$Vv32.w).w", -tc_66bb62ea, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> { +tc_e8797b98, TypeCVI_GATHER>, Enc_8b8927, Requires<[UseHVXV65]> { let Inst{12-5} = 0b00000000; let Inst{31-21} = 0b00101111000; let hasNewValue = 1; @@ -31907,7 +32050,7 @@ def V6_vgathermwq : HInst< (outs), (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32), "if ($Qs4) vtmp.w = vgather($Rt32,$Mu2,$Vv32.w).w", -tc_63e3d94c, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> { +tc_05ac6f98, TypeCVI_GATHER>, Enc_158beb, Requires<[UseHVXV65]> { let Inst{12-7} = 0b001000; let Inst{31-21} = 0b00101111000; let hasNewValue = 1; @@ -31923,7 +32066,7 @@ def V6_vgtb : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Qd4 = vcmp.gt($Vu32.b,$Vv32.b)", -tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111100; @@ -31935,7 +32078,7 @@ def V6_vgtb_and : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 &= vcmp.gt($Vu32.b,$Vv32.b)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000100; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31946,7 +32089,7 @@ def V6_vgtb_or : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 |= vcmp.gt($Vu32.b,$Vv32.b)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b010100; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31958,7 +32101,7 @@ def V6_vgtb_xor : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 ^= vcmp.gt($Vu32.b,$Vv32.b)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b100100; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31969,7 +32112,7 @@ def V6_vgth : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Qd4 = vcmp.gt($Vu32.h,$Vv32.h)", -tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111100; @@ -31981,7 +32124,7 @@ def V6_vgth_and : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 &= vcmp.gt($Vu32.h,$Vv32.h)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -31992,7 +32135,7 @@ def V6_vgth_or : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 |= vcmp.gt($Vu32.h,$Vv32.h)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b010101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32004,7 +32147,7 @@ def V6_vgth_xor : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 ^= vcmp.gt($Vu32.h,$Vv32.h)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b100101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32015,7 +32158,7 @@ def V6_vgtub : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Qd4 = vcmp.gt($Vu32.ub,$Vv32.ub)", -tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { let Inst{7-2} = 0b001000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111100; @@ -32027,7 +32170,7 @@ def V6_vgtub_and : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 &= vcmp.gt($Vu32.ub,$Vv32.ub)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b001000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32038,7 +32181,7 @@ def V6_vgtub_or : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 |= vcmp.gt($Vu32.ub,$Vv32.ub)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b011000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32050,7 +32193,7 @@ def V6_vgtub_xor : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 ^= vcmp.gt($Vu32.ub,$Vv32.ub)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b101000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32061,7 +32204,7 @@ def V6_vgtuh : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Qd4 = vcmp.gt($Vu32.uh,$Vv32.uh)", -tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { let Inst{7-2} = 0b001001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111100; @@ -32073,7 +32216,7 @@ def V6_vgtuh_and : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 &= vcmp.gt($Vu32.uh,$Vv32.uh)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b001001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32084,7 +32227,7 @@ def V6_vgtuh_or : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 |= vcmp.gt($Vu32.uh,$Vv32.uh)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b011001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32096,7 +32239,7 @@ def V6_vgtuh_xor : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 ^= vcmp.gt($Vu32.uh,$Vv32.uh)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b101001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32107,7 +32250,7 @@ def V6_vgtuw : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Qd4 = vcmp.gt($Vu32.uw,$Vv32.uw)", -tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { let Inst{7-2} = 0b001010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111100; @@ -32119,7 +32262,7 @@ def V6_vgtuw_and : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 &= vcmp.gt($Vu32.uw,$Vv32.uw)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b001010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32130,7 +32273,7 @@ def V6_vgtuw_or : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 |= vcmp.gt($Vu32.uw,$Vv32.uw)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b011010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32142,7 +32285,7 @@ def V6_vgtuw_xor : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 ^= vcmp.gt($Vu32.uw,$Vv32.uw)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b101010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32153,7 +32296,7 @@ def V6_vgtw : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Qd4 = vcmp.gt($Vu32.w,$Vv32.w)", -tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111100; @@ -32165,7 +32308,7 @@ def V6_vgtw_and : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 &= vcmp.gt($Vu32.w,$Vv32.w)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b000110; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32176,7 +32319,7 @@ def V6_vgtw_or : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 |= vcmp.gt($Vu32.w,$Vv32.w)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b010110; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32188,7 +32331,7 @@ def V6_vgtw_xor : HInst< (outs HvxQR:$Qx4), (ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), "$Qx4 ^= vcmp.gt($Vu32.w,$Vv32.w)", -tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> { let Inst{7-2} = 0b100110; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100100; @@ -32199,7 +32342,7 @@ def V6_vhist : HInst< (outs), (ins), "vhist", -tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV60]> { +tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV60]> { let Inst{13-0} = 0b10000010000000; let Inst{31-16} = 0b0001111000000000; let DecoderNamespace = "EXT_mmvec"; @@ -32208,7 +32351,7 @@ def V6_vhistq : HInst< (outs), (ins HvxQR:$Qv4), "vhist($Qv4)", -tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV60]> { +tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV60]> { let Inst{13-0} = 0b10000010000000; let Inst{21-16} = 0b000010; let Inst{31-24} = 0b00011110; @@ -32218,7 +32361,7 @@ def V6_vinsertwr : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, IntRegs:$Rt32), "$Vx32.w = vinsert($Rt32)", -tc_e231aa4f, TypeCVI_VX_LATE>, Enc_569cfe, Requires<[UseHVXV60]> { +tc_ac4046bc, TypeCVI_VX_LATE>, Enc_569cfe, Requires<[UseHVXV60]> { let Inst{13-5} = 0b100000001; let Inst{31-21} = 0b00011001101; let hasNewValue = 1; @@ -32230,7 +32373,7 @@ def V6_vlalignb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32 = vlalign($Vu32,$Vv32,$Rt8)", -tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> { +tc_56e64202, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011011; @@ -32242,7 +32385,7 @@ def V6_vlalignbi : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii), "$Vd32 = vlalign($Vu32,$Vv32,#$Ii)", -tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV60]> { +tc_56e64202, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV60]> { let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011110011; let hasNewValue = 1; @@ -32253,7 +32396,7 @@ def V6_vlsrb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.ub = vlsr($Vu32.ub,$Rt32)", -tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV62]> { +tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV62]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001100; @@ -32265,7 +32408,7 @@ def V6_vlsrh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.uh = vlsr($Vu32.uh,$Rt32)", -tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001100; @@ -32288,7 +32431,7 @@ def V6_vlsrhv : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vlsr($Vu32.h,$Vv32.h)", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111101; @@ -32311,7 +32454,7 @@ def V6_vlsrw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.uw = vlsr($Vu32.uw,$Rt32)", -tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_7417e785, TypeCVI_VS>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001100; @@ -32334,7 +32477,7 @@ def V6_vlsrwv : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vlsr($Vu32.w,$Vv32.w)", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111101; @@ -32357,7 +32500,7 @@ def V6_vlut4 : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, DoubleRegs:$Rtt32), "$Vd32.h = vlut4($Vu32.uh,$Rtt32.h)", -tc_fa99dc24, TypeCVI_VX_DV>, Enc_263841, Requires<[UseHVXV65]> { +tc_f1de44ef, TypeCVI_VX_DV>, Enc_263841, Requires<[UseHVXV65]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001011; @@ -32369,7 +32512,7 @@ def V6_vlutvvb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8)", -tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> { +tc_56e64202, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-24} = 0b00011011; @@ -32381,7 +32524,7 @@ def V6_vlutvvb_nm : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8):nomatch", -tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV62]> { +tc_56e64202, TypeCVI_VP>, Enc_a30110, Requires<[UseHVXV62]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011000; @@ -32393,7 +32536,7 @@ def V6_vlutvvb_oracc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vx32.b |= vlut32($Vu32.b,$Vv32.b,$Rt8)", -tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_245865, Requires<[UseHVXV60]> { +tc_9d1dc972, TypeCVI_VP_VS>, Enc_245865, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-24} = 0b00011011; @@ -32407,7 +32550,7 @@ def V6_vlutvvb_oracci : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii), "$Vx32.b |= vlut32($Vu32.b,$Vv32.b,#$Ii)", -tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_cd4705, Requires<[UseHVXV62]> { +tc_9d1dc972, TypeCVI_VP_VS>, Enc_cd4705, Requires<[UseHVXV62]> { let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100110; let hasNewValue = 1; @@ -32420,7 +32563,7 @@ def V6_vlutvvbi : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii), "$Vd32.b = vlut32($Vu32.b,$Vv32.b,#$Ii)", -tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV62]> { +tc_56e64202, TypeCVI_VP>, Enc_0b2e5b, Requires<[UseHVXV62]> { let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011110001; let hasNewValue = 1; @@ -32431,7 +32574,7 @@ def V6_vlutvwh : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8)", -tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> { +tc_87adc037, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b1; let Inst{31-24} = 0b00011011; @@ -32443,7 +32586,7 @@ def V6_vlutvwh_nm : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8):nomatch", -tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV62]> { +tc_87adc037, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV62]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-24} = 0b00011000; @@ -32455,7 +32598,7 @@ def V6_vlutvwh_oracc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,$Rt8)", -tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_7b523d, Requires<[UseHVXV60]> { +tc_9d1dc972, TypeCVI_VP_VS>, Enc_7b523d, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b1; let Inst{31-24} = 0b00011011; @@ -32469,7 +32612,7 @@ def V6_vlutvwh_oracci : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii), "$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,#$Ii)", -tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_1178da, Requires<[UseHVXV62]> { +tc_9d1dc972, TypeCVI_VP_VS>, Enc_1178da, Requires<[UseHVXV62]> { let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100111; let hasNewValue = 1; @@ -32482,7 +32625,7 @@ def V6_vlutvwhi : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, u3_0Imm:$Ii), "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,#$Ii)", -tc_4e2a5159, TypeCVI_VP_VS>, Enc_4b39e4, Requires<[UseHVXV62]> { +tc_87adc037, TypeCVI_VP_VS>, Enc_4b39e4, Requires<[UseHVXV62]> { let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011110011; let hasNewValue = 1; @@ -32493,7 +32636,7 @@ def V6_vmaxb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vmax($Vu32.b,$Vv32.b)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111001; @@ -32516,7 +32659,7 @@ def V6_vmaxh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vmax($Vu32.h,$Vv32.h)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111000; @@ -32539,7 +32682,7 @@ def V6_vmaxub : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vmax($Vu32.ub,$Vv32.ub)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111000; @@ -32562,7 +32705,7 @@ def V6_vmaxuh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vmax($Vu32.uh,$Vv32.uh)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111000; @@ -32585,7 +32728,7 @@ def V6_vmaxw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vmax($Vu32.w,$Vv32.w)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111001; @@ -32608,7 +32751,7 @@ def V6_vminb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vmin($Vu32.b,$Vv32.b)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111001; @@ -32631,7 +32774,7 @@ def V6_vminh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vmin($Vu32.h,$Vv32.h)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111000; @@ -32654,7 +32797,7 @@ def V6_vminub : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vmin($Vu32.ub,$Vv32.ub)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111000; @@ -32677,7 +32820,7 @@ def V6_vminuh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vmin($Vu32.uh,$Vv32.uh)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111000; @@ -32700,7 +32843,7 @@ def V6_vminw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vmin($Vu32.w,$Vv32.w)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111000; @@ -32723,7 +32866,7 @@ def V6_vmpabus : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vdd32.h = vmpa($Vuu32.ub,$Rt32.b)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -32735,7 +32878,7 @@ def V6_vmpabus_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vxx32.h += vmpa($Vuu32.ub,$Rt32.b)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -32773,7 +32916,7 @@ def V6_vmpabusv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.b)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100001; @@ -32796,7 +32939,7 @@ def V6_vmpabuu : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vdd32.h = vmpa($Vuu32.ub,$Rt32.ub)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV65]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV65]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001011; @@ -32808,7 +32951,7 @@ def V6_vmpabuu_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vxx32.h += vmpa($Vuu32.ub,$Rt32.ub)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV65]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV65]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001101; @@ -32846,7 +32989,7 @@ def V6_vmpabuuv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.ub)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100111; @@ -32869,7 +33012,7 @@ def V6_vmpahb : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vdd32.w = vmpa($Vuu32.h,$Rt32.b)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -32881,7 +33024,7 @@ def V6_vmpahb_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vxx32.w += vmpa($Vuu32.h,$Rt32.b)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -32919,7 +33062,7 @@ def V6_vmpahhsat : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, DoubleRegs:$Rtt32), "$Vx32.h = vmpa($Vx32in.h,$Vu32.h,$Rtt32.h):sat", -tc_7474003e, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> { +tc_90bcc1db, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001100; @@ -32932,7 +33075,7 @@ def V6_vmpauhb : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vdd32.w = vmpa($Vuu32.uh,$Rt32.b)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV62]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV62]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001100; @@ -32944,7 +33087,7 @@ def V6_vmpauhb_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vxx32.w += vmpa($Vuu32.uh,$Rt32.b)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV62]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV62]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001100; @@ -32982,7 +33125,7 @@ def V6_vmpauhuhsat : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, DoubleRegs:$Rtt32), "$Vx32.h = vmpa($Vx32in.h,$Vu32.uh,$Rtt32.uh):sat", -tc_7474003e, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> { +tc_90bcc1db, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001100; @@ -32995,7 +33138,7 @@ def V6_vmpsuhuhsat : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, DoubleRegs:$Rtt32), "$Vx32.h = vmps($Vx32in.h,$Vu32.uh,$Rtt32.uh):sat", -tc_7474003e, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> { +tc_90bcc1db, TypeCVI_VX_DV>, Enc_310ba1, Requires<[UseHVXV65]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001100; @@ -33008,7 +33151,7 @@ def V6_vmpybus : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vdd32.h = vmpy($Vu32.ub,$Rt32.b)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -33020,7 +33163,7 @@ def V6_vmpybus_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vxx32.h += vmpy($Vu32.ub,$Rt32.b)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -33058,7 +33201,7 @@ def V6_vmpybusv : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.h = vmpy($Vu32.ub,$Vv32.b)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100000; @@ -33070,7 +33213,7 @@ def V6_vmpybusv_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vxx32.h += vmpy($Vu32.ub,$Vv32.b)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -33108,7 +33251,7 @@ def V6_vmpybv : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.h = vmpy($Vu32.b,$Vv32.b)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100000; @@ -33120,7 +33263,7 @@ def V6_vmpybv_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vxx32.h += vmpy($Vu32.b,$Vv32.b)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -33158,7 +33301,7 @@ def V6_vmpyewuh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vmpye($Vu32.w,$Vv32.uh)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111111; @@ -33170,7 +33313,7 @@ def V6_vmpyewuh_64 : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32 = vmpye($Vu32.w,$Vv32.uh)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV62]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV62]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011110101; @@ -33193,7 +33336,7 @@ def V6_vmpyh : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vdd32.w = vmpy($Vu32.h,$Rt32.h)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001010; @@ -33205,7 +33348,7 @@ def V6_vmpyh_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vxx32.w += vmpy($Vu32.h,$Rt32.h)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV65]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV65]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001101; @@ -33243,7 +33386,7 @@ def V6_vmpyhsat_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vxx32.w += vmpy($Vu32.h,$Rt32.h):sat", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001010; @@ -33270,7 +33413,7 @@ def V6_vmpyhsrs : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:rnd:sat", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001010; @@ -33293,7 +33436,7 @@ def V6_vmpyhss : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:sat", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001010; @@ -33316,7 +33459,7 @@ def V6_vmpyhus : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.w = vmpy($Vu32.h,$Vv32.uh)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100001; @@ -33328,7 +33471,7 @@ def V6_vmpyhus_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vxx32.w += vmpy($Vu32.h,$Vv32.uh)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100001; @@ -33366,7 +33509,7 @@ def V6_vmpyhv : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.w = vmpy($Vu32.h,$Vv32.h)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100000; @@ -33378,7 +33521,7 @@ def V6_vmpyhv_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vxx32.w += vmpy($Vu32.h,$Vv32.h)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -33416,7 +33559,7 @@ def V6_vmpyhvsrs : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vmpy($Vu32.h,$Vv32.h):<<1:rnd:sat", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100001; @@ -33439,7 +33582,7 @@ def V6_vmpyieoh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vmpyieo($Vu32.h,$Vv32.h)", -tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111011; @@ -33451,7 +33594,7 @@ def V6_vmpyiewh_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.w += vmpyie($Vu32.w,$Vv32.h)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100010; @@ -33478,7 +33621,7 @@ def V6_vmpyiewuh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vmpyie($Vu32.w,$Vv32.uh)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111110; @@ -33490,7 +33633,7 @@ def V6_vmpyiewuh_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.w += vmpyie($Vu32.w,$Vv32.uh)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100001; @@ -33528,7 +33671,7 @@ def V6_vmpyih : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vmpyi($Vu32.h,$Vv32.h)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100001; @@ -33540,7 +33683,7 @@ def V6_vmpyih_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.h += vmpyi($Vu32.h,$Vv32.h)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100001; @@ -33578,7 +33721,7 @@ def V6_vmpyihb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.h = vmpyi($Vu32.h,$Rt32.b)", -tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001011; @@ -33590,7 +33733,7 @@ def V6_vmpyihb_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.h += vmpyi($Vu32.h,$Rt32.b)", -tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001011; @@ -33628,7 +33771,7 @@ def V6_vmpyiowh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vmpyio($Vu32.w,$Vv32.h)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111110; @@ -33651,7 +33794,7 @@ def V6_vmpyiwb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vmpyi($Vu32.w,$Rt32.b)", -tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001101; @@ -33663,7 +33806,7 @@ def V6_vmpyiwb_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vmpyi($Vu32.w,$Rt32.b)", -tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001010; @@ -33701,7 +33844,7 @@ def V6_vmpyiwh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vmpyi($Vu32.w,$Rt32.h)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001100; @@ -33713,7 +33856,7 @@ def V6_vmpyiwh_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vmpyi($Vu32.w,$Rt32.h)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001010; @@ -33751,7 +33894,7 @@ def V6_vmpyiwub : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vmpyi($Vu32.w,$Rt32.ub)", -tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV62]> { +tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV62]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001100; @@ -33763,7 +33906,7 @@ def V6_vmpyiwub_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vmpyi($Vu32.w,$Rt32.ub)", -tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV62]> { +tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV62]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001100; @@ -33801,7 +33944,7 @@ def V6_vmpyowh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:sat", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111111; @@ -33813,7 +33956,7 @@ def V6_vmpyowh_64_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vxx32 += vmpyo($Vu32.w,$Vv32.h)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV62]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100001; @@ -33838,7 +33981,7 @@ def V6_vmpyowh_rnd : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111010; @@ -33861,7 +34004,7 @@ def V6_vmpyowh_rnd_sacc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat:shift", -tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100001; @@ -33887,7 +34030,7 @@ def V6_vmpyowh_sacc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:sat:shift", -tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100001; @@ -33913,7 +34056,7 @@ def V6_vmpyub : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vdd32.uh = vmpy($Vu32.ub,$Rt32.ub)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001110; @@ -33925,7 +34068,7 @@ def V6_vmpyub_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vxx32.uh += vmpy($Vu32.ub,$Rt32.ub)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001100; @@ -33963,7 +34106,7 @@ def V6_vmpyubv : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.uh = vmpy($Vu32.ub,$Vv32.ub)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100000; @@ -33975,7 +34118,7 @@ def V6_vmpyubv_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vxx32.uh += vmpy($Vu32.ub,$Vv32.ub)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -34013,7 +34156,7 @@ def V6_vmpyuh : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vdd32.uw = vmpy($Vu32.uh,$Rt32.uh)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001010; @@ -34025,7 +34168,7 @@ def V6_vmpyuh_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vxx32.uw += vmpy($Vu32.uh,$Rt32.uh)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_5e8512, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001010; @@ -34063,7 +34206,7 @@ def V6_vmpyuhe : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.uw = vmpye($Vu32.uh,$Rt32.uh)", -tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV65]> { +tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV65]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001011; @@ -34075,7 +34218,7 @@ def V6_vmpyuhe_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.uw += vmpye($Vu32.uh,$Rt32.uh)", -tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV65]> { +tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV65]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001100; @@ -34089,7 +34232,7 @@ def V6_vmpyuhv : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.uw = vmpy($Vu32.uh,$Vv32.uh)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100001; @@ -34101,7 +34244,7 @@ def V6_vmpyuhv_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vxx32.uw += vmpy($Vu32.uh,$Vv32.uh)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100001; @@ -34139,7 +34282,7 @@ def V6_vmux : HInst< (outs HvxVR:$Vd32), (ins HvxQR:$Qt4, HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32 = vmux($Qt4,$Vu32,$Vv32)", -tc_a3127e12, TypeCVI_VA>, Enc_31db33, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_31db33, Requires<[UseHVXV60]> { let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011110111; @@ -34151,7 +34294,7 @@ def V6_vnavgb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vnavg($Vu32.b,$Vv32.b)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV65]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011111000; @@ -34174,7 +34317,7 @@ def V6_vnavgh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vnavg($Vu32.h,$Vv32.h)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100111; @@ -34197,7 +34340,7 @@ def V6_vnavgub : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vnavg($Vu32.ub,$Vv32.ub)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100111; @@ -34220,7 +34363,7 @@ def V6_vnavgw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vnavg($Vu32.w,$Vv32.w)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100111; @@ -34243,7 +34386,7 @@ def V6_vnccombine : HInst< (outs HvxWR:$Vdd32), (ins PredRegs:$Ps4, HvxVR:$Vu32, HvxVR:$Vv32), "if (!$Ps4) $Vdd32 = vcombine($Vu32,$Vv32)", -tc_2171ebae, TypeCVI_VA_DV>, Enc_8c2412, Requires<[UseHVXV60]> { +tc_af25efd9, TypeCVI_VA_DV>, Enc_8c2412, Requires<[UseHVXV60]> { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011010010; @@ -34257,7 +34400,7 @@ def V6_vncmov : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Ps4, HvxVR:$Vu32), "if (!$Ps4) $Vd32 = $Vu32", -tc_b06ab583, TypeCVI_VA>, Enc_770858, Requires<[UseHVXV60]> { +tc_3aacf4a8, TypeCVI_VA>, Enc_770858, Requires<[UseHVXV60]> { let Inst{7-7} = 0b0; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001101000100000; @@ -34271,7 +34414,7 @@ def V6_vnormamth : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.h = vnormamt($Vu32.h)", -tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000011; @@ -34294,7 +34437,7 @@ def V6_vnormamtw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.w = vnormamt($Vu32.w)", -tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000011; @@ -34317,7 +34460,7 @@ def V6_vnot : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32 = vnot($Vu32)", -tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_0ec46cf9, TypeCVI_VA>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000000; @@ -34329,7 +34472,7 @@ def V6_vor : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32 = vor($Vu32,$Vv32)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100001; @@ -34341,7 +34484,7 @@ def V6_vpackeb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vpacke($Vu32.h,$Vv32.h)", -tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { +tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111110; @@ -34364,7 +34507,7 @@ def V6_vpackeh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vpacke($Vu32.w,$Vv32.w)", -tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { +tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111110; @@ -34387,7 +34530,7 @@ def V6_vpackhb_sat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vpack($Vu32.h,$Vv32.h):sat", -tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { +tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111110; @@ -34410,7 +34553,7 @@ def V6_vpackhub_sat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vpack($Vu32.h,$Vv32.h):sat", -tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { +tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111110; @@ -34433,7 +34576,7 @@ def V6_vpackob : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vpacko($Vu32.h,$Vv32.h)", -tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { +tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111111; @@ -34456,7 +34599,7 @@ def V6_vpackoh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vpacko($Vu32.w,$Vv32.w)", -tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { +tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111111; @@ -34479,7 +34622,7 @@ def V6_vpackwh_sat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vpack($Vu32.w,$Vv32.w):sat", -tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { +tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111111; @@ -34502,7 +34645,7 @@ def V6_vpackwuh_sat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vpack($Vu32.w,$Vv32.w):sat", -tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { +tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111110; @@ -34525,7 +34668,7 @@ def V6_vpopcounth : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.h = vpopcount($Vu32.h)", -tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000010; @@ -34548,7 +34691,7 @@ def V6_vprefixqb : HInst< (outs HvxVR:$Vd32), (ins HvxQR:$Qv4), "$Vd32.b = prefixsum($Qv4)", -tc_d2cb81ea, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> { +tc_51d0ecc3, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> { let Inst{13-5} = 0b100000010; let Inst{21-16} = 0b000011; let Inst{31-24} = 0b00011110; @@ -34560,7 +34703,7 @@ def V6_vprefixqh : HInst< (outs HvxVR:$Vd32), (ins HvxQR:$Qv4), "$Vd32.h = prefixsum($Qv4)", -tc_d2cb81ea, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> { +tc_51d0ecc3, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> { let Inst{13-5} = 0b100001010; let Inst{21-16} = 0b000011; let Inst{31-24} = 0b00011110; @@ -34572,7 +34715,7 @@ def V6_vprefixqw : HInst< (outs HvxVR:$Vd32), (ins HvxQR:$Qv4), "$Vd32.w = prefixsum($Qv4)", -tc_d2cb81ea, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> { +tc_51d0ecc3, TypeCVI_VS>, Enc_6f83e7, Requires<[UseHVXV65]> { let Inst{13-5} = 0b100010010; let Inst{21-16} = 0b000011; let Inst{31-24} = 0b00011110; @@ -34584,7 +34727,7 @@ def V6_vrdelta : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32 = vrdelta($Vu32,$Vv32)", -tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { +tc_46d6c3e0, TypeCVI_VP>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111001; @@ -34596,7 +34739,7 @@ def V6_vrmpybub_rtt : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, DoubleRegs:$Rtt32), "$Vdd32.w = vrmpy($Vu32.b,$Rtt32.ub)", -tc_a807365d, TypeCVI_VS_VX>, Enc_cb785b, Requires<[UseHVXV65]> { +tc_cd94bfe0, TypeCVI_VS_VX>, Enc_cb785b, Requires<[UseHVXV65]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001110; @@ -34608,7 +34751,7 @@ def V6_vrmpybub_rtt_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, DoubleRegs:$Rtt32), "$Vxx32.w += vrmpy($Vu32.b,$Rtt32.ub)", -tc_ee927c0e, TypeCVI_VS_VX>, Enc_ad9bef, Requires<[UseHVXV65]> { +tc_15fdf750, TypeCVI_VS_VX>, Enc_ad9bef, Requires<[UseHVXV65]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001101; @@ -34646,7 +34789,7 @@ def V6_vrmpybus : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vrmpy($Vu32.ub,$Rt32.b)", -tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001000; @@ -34658,7 +34801,7 @@ def V6_vrmpybus_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vrmpy($Vu32.ub,$Rt32.b)", -tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001000; @@ -34696,7 +34839,7 @@ def V6_vrmpybusi : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii), "$Vdd32.w = vrmpy($Vuu32.ub,$Rt32.b,#$Ii)", -tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> { +tc_1ad8a370, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> { let Inst{7-6} = 0b10; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001010; @@ -34708,7 +34851,7 @@ def V6_vrmpybusi_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii), "$Vxx32.w += vrmpy($Vuu32.ub,$Rt32.b,#$Ii)", -tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> { +tc_e675c45a, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> { let Inst{7-6} = 0b10; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001010; @@ -34746,7 +34889,7 @@ def V6_vrmpybusv : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vrmpy($Vu32.ub,$Vv32.b)", -tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100000; @@ -34758,7 +34901,7 @@ def V6_vrmpybusv_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.w += vrmpy($Vu32.ub,$Vv32.b)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -34796,7 +34939,7 @@ def V6_vrmpybv : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vrmpy($Vu32.b,$Vv32.b)", -tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100000; @@ -34808,7 +34951,7 @@ def V6_vrmpybv_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.w += vrmpy($Vu32.b,$Vv32.b)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -34846,7 +34989,7 @@ def V6_vrmpyub : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.uw = vrmpy($Vu32.ub,$Rt32.ub)", -tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_649072c2, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001000; @@ -34858,7 +35001,7 @@ def V6_vrmpyub_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.uw += vrmpy($Vu32.ub,$Rt32.ub)", -tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_b091f1c6, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001000; @@ -34896,7 +35039,7 @@ def V6_vrmpyub_rtt : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, DoubleRegs:$Rtt32), "$Vdd32.uw = vrmpy($Vu32.ub,$Rtt32.ub)", -tc_a807365d, TypeCVI_VS_VX>, Enc_cb785b, Requires<[UseHVXV65]> { +tc_cd94bfe0, TypeCVI_VS_VX>, Enc_cb785b, Requires<[UseHVXV65]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001110; @@ -34908,7 +35051,7 @@ def V6_vrmpyub_rtt_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32, DoubleRegs:$Rtt32), "$Vxx32.uw += vrmpy($Vu32.ub,$Rtt32.ub)", -tc_ee927c0e, TypeCVI_VS_VX>, Enc_ad9bef, Requires<[UseHVXV65]> { +tc_15fdf750, TypeCVI_VS_VX>, Enc_ad9bef, Requires<[UseHVXV65]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001101; @@ -34946,7 +35089,7 @@ def V6_vrmpyubi : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii), "$Vdd32.uw = vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)", -tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> { +tc_1ad8a370, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> { let Inst{7-6} = 0b11; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001101; @@ -34958,7 +35101,7 @@ def V6_vrmpyubi_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii), "$Vxx32.uw += vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)", -tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> { +tc_e675c45a, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> { let Inst{7-6} = 0b11; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001011; @@ -34996,7 +35139,7 @@ def V6_vrmpyubv : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uw = vrmpy($Vu32.ub,$Vv32.ub)", -tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100000; @@ -35008,7 +35151,7 @@ def V6_vrmpyubv_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.uw += vrmpy($Vu32.ub,$Vv32.ub)", -tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -35042,11 +35185,276 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vrmpyzbb_rt : HInst< +(outs HvxVQR:$Vdddd32), +(ins HvxVR:$Vu32, IntRegsLow8:$Rt8), +"$Vdddd32.w = vrmpyz($Vu32.b,$Rt8.b)", +tc_61bf7c03, TypeCVI_4SLOT_MPY>, Enc_1bd127, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b0; +let Inst{31-19} = 0b0001100111101; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vrmpyzbb_rt_acc : HInst< +(outs HvxVQR:$Vyyyy32), +(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rt8), +"$Vyyyy32.w += vrmpyz($Vu32.b,$Rt8.b)", +tc_933f2b39, TypeCVI_4SLOT_MPY>, Enc_d7bc34, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-19} = 0b0001100111000; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vyyyy32 = $Vyyyy32in"; +} +def V6_vrmpyzbb_rx : HInst< +(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8), +(ins HvxVR:$Vu32, IntRegs:$Rx8in), +"$Vdddd32.w = vrmpyz($Vu32.b,$Rx8.b++)", +tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b0; +let Inst{31-19} = 0b0001100111100; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Rx8 = $Rx8in"; +} +def V6_vrmpyzbb_rx_acc : HInst< +(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8), +(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in), +"$Vyyyy32.w += vrmpyz($Vu32.b,$Rx8.b++)", +tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-19} = 0b0001100111001; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in"; +} +def V6_vrmpyzbub_rt : HInst< +(outs HvxVQR:$Vdddd32), +(ins HvxVR:$Vu32, IntRegsLow8:$Rt8), +"$Vdddd32.w = vrmpyz($Vu32.b,$Rt8.ub)", +tc_61bf7c03, TypeCVI_4SLOT_MPY>, Enc_1bd127, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b0; +let Inst{31-19} = 0b0001100111111; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vrmpyzbub_rt_acc : HInst< +(outs HvxVQR:$Vyyyy32), +(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rt8), +"$Vyyyy32.w += vrmpyz($Vu32.b,$Rt8.ub)", +tc_933f2b39, TypeCVI_4SLOT_MPY>, Enc_d7bc34, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-19} = 0b0001100111010; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vyyyy32 = $Vyyyy32in"; +} +def V6_vrmpyzbub_rx : HInst< +(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8), +(ins HvxVR:$Vu32, IntRegs:$Rx8in), +"$Vdddd32.w = vrmpyz($Vu32.b,$Rx8.ub++)", +tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b0; +let Inst{31-19} = 0b0001100111110; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Rx8 = $Rx8in"; +} +def V6_vrmpyzbub_rx_acc : HInst< +(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8), +(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in), +"$Vyyyy32.w += vrmpyz($Vu32.b,$Rx8.ub++)", +tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-19} = 0b0001100111011; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in"; +} +def V6_vrmpyzcb_rt : HInst< +(outs HvxVQR:$Vdddd32), +(ins HvxVR:$Vu32, IntRegsLow8:$Rt8), +"$Vdddd32.w = vr16mpyz($Vu32.c,$Rt8.b)", +tc_61bf7c03, TypeCVI_4SLOT_MPY>, Enc_1bd127, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b0; +let Inst{31-19} = 0b0001100111101; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vrmpyzcb_rt_acc : HInst< +(outs HvxVQR:$Vyyyy32), +(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rt8), +"$Vyyyy32.w += vr16mpyz($Vu32.c,$Rt8.b)", +tc_933f2b39, TypeCVI_4SLOT_MPY>, Enc_d7bc34, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-19} = 0b0001100111000; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vyyyy32 = $Vyyyy32in"; +} +def V6_vrmpyzcb_rx : HInst< +(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8), +(ins HvxVR:$Vu32, IntRegs:$Rx8in), +"$Vdddd32.w = vr16mpyz($Vu32.c,$Rx8.b++)", +tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b0; +let Inst{31-19} = 0b0001100111100; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Rx8 = $Rx8in"; +} +def V6_vrmpyzcb_rx_acc : HInst< +(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8), +(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in), +"$Vyyyy32.w += vr16mpyz($Vu32.c,$Rx8.b++)", +tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-19} = 0b0001100111001; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in"; +} +def V6_vrmpyzcbs_rt : HInst< +(outs HvxVQR:$Vdddd32), +(ins HvxVR:$Vu32, IntRegsLow8:$Rt8), +"$Vdddd32.w = vr16mpyzs($Vu32.c,$Rt8.b)", +tc_61bf7c03, TypeCVI_4SLOT_MPY>, Enc_1bd127, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b0; +let Inst{31-19} = 0b0001100111101; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vrmpyzcbs_rt_acc : HInst< +(outs HvxVQR:$Vyyyy32), +(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rt8), +"$Vyyyy32.w += vr16mpyzs($Vu32.c,$Rt8.b)", +tc_933f2b39, TypeCVI_4SLOT_MPY>, Enc_d7bc34, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-19} = 0b0001100111000; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vyyyy32 = $Vyyyy32in"; +} +def V6_vrmpyzcbs_rx : HInst< +(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8), +(ins HvxVR:$Vu32, IntRegs:$Rx8in), +"$Vdddd32.w = vr16mpyzs($Vu32.c,$Rx8.b++)", +tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b0; +let Inst{31-19} = 0b0001100111100; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Rx8 = $Rx8in"; +} +def V6_vrmpyzcbs_rx_acc : HInst< +(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8), +(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in), +"$Vyyyy32.w += vr16mpyzs($Vu32.c,$Rx8.b++)", +tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-19} = 0b0001100111001; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in"; +} +def V6_vrmpyznb_rt : HInst< +(outs HvxVQR:$Vdddd32), +(ins HvxVR:$Vu32, IntRegsLow8:$Rt8), +"$Vdddd32.w = vr8mpyz($Vu32.n,$Rt8.b)", +tc_61bf7c03, TypeCVI_4SLOT_MPY>, Enc_1bd127, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b0; +let Inst{31-19} = 0b0001100111111; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vrmpyznb_rt_acc : HInst< +(outs HvxVQR:$Vyyyy32), +(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegsLow8:$Rt8), +"$Vyyyy32.w += vr8mpyz($Vu32.n,$Rt8.b)", +tc_933f2b39, TypeCVI_4SLOT_MPY>, Enc_d7bc34, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-19} = 0b0001100111010; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vyyyy32 = $Vyyyy32in"; +} +def V6_vrmpyznb_rx : HInst< +(outs HvxVQR:$Vdddd32, IntRegsLow8:$Rx8), +(ins HvxVR:$Vu32, IntRegs:$Rx8in), +"$Vdddd32.w = vr8mpyz($Vu32.n,$Rx8.b++)", +tc_26a377fe, TypeCVI_4SLOT_MPY>, Enc_3b7631, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b0; +let Inst{31-19} = 0b0001100111110; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Rx8 = $Rx8in"; +} +def V6_vrmpyznb_rx_acc : HInst< +(outs HvxVQR:$Vyyyy32, IntRegsLow8:$Rx8), +(ins HvxVQR:$Vyyyy32in, HvxVR:$Vu32, IntRegs:$Rx8in), +"$Vyyyy32.w += vr8mpyz($Vu32.n,$Rx8.b++)", +tc_2d4051cd, TypeCVI_4SLOT_MPY>, Enc_bddee3, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-19} = 0b0001100111011; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in"; +} def V6_vror : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32 = vror($Vu32,$Rt32)", -tc_bf142ae2, TypeCVI_VP>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_6e7fa133, TypeCVI_VP>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001011; @@ -35054,11 +35462,34 @@ let hasNewValue = 1; let opNewValue = 0; let DecoderNamespace = "EXT_mmvec"; } +def V6_vrotr : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.uw = vrotr($Vu32.uw,$Vv32.uw)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV66]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011010100; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vrotr_alt : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32 = vrotr($Vu32,$Vv32)", +PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> { +let hasNewValue = 1; +let opNewValue = 0; +let isPseudo = 1; +let isCodeGenOnly = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vroundhb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vround($Vu32.h,$Vv32.h):sat", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111011; @@ -35081,7 +35512,7 @@ def V6_vroundhub : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vround($Vu32.h,$Vv32.h):sat", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111011; @@ -35104,7 +35535,7 @@ def V6_vrounduhub : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vround($Vu32.uh,$Vv32.uh):sat", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111111; @@ -35127,7 +35558,7 @@ def V6_vrounduwuh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vround($Vu32.uw,$Vv32.uw):sat", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111111; @@ -35150,7 +35581,7 @@ def V6_vroundwh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vround($Vu32.w,$Vv32.w):sat", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111011; @@ -35173,7 +35604,7 @@ def V6_vroundwuh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vround($Vu32.w,$Vv32.w):sat", -tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111011; @@ -35196,7 +35627,7 @@ def V6_vrsadubi : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii), "$Vdd32.uw = vrsad($Vuu32.ub,$Rt32.ub,#$Ii)", -tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> { +tc_1ad8a370, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[UseHVXV60]> { let Inst{7-6} = 0b11; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001010; @@ -35208,7 +35639,7 @@ def V6_vrsadubi_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii), "$Vxx32.uw += vrsad($Vuu32.ub,$Rt32.ub,#$Ii)", -tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> { +tc_e675c45a, TypeCVI_VX_DV>, Enc_d483b9, Requires<[UseHVXV60]> { let Inst{7-6} = 0b11; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001010; @@ -35242,11 +35673,23 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vsatdw : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.w = vsatdw($Vu32.w,$Vv32.w)", +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV66]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011101100; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vsathub : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vsat($Vu32.h,$Vv32.h)", -tc_9b9642a1, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[UseHVXV60]> { +tc_8772086c, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111011; @@ -35269,7 +35712,7 @@ def V6_vsatuwuh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vsat($Vu32.uw,$Vv32.uw)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111001; @@ -35292,7 +35735,7 @@ def V6_vsatwh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vsat($Vu32.w,$Vv32.w)", -tc_9b9642a1, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[UseHVXV60]> { +tc_8772086c, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111011; @@ -35315,7 +35758,7 @@ def V6_vsb : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32), "$Vdd32.h = vsxt($Vu32.b)", -tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> { +tc_b4416217, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000010; @@ -35338,7 +35781,7 @@ def V6_vscattermh : HInst< (outs), (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32), "vscatter($Rt32,$Mu2,$Vv32.h).h = $Vw32", -tc_4f190ba3, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> { +tc_9f363d21, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> { let Inst{7-5} = 0b001; let Inst{31-21} = 0b00101111001; let accessSize = HalfWordAccess; @@ -35349,7 +35792,7 @@ def V6_vscattermh_add : HInst< (outs), (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32), "vscatter($Rt32,$Mu2,$Vv32.h).h += $Vw32", -tc_4f190ba3, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> { +tc_9f363d21, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> { let Inst{7-5} = 0b101; let Inst{31-21} = 0b00101111001; let accessSize = HalfWordAccess; @@ -35380,7 +35823,7 @@ def V6_vscattermhq : HInst< (outs), (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32), "if ($Qs4) vscatter($Rt32,$Mu2,$Vv32.h).h = $Vw32", -tc_df54ad52, TypeCVI_SCATTER>, Enc_9be1de, Requires<[UseHVXV65]> { +tc_8e420e4d, TypeCVI_SCATTER>, Enc_9be1de, Requires<[UseHVXV65]> { let Inst{7-7} = 0b1; let Inst{31-21} = 0b00101111100; let accessSize = HalfWordAccess; @@ -35400,7 +35843,7 @@ def V6_vscattermhw : HInst< (outs), (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32, HvxVR:$Vw32), "vscatter($Rt32,$Mu2,$Vvv32.w).h = $Vw32", -tc_ec58f88a, TypeCVI_SCATTER_DV>, Enc_a641d0, Requires<[UseHVXV65]> { +tc_7273323b, TypeCVI_SCATTER_DV>, Enc_a641d0, Requires<[UseHVXV65]> { let Inst{7-5} = 0b010; let Inst{31-21} = 0b00101111001; let accessSize = HalfWordAccess; @@ -35411,7 +35854,7 @@ def V6_vscattermhw_add : HInst< (outs), (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32, HvxVR:$Vw32), "vscatter($Rt32,$Mu2,$Vvv32.w).h += $Vw32", -tc_ec58f88a, TypeCVI_SCATTER_DV>, Enc_a641d0, Requires<[UseHVXV65]> { +tc_7273323b, TypeCVI_SCATTER_DV>, Enc_a641d0, Requires<[UseHVXV65]> { let Inst{7-5} = 0b110; let Inst{31-21} = 0b00101111001; let accessSize = HalfWordAccess; @@ -35423,7 +35866,7 @@ def V6_vscattermhwq : HInst< (outs), (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32, HvxVR:$Vw32), "if ($Qs4) vscatter($Rt32,$Mu2,$Vvv32.w).h = $Vw32", -tc_94f43c04, TypeCVI_SCATTER_DV>, Enc_3d6d37, Requires<[UseHVXV65]> { +tc_58d21193, TypeCVI_SCATTER_DV>, Enc_3d6d37, Requires<[UseHVXV65]> { let Inst{7-7} = 0b0; let Inst{31-21} = 0b00101111101; let accessSize = HalfWordAccess; @@ -35434,7 +35877,7 @@ def V6_vscattermw : HInst< (outs), (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32), "vscatter($Rt32,$Mu2,$Vv32.w).w = $Vw32", -tc_4f190ba3, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> { +tc_9f363d21, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> { let Inst{7-5} = 0b000; let Inst{31-21} = 0b00101111001; let accessSize = WordAccess; @@ -35445,7 +35888,7 @@ def V6_vscattermw_add : HInst< (outs), (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32), "vscatter($Rt32,$Mu2,$Vv32.w).w += $Vw32", -tc_4f190ba3, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> { +tc_9f363d21, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> { let Inst{7-5} = 0b100; let Inst{31-21} = 0b00101111001; let accessSize = WordAccess; @@ -35504,7 +35947,7 @@ def V6_vscattermwq : HInst< (outs), (ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32), "if ($Qs4) vscatter($Rt32,$Mu2,$Vv32.w).w = $Vw32", -tc_df54ad52, TypeCVI_SCATTER>, Enc_9be1de, Requires<[UseHVXV65]> { +tc_8e420e4d, TypeCVI_SCATTER>, Enc_9be1de, Requires<[UseHVXV65]> { let Inst{7-7} = 0b0; let Inst{31-21} = 0b00101111100; let accessSize = WordAccess; @@ -35524,7 +35967,7 @@ def V6_vsh : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32), "$Vdd32.w = vsxt($Vu32.h)", -tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> { +tc_b4416217, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000010; @@ -35547,7 +35990,7 @@ def V6_vshufeh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vshuffe($Vu32.h,$Vv32.h)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111010; @@ -35570,7 +36013,7 @@ def V6_vshuff : HInst< (outs HvxVR:$Vy32, HvxVR:$Vx32), (ins HvxVR:$Vy32in, HvxVR:$Vx32in, IntRegs:$Rt32), "vshuff($Vy32,$Vx32,$Rt32)", -tc_5c120602, TypeCVI_VP_VS>, Enc_989021, Requires<[UseHVXV60]> { +tc_561aaa58, TypeCVI_VP_VS>, Enc_989021, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001111; @@ -35585,7 +36028,7 @@ def V6_vshuffb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.b = vshuff($Vu32.b)", -tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_946013d8, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000010; @@ -35608,7 +36051,7 @@ def V6_vshuffeb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vshuffe($Vu32.b,$Vv32.b)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111010; @@ -35631,7 +36074,7 @@ def V6_vshuffh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), "$Vd32.h = vshuff($Vu32.h)", -tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> { +tc_946013d8, TypeCVI_VP>, Enc_e7581c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000001; @@ -35654,7 +36097,7 @@ def V6_vshuffob : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vshuffo($Vu32.b,$Vv32.b)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111010; @@ -35677,7 +36120,7 @@ def V6_vshuffvdd : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), "$Vdd32 = vshuff($Vu32,$Vv32,$Rt8)", -tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> { +tc_87adc037, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{31-24} = 0b00011011; @@ -35689,7 +36132,7 @@ def V6_vshufoeb : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.b = vshuffoe($Vu32.b,$Vv32.b)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111010; @@ -35712,7 +36155,7 @@ def V6_vshufoeh : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.h = vshuffoe($Vu32.h,$Vv32.h)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111010; @@ -35735,7 +36178,7 @@ def V6_vshufoh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vshuffo($Vu32.h,$Vv32.h)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111010; @@ -35758,7 +36201,7 @@ def V6_vsubb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vsub($Vu32.b,$Vv32.b)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100010; @@ -35781,7 +36224,7 @@ def V6_vsubb_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.b = vsub($Vuu32.b,$Vvv32.b)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100100; @@ -35804,7 +36247,7 @@ def V6_vsubbnq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if (!$Qv4) $Vx32.b -= $Vu32.b", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000010; @@ -35830,7 +36273,7 @@ def V6_vsubbq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if ($Qv4) $Vx32.b -= $Vu32.b", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000001; @@ -35856,7 +36299,7 @@ def V6_vsubbsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.b = vsub($Vu32.b,$Vv32.b):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111001; @@ -35879,7 +36322,7 @@ def V6_vsubbsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.b = vsub($Vuu32.b,$Vvv32.b):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011110101; @@ -35902,7 +36345,7 @@ def V6_vsubcarry : HInst< (outs HvxVR:$Vd32, HvxQR:$Qx4), (ins HvxVR:$Vu32, HvxVR:$Vv32, HvxQR:$Qx4in), "$Vd32.w = vsub($Vu32.w,$Vv32.w,$Qx4):carry", -tc_5a9fc4ec, TypeCVI_VA>, Enc_b43b67, Requires<[UseHVXV62]> { +tc_7e6a3e89, TypeCVI_VA>, Enc_b43b67, Requires<[UseHVXV62]> { let Inst{7-7} = 0b1; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100101; @@ -35911,11 +36354,25 @@ let opNewValue = 0; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Qx4 = $Qx4in"; } +def V6_vsubcarryo : HInst< +(outs HvxVR:$Vd32, HvxQR:$Qe4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.w,$Qe4 = vsub($Vu32.w,$Vv32.w):carry", +tc_e35c1e93, TypeCOPROC_VX>, Enc_c1d806, Requires<[UseHVXV66]> { +let Inst{7-7} = 0b1; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011101101; +let hasNewValue = 1; +let opNewValue = 0; +let hasNewValue2 = 1; +let opNewValue2 = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vsubh : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vsub($Vu32.h,$Vv32.h)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100010; @@ -35938,7 +36395,7 @@ def V6_vsubh_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.h = vsub($Vuu32.h,$Vvv32.h)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100100; @@ -35961,7 +36418,7 @@ def V6_vsubhnq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if (!$Qv4) $Vx32.h -= $Vu32.h", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000010; @@ -35987,7 +36444,7 @@ def V6_vsubhq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if ($Qv4) $Vx32.h -= $Vu32.h", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000001; @@ -36013,7 +36470,7 @@ def V6_vsubhsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vsub($Vu32.h,$Vv32.h):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100011; @@ -36036,7 +36493,7 @@ def V6_vsubhsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.h = vsub($Vuu32.h,$Vvv32.h):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100101; @@ -36059,7 +36516,7 @@ def V6_vsubhw : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.w = vsub($Vu32.h,$Vv32.h)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100101; @@ -36082,7 +36539,7 @@ def V6_vsububh : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.h = vsub($Vu32.ub,$Vv32.ub)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100101; @@ -36105,7 +36562,7 @@ def V6_vsububsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vsub($Vu32.ub,$Vv32.ub):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100011; @@ -36128,7 +36585,7 @@ def V6_vsububsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.ub = vsub($Vuu32.ub,$Vvv32.ub):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100100; @@ -36151,7 +36608,7 @@ def V6_vsubububb_sat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.ub = vsub($Vu32.ub,$Vv32.b):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011110101; @@ -36163,7 +36620,7 @@ def V6_vsubuhsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uh = vsub($Vu32.uh,$Vv32.uh):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100011; @@ -36186,7 +36643,7 @@ def V6_vsubuhsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.uh = vsub($Vuu32.uh,$Vvv32.uh):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100100; @@ -36209,7 +36666,7 @@ def V6_vsubuhw : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32.w = vsub($Vu32.uh,$Vv32.uh)", -tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV60]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100101; @@ -36232,7 +36689,7 @@ def V6_vsubuwsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.uw = vsub($Vu32.uw,$Vv32.uw):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV62]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011111110; @@ -36255,7 +36712,7 @@ def V6_vsubuwsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.uw = vsub($Vuu32.uw,$Vvv32.uw):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV62]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011110101; @@ -36278,7 +36735,7 @@ def V6_vsubw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vsub($Vu32.w,$Vv32.w)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100010; @@ -36301,7 +36758,7 @@ def V6_vsubw_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.w = vsub($Vuu32.w,$Vvv32.w)", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b101; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100100; @@ -36324,7 +36781,7 @@ def V6_vsubwnq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if (!$Qv4) $Vx32.w -= $Vu32.w", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000010; @@ -36350,7 +36807,7 @@ def V6_vsubwq : HInst< (outs HvxVR:$Vx32), (ins HvxQR:$Qv4, HvxVR:$Vx32in, HvxVR:$Vu32), "if ($Qv4) $Vx32.w -= $Vu32.w", -tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { +tc_257f6f7c, TypeCVI_VA>, Enc_a90628, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{21-16} = 0b000010; @@ -36376,7 +36833,7 @@ def V6_vsubwsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vsub($Vu32.w,$Vv32.w):sat", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100011; @@ -36399,7 +36856,7 @@ def V6_vsubwsat_dv : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32), "$Vdd32.w = vsub($Vuu32.w,$Vvv32.w):sat", -tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { +tc_db5555f3, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100101; @@ -36422,7 +36879,7 @@ def V6_vswap : HInst< (outs HvxWR:$Vdd32), (ins HvxQR:$Qt4, HvxVR:$Vu32, HvxVR:$Vv32), "$Vdd32 = vswap($Qt4,$Vu32,$Vv32)", -tc_316c637c, TypeCVI_VA_DV>, Enc_3dac0b, Requires<[UseHVXV60]> { +tc_71646d06, TypeCVI_VA_DV>, Enc_3dac0b, Requires<[UseHVXV60]> { let Inst{7-7} = 0b0; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011110101; @@ -36434,7 +36891,7 @@ def V6_vtmpyb : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vdd32.h = vtmpy($Vuu32.b,$Rt32.b)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001000; @@ -36446,7 +36903,7 @@ def V6_vtmpyb_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vxx32.h += vtmpy($Vuu32.b,$Rt32.b)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001000; @@ -36484,7 +36941,7 @@ def V6_vtmpybus : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vdd32.h = vtmpy($Vuu32.ub,$Rt32.b)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001000; @@ -36496,7 +36953,7 @@ def V6_vtmpybus_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vxx32.h += vtmpy($Vuu32.ub,$Rt32.b)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001000; @@ -36534,7 +36991,7 @@ def V6_vtmpyhb : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, IntRegs:$Rt32), "$Vdd32.w = vtmpy($Vuu32.h,$Rt32.b)", -tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { +tc_0b04c6c7, TypeCVI_VX_DV>, Enc_aad80c, Requires<[UseHVXV60]> { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001101; @@ -36546,7 +37003,7 @@ def V6_vtmpyhb_acc : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxWR:$Vuu32, IntRegs:$Rt32), "$Vxx32.w += vtmpy($Vuu32.h,$Rt32.b)", -tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { +tc_660769f1, TypeCVI_VX_DV>, Enc_d6990d, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001000; @@ -36598,7 +37055,7 @@ def V6_vunpackb : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32), "$Vdd32.h = vunpack($Vu32.b)", -tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> { +tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000001; @@ -36621,7 +37078,7 @@ def V6_vunpackh : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32), "$Vdd32.w = vunpack($Vu32.h)", -tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> { +tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000001; @@ -36644,7 +37101,7 @@ def V6_vunpackob : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32), "$Vxx32.h |= vunpacko($Vu32.b)", -tc_72ad7b54, TypeCVI_VP_VS>, Enc_500cb0, Requires<[UseHVXV60]> { +tc_2c745bb8, TypeCVI_VP_VS>, Enc_500cb0, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-16} = 0b0001111000000000; @@ -36670,7 +37127,7 @@ def V6_vunpackoh : HInst< (outs HvxWR:$Vxx32), (ins HvxWR:$Vxx32in, HvxVR:$Vu32), "$Vxx32.w |= vunpacko($Vu32.h)", -tc_72ad7b54, TypeCVI_VP_VS>, Enc_500cb0, Requires<[UseHVXV60]> { +tc_2c745bb8, TypeCVI_VP_VS>, Enc_500cb0, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-16} = 0b0001111000000000; @@ -36697,7 +37154,7 @@ def V6_vunpackub : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32), "$Vdd32.uh = vunpack($Vu32.ub)", -tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> { +tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000001; @@ -36720,7 +37177,7 @@ def V6_vunpackuh : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32), "$Vdd32.uw = vunpack($Vu32.uh)", -tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> { +tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000001; @@ -36743,7 +37200,7 @@ def V6_vwhist128 : HInst< (outs), (ins), "vwhist128", -tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> { +tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> { let Inst{13-0} = 0b10010010000000; let Inst{31-16} = 0b0001111000000000; let DecoderNamespace = "EXT_mmvec"; @@ -36752,7 +37209,7 @@ def V6_vwhist128m : HInst< (outs), (ins u1_0Imm:$Ii), "vwhist128(#$Ii)", -tc_b77635b4, TypeCVI_HIST>, Enc_efaed8, Requires<[UseHVXV62]> { +tc_b28e51aa, TypeCVI_HIST>, Enc_efaed8, Requires<[UseHVXV62]> { let Inst{7-0} = 0b10000000; let Inst{13-9} = 0b10011; let Inst{31-16} = 0b0001111000000000; @@ -36762,7 +37219,7 @@ def V6_vwhist128q : HInst< (outs), (ins HvxQR:$Qv4), "vwhist128($Qv4)", -tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> { +tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> { let Inst{13-0} = 0b10010010000000; let Inst{21-16} = 0b000010; let Inst{31-24} = 0b00011110; @@ -36772,7 +37229,7 @@ def V6_vwhist128qm : HInst< (outs), (ins HvxQR:$Qv4, u1_0Imm:$Ii), "vwhist128($Qv4,#$Ii)", -tc_28978789, TypeCVI_HIST>, Enc_802dc0, Requires<[UseHVXV62]> { +tc_767c4e9d, TypeCVI_HIST>, Enc_802dc0, Requires<[UseHVXV62]> { let Inst{7-0} = 0b10000000; let Inst{13-9} = 0b10011; let Inst{21-16} = 0b000010; @@ -36783,7 +37240,7 @@ def V6_vwhist256 : HInst< (outs), (ins), "vwhist256", -tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> { +tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> { let Inst{13-0} = 0b10001010000000; let Inst{31-16} = 0b0001111000000000; let DecoderNamespace = "EXT_mmvec"; @@ -36792,7 +37249,7 @@ def V6_vwhist256_sat : HInst< (outs), (ins), "vwhist256:sat", -tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> { +tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> { let Inst{13-0} = 0b10001110000000; let Inst{31-16} = 0b0001111000000000; let DecoderNamespace = "EXT_mmvec"; @@ -36801,7 +37258,7 @@ def V6_vwhist256q : HInst< (outs), (ins HvxQR:$Qv4), "vwhist256($Qv4)", -tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> { +tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> { let Inst{13-0} = 0b10001010000000; let Inst{21-16} = 0b000010; let Inst{31-24} = 0b00011110; @@ -36811,7 +37268,7 @@ def V6_vwhist256q_sat : HInst< (outs), (ins HvxQR:$Qv4), "vwhist256($Qv4):sat", -tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> { +tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> { let Inst{13-0} = 0b10001110000000; let Inst{21-16} = 0b000010; let Inst{31-24} = 0b00011110; @@ -36821,7 +37278,7 @@ def V6_vxor : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32 = vxor($Vu32,$Vv32)", -tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b111; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100001; @@ -36833,7 +37290,7 @@ def V6_vzb : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32), "$Vdd32.uh = vzxt($Vu32.ub)", -tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> { +tc_b4416217, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000010; @@ -36856,7 +37313,7 @@ def V6_vzh : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32), "$Vdd32.uw = vzxt($Vu32.uh)", -tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> { +tc_b4416217, TypeCVI_VA_DV>, Enc_dd766a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-16} = 0b0001111000000010; @@ -36875,11 +37332,122 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_zLd_ai : HInst< +(outs), +(ins IntRegs:$Rt32, s4_0Imm:$Ii), +"z = vmem($Rt32+#$Ii)", +tc_e699ae41, TypeCVI_ZW>, Enc_ff3442, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-0} = 0b00000000; +let Inst{12-11} = 0b00; +let Inst{31-21} = 0b00101100000; +let addrMode = BaseImmOffset; +let mayLoad = 1; +let isRestrictNoSlot1Store = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_zLd_pi : HInst< +(outs IntRegs:$Rx32), +(ins IntRegs:$Rx32in, s3_0Imm:$Ii), +"z = vmem($Rx32++#$Ii)", +tc_a0dbea28, TypeCVI_ZW>, Enc_6c9ee0, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-0} = 0b00000000; +let Inst{13-11} = 0b000; +let Inst{31-21} = 0b00101101000; +let addrMode = PostInc; +let mayLoad = 1; +let isRestrictNoSlot1Store = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Rx32 = $Rx32in"; +} +def V6_zLd_ppu : HInst< +(outs IntRegs:$Rx32), +(ins IntRegs:$Rx32in, ModRegs:$Mu2), +"z = vmem($Rx32++$Mu2)", +tc_a0dbea28, TypeCVI_ZW>, Enc_44661f, Requires<[UseHVXV66,UseZReg]> { +let Inst{12-0} = 0b0000000000001; +let Inst{31-21} = 0b00101101000; +let addrMode = PostInc; +let mayLoad = 1; +let isRestrictNoSlot1Store = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Rx32 = $Rx32in"; +} +def V6_zLd_pred_ai : HInst< +(outs), +(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii), +"if ($Pv4) z = vmem($Rt32+#$Ii)", +tc_dd5b0695, TypeCVI_ZW>, Enc_ef601b, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-0} = 0b00000000; +let Inst{31-21} = 0b00101100100; +let isPredicated = 1; +let addrMode = BaseImmOffset; +let mayLoad = 1; +let isRestrictNoSlot1Store = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_zLd_pred_pi : HInst< +(outs IntRegs:$Rx32), +(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii), +"if ($Pv4) z = vmem($Rx32++#$Ii)", +tc_3ad719fb, TypeCVI_ZW>, Enc_6baed4, Requires<[UseHVXV66,UseZReg]> { +let Inst{7-0} = 0b00000000; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00101101100; +let isPredicated = 1; +let addrMode = PostInc; +let mayLoad = 1; +let isRestrictNoSlot1Store = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Rx32 = $Rx32in"; +} +def V6_zLd_pred_ppu : HInst< +(outs IntRegs:$Rx32), +(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2), +"if ($Pv4) z = vmem($Rx32++$Mu2)", +tc_3ad719fb, TypeCVI_ZW>, Enc_691712, Requires<[UseHVXV66,UseZReg]> { +let Inst{10-0} = 0b00000000001; +let Inst{31-21} = 0b00101101100; +let isPredicated = 1; +let addrMode = PostInc; +let mayLoad = 1; +let isRestrictNoSlot1Store = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Rx32 = $Rx32in"; +} +def V6_zextract : HInst< +(outs HvxVR:$Vd32), +(ins IntRegs:$Rt32), +"$Vd32 = zextract($Rt32)", +tc_5bf8afbb, TypeCVI_VP>, Enc_a5ed8a, Requires<[UseHVXV66,UseZReg]> { +let Inst{13-5} = 0b000001001; +let Inst{31-21} = 0b00011001101; +let hasNewValue = 1; +let opNewValue = 0; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_zld0 : HInst< +(outs), +(ins IntRegs:$Rt32), +"z = vmem($Rt32)", +PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> { +let isPseudo = 1; +let isCodeGenOnly = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_zldp0 : HInst< +(outs), +(ins PredRegs:$Pv4, IntRegs:$Rt32), +"if ($Pv4) z = vmem($Rt32)", +PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> { +let isPseudo = 1; +let isCodeGenOnly = 1; +let DecoderNamespace = "EXT_mmvec"; +} def Y2_barrier : HInst< (outs), (ins), "barrier", -tc_367f7f3d, TypeST>, Enc_e3b0c4 { +tc_8c99de45, TypeST>, Enc_e3b0c4 { let Inst{13-0} = 0b00000000000000; let Inst{31-16} = 0b1010100000000000; let isSoloAX = 1; @@ -36889,7 +37457,7 @@ def Y2_break : HInst< (outs), (ins), "brkpt", -tc_4ca572d4, TypeCR>, Enc_e3b0c4 { +tc_9ad9998f, TypeCR>, Enc_e3b0c4 { let Inst{13-0} = 0b00000000000000; let Inst{31-16} = 0b0110110000100000; let isSolo = 1; @@ -36898,7 +37466,7 @@ def Y2_dccleana : HInst< (outs), (ins IntRegs:$Rs32), "dccleana($Rs32)", -tc_00e7c26e, TypeST>, Enc_ecbcc8 { +tc_b857bf4e, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000000; let isRestrictSlot1AOK = 1; @@ -36908,7 +37476,7 @@ def Y2_dccleaninva : HInst< (outs), (ins IntRegs:$Rs32), "dccleaninva($Rs32)", -tc_00e7c26e, TypeST>, Enc_ecbcc8 { +tc_b857bf4e, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000010; let isRestrictSlot1AOK = 1; @@ -36918,7 +37486,7 @@ def Y2_dcfetch : HInst< (outs), (ins IntRegs:$Rs32), "dcfetch($Rs32)", -tc_3da80ba5, TypeMAPPING> { +tc_d63f638c, TypeMAPPING> { let hasSideEffects = 1; let isPseudo = 1; let isCodeGenOnly = 1; @@ -36927,7 +37495,7 @@ def Y2_dcfetchbo : HInst< (outs), (ins IntRegs:$Rs32, u11_3Imm:$Ii), "dcfetch($Rs32+#$Ii)", -tc_4d9914c9, TypeLD>, Enc_2d829e { +tc_9ca930f7, TypeLD>, Enc_2d829e { let Inst{13-11} = 0b000; let Inst{31-21} = 0b10010100000; let addrMode = BaseImmOffset; @@ -36938,7 +37506,7 @@ def Y2_dcinva : HInst< (outs), (ins IntRegs:$Rs32), "dcinva($Rs32)", -tc_00e7c26e, TypeST>, Enc_ecbcc8 { +tc_b857bf4e, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000001; let isRestrictSlot1AOK = 1; @@ -36948,7 +37516,7 @@ def Y2_dczeroa : HInst< (outs), (ins IntRegs:$Rs32), "dczeroa($Rs32)", -tc_00e7c26e, TypeST>, Enc_ecbcc8 { +tc_b857bf4e, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000110; let isRestrictSlot1AOK = 1; @@ -36959,7 +37527,7 @@ def Y2_icinva : HInst< (outs), (ins IntRegs:$Rs32), "icinva($Rs32)", -tc_999d32db, TypeJ>, Enc_ecbcc8 { +tc_5d7f5414, TypeJ>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b01010110110; let isSolo = 1; @@ -36968,7 +37536,7 @@ def Y2_isync : HInst< (outs), (ins), "isync", -tc_b13761ae, TypeJ>, Enc_e3b0c4 { +tc_8b121f4a, TypeJ>, Enc_e3b0c4 { let Inst{13-0} = 0b00000000000010; let Inst{31-16} = 0b0101011111000000; let isSolo = 1; @@ -36977,16 +37545,25 @@ def Y2_syncht : HInst< (outs), (ins), "syncht", -tc_367f7f3d, TypeST>, Enc_e3b0c4 { +tc_8c99de45, TypeST>, Enc_e3b0c4 { let Inst{13-0} = 0b00000000000000; let Inst{31-16} = 0b1010100001000000; let isSolo = 1; } +def Y2_wait : HInst< +(outs), +(ins IntRegs:$Rs32), +"wait($Rs32)", +tc_174516e8, TypeCR>, Enc_ecbcc8, Requires<[HasV65]> { +let Inst{13-0} = 0b00000000000000; +let Inst{31-21} = 0b01100100010; +let isSolo = 1; +} def Y4_l2fetch : HInst< (outs), (ins IntRegs:$Rs32, IntRegs:$Rt32), "l2fetch($Rs32,$Rt32)", -tc_daa058fa, TypeST>, Enc_ca3887 { +tc_fe211424, TypeST>, Enc_ca3887 { let Inst{7-0} = 0b00000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10100110000; @@ -36998,7 +37575,7 @@ def Y4_trace : HInst< (outs), (ins IntRegs:$Rs32), "trace($Rs32)", -tc_c82dc1ff, TypeCR>, Enc_ecbcc8 { +tc_6b25e783, TypeCR>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b01100010010; let isSoloAX = 1; @@ -37007,7 +37584,7 @@ def Y5_l2fetch : HInst< (outs), (ins IntRegs:$Rs32, DoubleRegs:$Rtt32), "l2fetch($Rs32,$Rtt32)", -tc_daa058fa, TypeST>, Enc_e6abcf { +tc_fe211424, TypeST>, Enc_e6abcf { let Inst{7-0} = 0b00000000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b10100110100; @@ -37019,7 +37596,7 @@ def dep_A2_addsat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = add($Rs32,$Rt32):sat:deprecated", -tc_b44c6e2a, TypeALU64>, Enc_5ab2be { +tc_779080bf, TypeALU64>, Enc_5ab2be { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101100; @@ -37032,7 +37609,7 @@ def dep_A2_subsat : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rt32, IntRegs:$Rs32), "$Rd32 = sub($Rt32,$Rs32):sat:deprecated", -tc_b44c6e2a, TypeALU64>, Enc_bd6011 { +tc_779080bf, TypeALU64>, Enc_bd6011 { let Inst{7-5} = 0b100; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010101100; @@ -37045,7 +37622,7 @@ def dep_S2_packhl : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rdd32 = packhl($Rs32,$Rt32):deprecated", -tc_540fdfbc, TypeALU64>, Enc_be32a5 { +tc_946df596, TypeALU64>, Enc_be32a5 { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11010100000; diff --git a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td new file mode 100644 index 0000000000000000000000000000000000000000..2346fa5726264dfa393ebed107c098b4036dfd68 --- /dev/null +++ b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -0,0 +1,3337 @@ +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Automatically generated file, please consult code owner before editing. +//===----------------------------------------------------------------------===// + + +// V5 Scalar Instructions. + +def: Pat<(int_hexagon_S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsatwh DoubleRegs:$src1), + (S2_vsatwh DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpysu_up IntRegs:$src1, IntRegs:$src2), + (M2_mpysu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmpysc_s1 IntRegs:$src1, IntRegs:$src2), + (M2_cmpysc_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmpysc_s0 IntRegs:$src1, IntRegs:$src2), + (M2_cmpysc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2), + (M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2), + (S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfmax IntRegs:$src1, IntRegs:$src2), + (F2_sfmax IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vabswsat DoubleRegs:$src1), + (A2_vabswsat DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred:$src2), + (S2_asr_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2), + (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred:$src2), + (A4_combineri IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vcmpy_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vcmpy_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_notp DoubleRegs:$src1), + (A2_notp DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_or_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), + (C4_or_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_brevp DoubleRegs:$src1), + (S2_brevp DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_cl1 IntRegs:$src1), + (S2_cl1 IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_cmplte IntRegs:$src1, IntRegs:$src2), + (C4_cmplte IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_maxup DoubleRegs:$src1, DoubleRegs:$src2), + (A2_maxup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred:$src2), + (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_interleave DoubleRegs:$src1), + (S2_interleave DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_abssat IntRegs:$src1), + (A2_abssat IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpgtu IntRegs:$src1, IntRegs:$src2), + (C2_cmpgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2), + (C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred:$src2), + (A4_cmphgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2), + (C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyi IntRegs:$src1, IntRegs:$src2), + (M2_mpyi IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_df2uw_chop DoubleRegs:$src1), + (F2_conv_df2uw_chop DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpheq IntRegs:$src1, IntRegs:$src2), + (A4_cmpheq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3), + (S2_extractup DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_ntstbit_r IntRegs:$src1, IntRegs:$src2), + (S4_ntstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_w2sf IntRegs:$src1), + (F2_conv_w2sf IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_not PredRegs:$src1), + (C2_not PredRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_tfrpr PredRegs:$src1), + (C2_tfrpr PredRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpbgt IntRegs:$src1, IntRegs:$src2), + (A4_cmpbgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_rcmpneqi IntRegs:$src1, s32_0ImmPred:$src2), + (A4_rcmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_orp DoubleRegs:$src1, DoubleRegs:$src2), + (A2_orp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_up IntRegs:$src1, IntRegs:$src2), + (M2_mpyu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2), + (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2), + (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpbgtu IntRegs:$src1, IntRegs:$src2), + (A4_cmpbgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2), + (A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpbgti IntRegs:$src1, s8_0ImmPred:$src2), + (A4_cmpbgti IntRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addsp IntRegs:$src1, DoubleRegs:$src2), + (A2_addsp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2), + (S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred:$src2), + (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2), + (S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_pmpyw IntRegs:$src1, IntRegs:$src2), + (M4_pmpyw IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsathb DoubleRegs:$src1), + (S2_vsathb DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred:$src2), + (S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred:$src2), + (S2_asl_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_absp DoubleRegs:$src1), + (A2_absp DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_all8 PredRegs:$src1), + (C2_all8 PredRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2), + (M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_bitsset IntRegs:$src1, IntRegs:$src2), + (C2_bitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpysip IntRegs:$src1, u32_0ImmPred:$src2), + (M2_mpysip IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2), + (M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_boundscheck IntRegs:$src1, DoubleRegs:$src2), + (A4_boundscheck IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2), + (M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_fastcorner9 PredRegs:$src1, PredRegs:$src2), + (C4_fastcorner9 PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2), + (M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subsat IntRegs:$src1, IntRegs:$src2), + (A2_subsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_r IntRegs:$src1, IntRegs:$src2), + (S2_asl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2), + (S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_ud2df DoubleRegs:$src1), + (F2_conv_ud2df DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S4_subi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vzxthw IntRegs:$src1), + (S2_vzxthw IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfadd IntRegs:$src1, IntRegs:$src2), + (F2_sfadd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_sub IntRegs:$src1, IntRegs:$src2), + (A2_sub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2), + (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3, u5_0ImmPred:$src4), + (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3, u5_0ImmPred:$src4)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_packhl IntRegs:$src1, IntRegs:$src2), + (S2_packhl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred:$src2), + (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_svsubhs IntRegs:$src1, IntRegs:$src2), + (A2_svsubhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2), + (A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_d2df DoubleRegs:$src1), + (F2_conv_d2df DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2), + (C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vconj DoubleRegs:$src1), + (A2_vconj DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2), + (S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2), + (S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2), + (A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2), + (S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_clbp DoubleRegs:$src1), + (S2_clbp DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_deinterleave DoubleRegs:$src1), + (S2_deinterleave DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_any8 PredRegs:$src1), + (C2_any8 PredRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_togglebit_r IntRegs:$src1, IntRegs:$src2), + (S2_togglebit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred:$src2), + (S2_togglebit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_uw2sf IntRegs:$src1), + (F2_conv_uw2sf IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsathb_nopack DoubleRegs:$src1), + (S2_vsathb_nopack DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_cmacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_cmacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_hh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_hh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_clrbit_r IntRegs:$src1, IntRegs:$src2), + (S2_clrbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), + (C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred:$src2), + (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2), + (M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred:$src1), + (A2_tfrsi s32_0ImmPred:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_svnavgh IntRegs:$src1, IntRegs:$src2), + (A2_svnavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred:$src2), + (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred:$src2), + (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_svavgh IntRegs:$src1, IntRegs:$src2), + (A2_svavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2), + (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_combine_hl IntRegs:$src1, IntRegs:$src2), + (A2_combine_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_up IntRegs:$src1, IntRegs:$src2), + (M2_mpy_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_combine_hh IntRegs:$src1, IntRegs:$src2), + (A2_combine_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_negsat IntRegs:$src1), + (A2_negsat IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_bitsplit IntRegs:$src1, IntRegs:$src2), + (A4_bitsplit IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vabshsat DoubleRegs:$src1), + (A2_vabshsat DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyui IntRegs:$src1, IntRegs:$src2), + (M2_mpyui IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2), + (A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_mpyrr_addi u32_0ImmPred:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), + (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmachs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmachs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmachs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmachs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sffixupn IntRegs:$src1, IntRegs:$src2), + (F2_sffixupn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2), + (A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2), + (A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_xorp DoubleRegs:$src1, DoubleRegs:$src2), + (A2_xorp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_tfrpcp DoubleRegs:$src1), + (A4_tfrpcp DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2), + (A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2), + (A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2), + (A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2), + (A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_zxtb IntRegs:$src1), + (A2_zxtb IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_zxth IntRegs:$src1), + (A2_zxth IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_or_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_or_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2), + (C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_sat DoubleRegs:$src1), + (A2_sat DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addsat IntRegs:$src1, IntRegs:$src2), + (A2_addsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_svavghs IntRegs:$src1, IntRegs:$src2), + (A2_svavghs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred:$src2), + (C2_bitsclri IntRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2), + (A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2), + (A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_addp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3), + (A4_addp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_xor PredRegs:$src1, PredRegs:$src2), + (C2_xor PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_df2ud_chop DoubleRegs:$src1), + (F2_conv_df2ud_chop DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_or_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), + (C4_or_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2), + (S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsathub DoubleRegs:$src1), + (S2_vsathub DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_df2sf DoubleRegs:$src1), + (F2_conv_df2sf DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2), + (M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_sxth IntRegs:$src1), + (A2_sxth IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_sxtb IntRegs:$src1), + (A2_sxtb IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_or_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), + (C4_or_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_sxtw IntRegs:$src1), + (A2_sxtw IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2), + (M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_cl1p DoubleRegs:$src1), + (S2_cl1p DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_andnp DoubleRegs:$src1, DoubleRegs:$src2), + (A4_andnp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_parityp DoubleRegs:$src1, DoubleRegs:$src2), + (S2_parityp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfcmpeq IntRegs:$src1, IntRegs:$src2), + (F2_sfcmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vaddb_map DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_clbnorm IntRegs:$src1), + (S2_clbnorm IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3), + (S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_tstbit_r IntRegs:$src1, IntRegs:$src2), + (S2_tstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3), + (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred:$src2), + (S2_tstbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2), + (S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2), + (S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_or_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_or_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_w2df IntRegs:$src1), + (F2_conv_w2df IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2), + (A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2), + (C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vcnegh DoubleRegs:$src1, IntRegs:$src2), + (S2_vcnegh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred:$src2), + (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vdmpys_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vdmpys_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_cl0p DoubleRegs:$src1), + (S2_cl0p DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3), + (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sffixupd IntRegs:$src1, IntRegs:$src2), + (F2_sffixupd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_cmacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_cmacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_ct1 IntRegs:$src1), + (S2_ct1 IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_ct0 IntRegs:$src1), + (S2_ct0 IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred:$src2), + (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sffixupr IntRegs:$src1), + (F2_sffixupr IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_andn PredRegs:$src1, PredRegs:$src2), + (C2_andn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2), + (M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), + (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred:$src2), + (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred:$src2), + (A4_round_ri IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_max IntRegs:$src1, IntRegs:$src2), + (A2_max IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_round_rr IntRegs:$src1, IntRegs:$src2), + (A4_round_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_combineii s8_0ImmPred:$src1, u32_0ImmPred:$src2), + (A4_combineii s8_0ImmPred:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_combineir s32_0ImmPred:$src1, IntRegs:$src2), + (A4_combineir s32_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), + (C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_rcmpeq IntRegs:$src1, IntRegs:$src2), + (A4_rcmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2), + (M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vzxtbh IntRegs:$src1), + (S2_vzxtbh IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2), + (S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_combinew IntRegs:$src1, IntRegs:$src2), + (A2_combinew IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2), + (M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S4_ori_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_nbitsset IntRegs:$src1, IntRegs:$src2), + (C4_nbitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2), + (A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_modwrapu IntRegs:$src1, IntRegs:$src2), + (A4_modwrapu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_rcmpneq IntRegs:$src1, IntRegs:$src2), + (A4_rcmpneq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred:$src1), + (F2_sfimm_p u10_0ImmPred:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred:$src1), + (F2_sfimm_n u10_0ImmPred:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2), + (M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_d2sf DoubleRegs:$src1), + (F2_conv_d2sf DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred:$src2), + (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfcmpuo IntRegs:$src1, IntRegs:$src2), + (F2_sfcmpuo IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsatwh_nopack DoubleRegs:$src1), + (S2_vsatwh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_minu IntRegs:$src1, IntRegs:$src2), + (A2_minu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_minp DoubleRegs:$src1, DoubleRegs:$src2), + (A2_minp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), + (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfcmpge IntRegs:$src1, IntRegs:$src2), + (F2_sfcmpge IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfmin IntRegs:$src1, IntRegs:$src2), + (F2_sfmin IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfcmpgt IntRegs:$src1, IntRegs:$src2), + (F2_sfcmpgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_vpmpyh IntRegs:$src1, IntRegs:$src2), + (M4_vpmpyh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmacuhs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmacuhs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_roundsat DoubleRegs:$src1), + (A2_roundsat DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_ct1p DoubleRegs:$src1), + (S2_ct1p DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_extract_rp IntRegs:$src1, DoubleRegs:$src2), + (S4_extract_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred:$src2), + (C4_cmplteui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S4_addi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_tfrcpp CtrRegs64:$src1), + (A4_tfrcpp CtrRegs64:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred:$src2), + (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred:$src2), + (A4_cmphgti IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmphgtu IntRegs:$src1, IntRegs:$src2), + (A4_cmphgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S4_subi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2), + (S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_sath IntRegs:$src1), + (A2_sath IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_satb IntRegs:$src1), + (A2_satb IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3, u6_0ImmPred:$src4), + (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3, u6_0ImmPred:$src4)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2), + (S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2), + (S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_vxaddsubh DoubleRegs:$src1, DoubleRegs:$src2), + (S4_vxaddsubh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_asrh IntRegs:$src1), + (A2_asrh IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2), + (S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_or PredRegs:$src1, PredRegs:$src2), + (C2_or PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_xor IntRegs:$src1, IntRegs:$src2), + (A2_xor IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_add IntRegs:$src1, IntRegs:$src2), + (A2_add IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2), + (M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2), + (M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfinvsqrta IntRegs:$src1), + (F2_sfinvsqrta IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_ct0p DoubleRegs:$src1), + (S2_ct0p DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_svaddh IntRegs:$src1, IntRegs:$src2), + (A2_svaddh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vcrotate DoubleRegs:$src1, IntRegs:$src2), + (S2_vcrotate DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_aslh IntRegs:$src1), + (A2_aslh IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2), + (A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2), + (A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2), + (M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2), + (S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsplatrh IntRegs:$src1), + (S2_vsplatrh IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_r IntRegs:$src1, IntRegs:$src2), + (S2_asr_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2), + (A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsplatrb IntRegs:$src1), + (S2_vsplatrb IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2), + (A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2), + (M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2), + (M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_muxri PredRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3), + (C2_muxri PredRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_pxfer_map PredRegs:$src1), + (C2_pxfer_map PredRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2), + (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred:$src1, IntRegs:$src2, u6_0ImmPred:$src3), + (M4_mpyri_addi u32_0ImmPred:$src1, IntRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S4_andi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3), + (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_tfrcrr CtrRegs:$src1), + (A2_tfrcrr CtrRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3), + (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_orn PredRegs:$src1, PredRegs:$src2), + (C2_orn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfmpy IntRegs:$src1, IntRegs:$src2), + (F2_sfmpy IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2), + (S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_and_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_and_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2), + (S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_mask PredRegs:$src1), + (C2_mask PredRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2), + (M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2), + (A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vrsadub DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vrsadub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_tfrrcr IntRegs:$src1), + (A2_tfrrcr IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2), + (F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), + (M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2), + (A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vmaxb DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vmaxb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsxthw IntRegs:$src1), + (S2_vsxthw IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S4_andi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpgt IntRegs:$src1, IntRegs:$src2), + (C2_cmpgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_df2d_chop DoubleRegs:$src1), + (F2_conv_df2d_chop DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_sf2w IntRegs:$src1), + (F2_conv_sf2w IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfclass IntRegs:$src1, u5_0ImmPred:$src2), + (F2_sfclass IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred:$src3), + (S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2), + (M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred:$src2), + (A2_addi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addp DoubleRegs:$src1, DoubleRegs:$src2), + (A2_addp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2), + (M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_clbpnorm DoubleRegs:$src1), + (S4_clbpnorm DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_round_rr_sat IntRegs:$src1, IntRegs:$src2), + (A4_round_rr_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2), + (S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_sf2uw IntRegs:$src1), + (F2_conv_sf2uw IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_sf2ud IntRegs:$src1), + (F2_conv_sf2ud IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_sf2uw_chop IntRegs:$src1), + (F2_conv_sf2uw_chop IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_vw DoubleRegs:$src1, IntRegs:$src2), + (S2_asl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsatwuh_nopack DoubleRegs:$src1), + (S2_vsatwuh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2), + (S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_svsubuhs IntRegs:$src1, IntRegs:$src2), + (A2_svsubuhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M5_vmpybsu IntRegs:$src1, IntRegs:$src2), + (M5_vmpybsu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2), + (A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_and_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), + (C4_and_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2), + (S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_r IntRegs:$src1, IntRegs:$src2), + (S2_lsr_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_subp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3), + (A4_subp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_vitpack PredRegs:$src1, PredRegs:$src2), + (C2_vitpack PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3), + (S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_nbitsclr IntRegs:$src1, IntRegs:$src2), + (C4_nbitsclr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmpys_s1 IntRegs:$src1, IntRegs:$src2), + (M2_cmpys_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmpys_s0 IntRegs:$src1, IntRegs:$src2), + (M2_cmpys_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2), + (F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2), + (S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_and PredRegs:$src1, PredRegs:$src2), + (C2_and PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S5_popcountp DoubleRegs:$src1), + (S5_popcountp DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3), + (S4_extractp DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_cl0 IntRegs:$src1), + (S2_cl0 IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred:$src2), + (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_cmpneq IntRegs:$src1, IntRegs:$src2), + (C4_cmpneq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_clb IntRegs:$src1), + (S2_clb IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred:$src2), + (A4_bitspliti IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2), + (S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_sf2d IntRegs:$src1), + (F2_conv_sf2d IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred:$src1), + (F2_dfimm_n u10_0ImmPred:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmphgt IntRegs:$src1, IntRegs:$src2), + (A4_cmphgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred:$src1), + (F2_dfimm_p u10_0ImmPred:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred:$src2, IntRegs:$src3), + (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3), + (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cnacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_cnacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cnacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_cnacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_maxu IntRegs:$src1, IntRegs:$src2), + (A2_maxu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_maxp DoubleRegs:$src1, DoubleRegs:$src2), + (A2_maxp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred:$src2), + (A2_andir IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfrecipa IntRegs:$src1, IntRegs:$src2), + (F2_sfrecipa IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_combineii s32_0ImmPred:$src1, s8_0ImmPred:$src2), + (A2_combineii s32_0ImmPred:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_orn IntRegs:$src1, IntRegs:$src2), + (A4_orn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred:$src2), + (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred:$src2), + (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_r IntRegs:$src1, IntRegs:$src2), + (S2_lsl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2), + (S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_or IntRegs:$src1, IntRegs:$src2), + (A2_or IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2), + (F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpeq IntRegs:$src1, IntRegs:$src2), + (C2_cmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_tfrp DoubleRegs:$src1), + (A2_tfrp DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_and_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), + (C4_and_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsathub_nopack DoubleRegs:$src1), + (S2_vsathub_nopack DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_satuh IntRegs:$src1), + (A2_satuh IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_satub IntRegs:$src1), + (A2_satub IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2), + (M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), + (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2), + (C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred:$src2), + (A2_tfrih IntRegs:$src1, u16_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred:$src2), + (A2_tfril IntRegs:$src1, u16_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3), + (M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vtrunehb DoubleRegs:$src1), + (S2_vtrunehb DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vabsw DoubleRegs:$src1), + (A2_vabsw DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vabsh DoubleRegs:$src1), + (A2_vabsh DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfsub IntRegs:$src1, IntRegs:$src2), + (F2_sfsub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_muxii PredRegs:$src1, s32_0ImmPred:$src2, s8_0ImmPred:$src3), + (C2_muxii PredRegs:$src1, s32_0ImmPred:$src2, s8_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), + (C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_swiz IntRegs:$src1), + (A2_swiz IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2), + (M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2), + (M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2), + (A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_df2w_chop DoubleRegs:$src1), + (F2_conv_df2w_chop DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_and IntRegs:$src1, IntRegs:$src2), + (A2_and IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3), + (S4_extract IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_ud2sf DoubleRegs:$src1), + (F2_conv_ud2sf DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_tfr IntRegs:$src1), + (A2_tfr IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subri s32_0ImmPred:$src1, IntRegs:$src2), + (A2_subri s32_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M5_vmpybuu IntRegs:$src1, IntRegs:$src2), + (M5_vmpybuu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2), + (S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_brev IntRegs:$src1), + (S2_brev IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred:$src2), + (S2_clrbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2), + (S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), + (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vrndpackwhs DoubleRegs:$src1), + (S2_vrndpackwhs DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2), + (S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred:$src4), + (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred:$src4)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_uw2df IntRegs:$src1), + (F2_conv_uw2df IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred:$src2), + (A2_orir IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_andp DoubleRegs:$src1, DoubleRegs:$src2), + (A2_andp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2), + (S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_min IntRegs:$src1, IntRegs:$src2), + (A2_min IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred:$src2), + (M2_mpysmi IntRegs:$src1, m32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2), + (S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_sf2df IntRegs:$src1), + (F2_conv_sf2df IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vtrunohb DoubleRegs:$src1), + (S2_vtrunohb DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_sf2d_chop IntRegs:$src1), + (F2_conv_sf2d_chop IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_df2w DoubleRegs:$src1), + (F2_conv_df2w DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred:$src2), + (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_df2d DoubleRegs:$src1), + (F2_conv_df2d DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_svadduhs IntRegs:$src1, IntRegs:$src2), + (A2_svadduhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_sf2w_chop IntRegs:$src1), + (F2_conv_sf2w_chop IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_svsathub IntRegs:$src1), + (S2_svsathub IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_setbit_r IntRegs:$src1, IntRegs:$src2), + (S2_setbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4), + (F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred:$src2), + (F2_dfclass DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_df2ud DoubleRegs:$src1), + (F2_conv_df2ud DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_df2uw DoubleRegs:$src1), + (F2_conv_df2uw DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2), + (M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2), + (M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred:$src2), + (C4_cmpltei IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_cmplteu IntRegs:$src1, IntRegs:$src2), + (C4_cmplteu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vsubb_map DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2), + (A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2), + (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_minup DoubleRegs:$src1, DoubleRegs:$src2), + (A2_minup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3), + (S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_combine_lh IntRegs:$src1, IntRegs:$src2), + (A2_combine_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_combine_ll IntRegs:$src1, IntRegs:$src2), + (A2_combine_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred:$src2), + (S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2), + (A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_svaddhs IntRegs:$src1, IntRegs:$src2), + (A2_svaddhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S4_ori_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vminw DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vminw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vminh DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vminh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vminb DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vminb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2), + (M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_lsli s6_0ImmPred:$src1, IntRegs:$src2), + (S4_lsli s6_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2), + (S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2), + (M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2), + (M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_tfrrp IntRegs:$src1), + (C2_tfrrp IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vtrunowh DoubleRegs:$src1, DoubleRegs:$src2), + (S2_vtrunowh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_abs IntRegs:$src1), + (A2_abs IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpbeq IntRegs:$src1, IntRegs:$src2), + (A4_cmpbeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_negp DoubleRegs:$src1), + (A2_negp DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred:$src2), + (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2), + (A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsatwuh DoubleRegs:$src1), + (S2_vsatwuh DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2), + (F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_svsathb IntRegs:$src1), + (S2_svsathb IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2), + (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred:$src2), + (A4_cround_ri IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred:$src2), + (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cround_rr IntRegs:$src1, IntRegs:$src2), + (A4_cround_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_mux PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (C2_mux PredRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2), + (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_shuffeb DoubleRegs:$src1, DoubleRegs:$src2), + (S2_shuffeb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3), + (S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vminub DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vminub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3), + (S2_extractu IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_svsubh IntRegs:$src1, IntRegs:$src2), + (A2_svsubh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred:$src2), + (S4_clbaddi IntRegs:$src1, s6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vsxtbh IntRegs:$src1), + (S2_vsxtbh IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subp DoubleRegs:$src1, DoubleRegs:$src2), + (A2_subp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2), + (M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_parity IntRegs:$src1, IntRegs:$src2), + (S4_parity IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S4_addi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred:$src2), + (A4_cmpheqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_conv_sf2ud_chop IntRegs:$src1), + (F2_conv_sf2ud_chop IntRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + (S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2), + (A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2), + (A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred:$src2), + (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred:$src2), + (S2_setbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_andn IntRegs:$src1, IntRegs:$src2), + (A4_andn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2), + (M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vrndpackwh DoubleRegs:$src1), + (S2_vrndpackwh DoubleRegs:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2), + (A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_bitsclr IntRegs:$src1, IntRegs:$src2), + (C2_bitsclr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred:$src2), + (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_ornp DoubleRegs:$src1, DoubleRegs:$src2), + (A4_ornp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), + (C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2), + (A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2), + (A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2), + (M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2), + (M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred:$src2), + (C4_nbitsclri IntRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2), + (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; + +// V55 Scalar Instructions. + +def: Pat<(int_hexagon_A5_ACS DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + (A5_ACS DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV55]>; + +// V60 Scalar Instructions. + +def: Pat<(int_hexagon_S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred:$src2), + (S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred:$src2), + (S6_rol_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), + (S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), + (S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>; + +// V62 Scalar Instructions. + +def: Pat<(int_hexagon_S6_vtrunehb_ppp DoubleRegs:$src1, DoubleRegs:$src2), + (S6_vtrunehb_ppp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>; +def: Pat<(int_hexagon_V6_ldntnt0 IntRegs:$src1), + (V6_ldntnt0 IntRegs:$src1)>, Requires<[HasV62]>; +def: Pat<(int_hexagon_M6_vabsdiffub DoubleRegs:$src1, DoubleRegs:$src2), + (M6_vabsdiffub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>; +def: Pat<(int_hexagon_S6_vtrunohb_ppp DoubleRegs:$src1, DoubleRegs:$src2), + (S6_vtrunohb_ppp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>; +def: Pat<(int_hexagon_M6_vabsdiffb DoubleRegs:$src1, DoubleRegs:$src2), + (M6_vabsdiffb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>; +def: Pat<(int_hexagon_A6_vminub_RdP DoubleRegs:$src1, DoubleRegs:$src2), + (A6_vminub_RdP DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>; +def: Pat<(int_hexagon_S6_vsplatrbp IntRegs:$src1), + (S6_vsplatrbp IntRegs:$src1)>, Requires<[HasV62]>; + +// V65 Scalar Instructions. + +def: Pat<(int_hexagon_A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2), + (A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV65]>; + +// V66 Scalar Instructions. + +def: Pat<(int_hexagon_F2_dfsub DoubleRegs:$src1, DoubleRegs:$src2), + (F2_dfsub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>; +def: Pat<(int_hexagon_F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2), + (F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>; +def: Pat<(int_hexagon_M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + (M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV66]>; +def: Pat<(int_hexagon_S2_mask u5_0ImmPred:$src1, u5_0ImmPred:$src2), + (S2_mask u5_0ImmPred:$src1, u5_0ImmPred:$src2)>, Requires<[HasV66]>; + +// V60 HVX Instructions. + +def: Pat<(int_hexagon_V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqb_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vminub HvxVR:$src1, HvxVR:$src2), + (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vminub_128B HvxVR:$src1, HvxVR:$src2), + (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaslw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2), + (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyhvsrs_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsathub HvxVR:$src1, HvxVR:$src2), + (V6_vsathub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsathub_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsathub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddh_dv HvxWR:$src1, HvxWR:$src2), + (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddh_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), + (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybusi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), + (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vshufoh HvxVR:$src1, HvxVR:$src2), + (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vshufoh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrwv HvxVR:$src1, HvxVR:$src2), + (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrwv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2), + (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhsuisat_128B HvxWR:$src1, IntRegs:$src2), + (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), + (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), + (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vnavgw HvxVR:$src1, HvxVR:$src2), + (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vnavgw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vnavgh HvxVR:$src1, HvxVR:$src2), + (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vnavgh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavgub HvxVR:$src1, HvxVR:$src2), + (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavgub_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubb HvxVR:$src1, HvxVR:$src2), + (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubb_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavgubrnd HvxVR:$src1, HvxVR:$src2), + (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavgubrnd_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybusv HvxVR:$src1, HvxVR:$src2), + (V6_vrmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybusv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vrmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vroundhb HvxVR:$src1, HvxVR:$src2), + (V6_vroundhb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vroundhb_128B HvxVR:$src1, HvxVR:$src2), + (V6_vroundhb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadduhsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsububsat HvxVR:$src1, HvxVR:$src2), + (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsububsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpabus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmux_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyhus HvxVR:$src1, HvxVR:$src2), + (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyhus_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vpackeb HvxVR:$src1, HvxVR:$src2), + (V6_vpackeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vpackeb_128B HvxVR:$src1, HvxVR:$src2), + (V6_vpackeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavghrnd HvxVR:$src1, HvxVR:$src2), + (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavghrnd_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vtran2x2_map HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vtran2x2_map HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vtran2x2_map_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vtran2x2_map HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdelta HvxVR:$src1, HvxVR:$src2), + (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdelta_128B HvxVR:$src1, HvxVR:$src2), + (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtuh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vtmpyhb HvxWR:$src1, IntRegs:$src2), + (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vtmpyhb_128B HvxWR:$src1, IntRegs:$src2), + (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vpackob HvxVR:$src1, HvxVR:$src2), + (V6_vpackob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vpackob_128B HvxVR:$src1, HvxVR:$src2), + (V6_vpackob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmaxh HvxVR:$src1, HvxVR:$src2), + (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmaxh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vtmpybus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubuhsat HvxVR:$src1, HvxVR:$src2), + (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubuhsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_pred_or HvxQR:$src1, HvxQR:$src2), + (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_pred_or_128B HvxQR:$src1, HvxQR:$src2), + (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyub_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_lo HvxWR:$src1), + (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_lo_128B HvxWR:$src1), + (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubb_dv HvxWR:$src1, HvxWR:$src2), + (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubb_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubhsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyiwh HvxVR:$src1, IntRegs:$src2), + (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyiwh_128B HvxVR:$src1, IntRegs:$src2), + (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyiwb HvxVR:$src1, IntRegs:$src2), + (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyiwb_128B HvxVR:$src1, IntRegs:$src2), + (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldu0 IntRegs:$src1), + (V6_ldu0 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldu0_128B IntRegs:$src1), + (V6_ldu0 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtuh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgth_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavgh HvxVR:$src1, HvxVR:$src2), + (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavgh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlalignb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsh HvxVR:$src1), + (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsh_128B HvxVR:$src1), + (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_pred_and_n HvxQR:$src1, HvxQR:$src2), + (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_pred_and_n_128B HvxQR:$src1, HvxQR:$src2), + (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsb HvxVR:$src1), + (V6_vsb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsb_128B HvxVR:$src1), + (V6_vsb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vroundwuh HvxVR:$src1, HvxVR:$src2), + (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vroundwuh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrhv HvxVR:$src1, HvxVR:$src2), + (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrhv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vshuffh HvxVR:$src1), + (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vshuffh_128B HvxVR:$src1), + (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddhsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vnavgub HvxVR:$src1, HvxVR:$src2), + (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vnavgub_128B HvxVR:$src1, HvxVR:$src2), + (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybv HvxVR:$src1, HvxVR:$src2), + (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vnormamth HvxVR:$src1), + (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vnormamth_128B HvxVR:$src1), + (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhb HvxVR:$src1, IntRegs:$src2), + (V6_vdmpyhb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhb_128B HvxVR:$src1, IntRegs:$src2), + (V6_vdmpyhb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavguh HvxVR:$src1, HvxVR:$src2), + (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavguh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlsrwv HvxVR:$src1, HvxVR:$src2), + (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlsrwv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlsrhv HvxVR:$src1, HvxVR:$src2), + (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlsrhv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2), + (V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhisat_128B HvxWR:$src1, IntRegs:$src2), + (V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2), + (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhvsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddw HvxVR:$src1, HvxVR:$src2), + (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vzh HvxVR:$src1), + (V6_vzh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vzh_128B HvxVR:$src1), + (V6_vzh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddh HvxVR:$src1, HvxVR:$src2), + (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmaxub HvxVR:$src1, HvxVR:$src2), + (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmaxub_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadduhsat HvxVR:$src1, HvxVR:$src2), + (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadduhsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vshufoeh HvxVR:$src1, HvxVR:$src2), + (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vshufoeh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyuhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_veqh HvxVR:$src1, HvxVR:$src2), + (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqh_128B HvxVR:$src1, HvxVR:$src2), + (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpabuuv HvxWR:$src1, HvxWR:$src2), + (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpabuuv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrwhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vminuh HvxVR:$src1, HvxVR:$src2), + (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vminuh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vror HvxVR:$src1, IntRegs:$src2), + (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vror_128B HvxVR:$src1, IntRegs:$src2), + (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmaxuh HvxVR:$src1, HvxVR:$src2), + (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmaxuh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabsh_sat HvxVR:$src1), + (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabsh_sat_128B HvxVR:$src1), + (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_pred_or_n HvxQR:$src1, HvxQR:$src2), + (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_pred_or_n_128B HvxQR:$src1, HvxQR:$src2), + (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdealb HvxVR:$src1), + (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdealb_128B HvxVR:$src1), + (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpybusv HvxVR:$src1, HvxVR:$src2), + (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpybusv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vzb HvxVR:$src1), + (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vzb_128B HvxVR:$src1), + (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpybus_dv HvxWR:$src1, IntRegs:$src2), + (V6_vdmpybus_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpybus_dv_128B HvxWR:$src1, IntRegs:$src2), + (V6_vdmpybus_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddb HvxVR:$src1, HvxVR:$src2), + (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddb_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vshufoeb HvxVR:$src1, HvxVR:$src2), + (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vshufoeb_128B HvxVR:$src1, HvxVR:$src2), + (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2), + (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vpackhub_sat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyiwh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vtmpyb HvxWR:$src1, IntRegs:$src2), + (V6_vtmpyb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vtmpyb_128B HvxWR:$src1, IntRegs:$src2), + (V6_vtmpyb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpabusv HvxWR:$src1, HvxWR:$src2), + (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpabusv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_pred_and HvxQR:$src1, HvxQR:$src2), + (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_pred_and_128B HvxQR:$src1, HvxQR:$src2), + (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2), + (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vpackwuh_sat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vswap_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyubv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaslw HvxVR:$src1, IntRegs:$src2), + (V6_vaslw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaslw_128B HvxVR:$src1, IntRegs:$src2), + (V6_vaslw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2), + (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vpackhb_sat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyih_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vshuffvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddb_dv HvxWR:$src1, HvxWR:$src2), + (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddb_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vunpackub HvxVR:$src1), + (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vunpackub_128B HvxVR:$src1), + (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtuw HvxVR:$src1, HvxVR:$src2), + (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtuw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtub HvxVR:$src1, HvxVR:$src2), + (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtub_128B HvxVR:$src1, HvxVR:$src2), + (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyowh HvxVR:$src1, HvxVR:$src2), + (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyowh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyieoh HvxVR:$src1, HvxVR:$src2), + (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyieoh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_extractw HvxVR:$src1, IntRegs:$src2), + (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_extractw_128B HvxVR:$src1, IntRegs:$src2), + (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavgwrnd HvxVR:$src1, HvxVR:$src2), + (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavgwrnd_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhsat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtub_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyub HvxVR:$src1, IntRegs:$src2), + (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyub_128B HvxVR:$src1, IntRegs:$src2), + (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyuh HvxVR:$src1, IntRegs:$src2), + (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyuh_128B HvxVR:$src1, IntRegs:$src2), + (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vunpackob HvxWR:$src1, HvxVR:$src2), + (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vunpackob_128B HvxWR:$src1, HvxVR:$src2), + (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpahb HvxWR:$src1, IntRegs:$src2), + (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpahb_128B HvxWR:$src1, IntRegs:$src2), + (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vandqrt HvxQR:$src1, IntRegs:$src2), + (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vandqrt_128B HvxQR:$src1, IntRegs:$src2), + (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vxor HvxVR:$src1, HvxVR:$src2), + (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vxor_128B HvxVR:$src1, HvxVR:$src2), + (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrwhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyhsat_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybus_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubhw HvxVR:$src1, HvxVR:$src2), + (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubhw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdealb4w HvxVR:$src1, HvxVR:$src2), + (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdealb4w_128B HvxVR:$src1, HvxVR:$src2), + (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyowh_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpybv HvxVR:$src1, HvxVR:$src2), + (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpybv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabsdiffh HvxVR:$src1, HvxVR:$src2), + (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabsdiffh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vshuffob HvxVR:$src1, HvxVR:$src2), + (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vshuffob_128B HvxVR:$src1, HvxVR:$src2), + (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyub_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vnormamtw HvxVR:$src1), + (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vnormamtw_128B HvxVR:$src1), + (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vunpackuh HvxVR:$src1), + (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vunpackuh_128B HvxVR:$src1), + (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtuh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyiewuh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vunpackoh HvxWR:$src1, HvxVR:$src2), + (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vunpackoh_128B HvxWR:$src1, HvxVR:$src2), + (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2), + (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhsat_128B HvxVR:$src1, IntRegs:$src2), + (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyubv HvxVR:$src1, HvxVR:$src2), + (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyubv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyhss HvxVR:$src1, IntRegs:$src2), + (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyhss_128B HvxVR:$src1, IntRegs:$src2), + (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_hi HvxWR:$src1), + (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_hi_128B HvxWR:$src1), + (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_veqw HvxVR:$src1, HvxVR:$src2), + (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqw_128B HvxVR:$src1, HvxVR:$src2), + (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdsaduh HvxWR:$src1, IntRegs:$src2), + (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdsaduh_128B HvxWR:$src1, IntRegs:$src2), + (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubw HvxVR:$src1, HvxVR:$src2), + (V6_vsubw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsubw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubw_dv HvxWR:$src1, HvxWR:$src2), + (V6_vsubw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubw_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vsubw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyih HvxVR:$src1, HvxVR:$src2), + (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyih_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vtmpyb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybus HvxVR:$src1, IntRegs:$src2), + (V6_vrmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybus_128B HvxVR:$src1, IntRegs:$src2), + (V6_vrmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpybus_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgth_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubhsat HvxVR:$src1, HvxVR:$src2), + (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubhsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), + (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), + (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabsw HvxVR:$src1), + (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabsw_128B HvxVR:$src1), + (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddwsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlsrw HvxVR:$src1, IntRegs:$src2), + (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlsrw_128B HvxVR:$src1, IntRegs:$src2), + (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabsh HvxVR:$src1), + (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabsh_128B HvxVR:$src1), + (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlsrh HvxVR:$src1, IntRegs:$src2), + (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlsrh_128B HvxVR:$src1, IntRegs:$src2), + (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_valignb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubhq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vpackoh HvxVR:$src1, HvxVR:$src2), + (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vpackoh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpybus_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhvsat_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddhsat HvxVR:$src1, HvxVR:$src2), + (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddhsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcombine HvxVR:$src1, HvxVR:$src2), + (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcombine_128B HvxVR:$src1, HvxVR:$src2), + (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3), + (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vandqrt_acc_128B HvxVR:$src1, HvxQR:$src2, IntRegs:$src3), + (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaslhv HvxVR:$src1, HvxVR:$src2), + (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaslhv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vinsertwr HvxVR:$src1, IntRegs:$src2), + (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vinsertwr_128B HvxVR:$src1, IntRegs:$src2), + (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubh_dv HvxWR:$src1, HvxWR:$src2), + (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubh_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vshuffb HvxVR:$src1), + (V6_vshuffb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vshuffb_128B HvxVR:$src1), + (V6_vshuffb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vand HvxVR:$src1, HvxVR:$src2), + (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vand_128B HvxVR:$src1, HvxVR:$src2), + (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyhv HvxVR:$src1, HvxVR:$src2), + (V6_vmpyhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyhv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhsuisat_acc_128B HvxVR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsububsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdsaduh_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyub HvxVR:$src1, IntRegs:$src2), + (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyub_128B HvxVR:$src1, IntRegs:$src2), + (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyuh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcl0h HvxVR:$src1), + (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcl0h_128B HvxVR:$src1), + (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyhus_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpybv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), + (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), + (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vshufeh HvxVR:$src1, HvxVR:$src2), + (V6_vshufeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vshufeh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vshufeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyewuh HvxVR:$src1, HvxVR:$src2), + (V6_vmpyewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyewuh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2), + (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyhsrs_128B HvxVR:$src1, IntRegs:$src2), + (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpybus_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddubh HvxVR:$src1, HvxVR:$src2), + (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddubh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ld0 IntRegs:$src1), + (V6_ld0 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ld0_128B IntRegs:$src1), + (V6_ld0 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vpopcounth HvxVR:$src1), + (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vpopcounth_128B HvxVR:$src1), + (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldnt0 IntRegs:$src1), + (V6_ldnt0 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldnt0_128B IntRegs:$src1), + (V6_ldnt0 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgth_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddubsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vpackeh HvxVR:$src1, HvxVR:$src2), + (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vpackeh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyh HvxVR:$src1, IntRegs:$src2), + (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyh_128B HvxVR:$src1, IntRegs:$src2), + (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vminh HvxVR:$src1, HvxVR:$src2), + (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vminh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_pred_scalar2 IntRegs:$src1), + (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_pred_scalar2_128B IntRegs:$src1), + (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdealh HvxVR:$src1), + (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdealh_128B HvxVR:$src1), + (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2), + (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vpackwh_sat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaslh HvxVR:$src1, IntRegs:$src2), + (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaslh_128B HvxVR:$src1, IntRegs:$src2), + (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtuw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vor HvxVR:$src1, HvxVR:$src2), + (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vor_128B HvxVR:$src1, HvxVR:$src2), + (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvvb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyiowh HvxVR:$src1, HvxVR:$src2), + (V6_vmpyiowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyiowh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyiowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4), + (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvvb_oracc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4), + (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vandvrt HvxVR:$src1, IntRegs:$src2), + (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vandvrt_128B HvxVR:$src1, IntRegs:$src2), + (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadduhw HvxVR:$src1, HvxVR:$src2), + (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadduhw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcl0w HvxVR:$src1), + (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcl0w_128B HvxVR:$src1), + (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyihb HvxVR:$src1, IntRegs:$src2), + (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyihb_128B HvxVR:$src1, IntRegs:$src2), + (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vtmpybus HvxWR:$src1, IntRegs:$src2), + (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vtmpybus_128B HvxWR:$src1, IntRegs:$src2), + (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vd0 ), + (V6_vd0 )>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vd0_128B ), + (V6_vd0 )>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpybus HvxVR:$src1, IntRegs:$src2), + (V6_vdmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpybus_128B HvxVR:$src1, IntRegs:$src2), + (V6_vdmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtub_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpybus HvxVR:$src1, IntRegs:$src2), + (V6_vmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpybus_128B HvxVR:$src1, IntRegs:$src2), + (V6_vmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vandvrt_acc_128B HvxQR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vassign HvxVR:$src1), + (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vassign_128B HvxVR:$src1), + (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtub_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2), + (V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhb_dv_128B HvxWR:$src1, IntRegs:$src2), + (V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vunpackb HvxVR:$src1), + (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vunpackb_128B HvxVR:$src1), + (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vunpackh HvxVR:$src1), + (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vunpackh_128B HvxVR:$src1), + (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpahb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), + (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), + (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsatwh HvxVR:$src1, HvxVR:$src2), + (V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsatwh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtuh HvxVR:$src1, HvxVR:$src2), + (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtuh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyihb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybusv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrdelta HvxVR:$src1, HvxVR:$src2), + (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrdelta_128B HvxVR:$src1, HvxVR:$src2), + (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vroundwh HvxVR:$src1, HvxVR:$src2), + (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vroundwh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddw_dv HvxWR:$src1, HvxWR:$src2), + (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddw_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyiwb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), + (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_valignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), + (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddwsat HvxVR:$src1, HvxVR:$src2), + (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddwsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabsdiffub HvxVR:$src1, HvxVR:$src2), + (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabsdiffub_128B HvxVR:$src1, HvxVR:$src2), + (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vshuffeb HvxVR:$src1, HvxVR:$src2), + (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vshuffeb_128B HvxVR:$src1, HvxVR:$src2), + (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2), + (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabsdiffuh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgth HvxVR:$src1, HvxVR:$src2), + (V6_vgth HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgth_128B HvxVR:$src1, HvxVR:$src2), + (V6_vgth HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtuw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtb HvxVR:$src1, HvxVR:$src2), + (V6_vgtb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtb_128B HvxVR:$src1, HvxVR:$src2), + (V6_vgtb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtw HvxVR:$src1, HvxVR:$src2), + (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vnot HvxVR:$src1), + (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vnot_128B HvxVR:$src1), + (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtb_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtuw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddubsat HvxVR:$src1, HvxVR:$src2), + (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddubsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmaxw HvxVR:$src1, HvxVR:$src2), + (V6_vmaxw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmaxw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmaxw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaslwv HvxVR:$src1, HvxVR:$src2), + (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaslwv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabsw_sat HvxVR:$src1), + (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabsw_sat_128B HvxVR:$src1), + (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubwsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vroundhub HvxVR:$src1, HvxVR:$src2), + (V6_vroundhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vroundhub_128B HvxVR:$src1, HvxVR:$src2), + (V6_vroundhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhisat_acc_128B HvxVR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpabus HvxWR:$src1, IntRegs:$src2), + (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpabus_128B HvxWR:$src1, IntRegs:$src2), + (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vassignp HvxWR:$src1), + (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vassignp_128B HvxWR:$src1), + (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_veqb HvxVR:$src1, HvxVR:$src2), + (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqb_128B HvxVR:$src1, HvxVR:$src2), + (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsububh HvxVR:$src1, HvxVR:$src2), + (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsububh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_lvsplatw IntRegs:$src1), + (V6_lvsplatw IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_lvsplatw_128B IntRegs:$src1), + (V6_lvsplatw IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2), + (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhsusat_128B HvxVR:$src1, IntRegs:$src2), + (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_pred_not HvxQR:$src1), + (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_pred_not_128B HvxQR:$src1), + (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4), + (V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvwh_oracc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4), + (V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyiewh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdealvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavgw HvxVR:$src1, HvxVR:$src2), + (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavgw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpyhsusat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vgtw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vtmpyhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddhw HvxVR:$src1, HvxVR:$src2), + (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddhw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddhq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyubv HvxVR:$src1, HvxVR:$src2), + (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyubv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubh HvxVR:$src1, HvxVR:$src2), + (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), + (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), + (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vminw HvxVR:$src1, HvxVR:$src2), + (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vminw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyubv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_pred_xor HvxQR:$src1, HvxQR:$src2), + (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_pred_xor_128B HvxQR:$src1, HvxQR:$src2), + (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_veqb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2), + (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyiewuh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpybusv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavguhrnd HvxVR:$src1, HvxVR:$src2), + (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavguhrnd_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2), + (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyowh_rnd_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubwsat HvxVR:$src1, HvxVR:$src2), + (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubwsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubuhw HvxVR:$src1, HvxVR:$src2), + (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubuhw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), + (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybusi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), + (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrw HvxVR:$src1, IntRegs:$src2), + (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrw_128B HvxVR:$src1, IntRegs:$src2), + (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrh HvxVR:$src1, IntRegs:$src2), + (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrh_128B HvxVR:$src1, IntRegs:$src2), + (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyuhv HvxVR:$src1, HvxVR:$src2), + (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyuhv_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrhbrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubuhsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabsdiffw HvxVR:$src1, HvxVR:$src2), + (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabsdiffw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; + +// V62 HVX Instructions. + +def: Pat<(int_hexagon_V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3), + (V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vandnqrt_acc_128B HvxVR:$src1, HvxQR:$src2, IntRegs:$src3), + (V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddclbh HvxVR:$src1, HvxVR:$src2), + (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddclbh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyowh_64_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2), + (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyewuh_64_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsatuwuh HvxVR:$src1, HvxVR:$src2), + (V6_vsatuwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsatuwuh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsatuwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_shuffeqh HvxQR:$src1, HvxQR:$src2), + (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_shuffeqh_128B HvxQR:$src1, HvxQR:$src2), + (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_shuffeqw HvxQR:$src1, HvxQR:$src2), + (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_shuffeqw_128B HvxQR:$src1, HvxQR:$src2), + (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldcnpnt0 PredRegs:$src1, IntRegs:$src2), + (V6_ldcnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldcnpnt0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldcnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3), + (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubcarry_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3), + (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrhbsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vminb HvxVR:$src1, HvxVR:$src2), + (V6_vminb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vminb_128B HvxVR:$src1, HvxVR:$src2), + (V6_vminb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpauhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddhw_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlsrb HvxVR:$src1, IntRegs:$src2), + (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlsrb_128B HvxVR:$src1, IntRegs:$src2), + (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), + (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), + (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2), + (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddububb_sat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubbsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldtp0 PredRegs:$src1, IntRegs:$src2), + (V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldtp0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), + (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), + (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubuwsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldpnt0 PredRegs:$src1, IntRegs:$src2), + (V6_ldpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldpnt0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vandvnqv HvxQR:$src1, HvxVR:$src2), + (V6_vandvnqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vandvnqv_128B HvxQR:$src1, HvxVR:$src2), + (V6_vandvnqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_lvsplatb IntRegs:$src1), + (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_lvsplatb_128B IntRegs:$src1), + (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_lvsplath IntRegs:$src1), + (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_lvsplath_128B IntRegs:$src1), + (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldtpnt0 PredRegs:$src1, IntRegs:$src2), + (V6_ldtpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldtpnt0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldtpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvwh_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldnpnt0 PredRegs:$src1, IntRegs:$src2), + (V6_ldnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldnpnt0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpauhb HvxWR:$src1, IntRegs:$src2), + (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpauhb_128B HvxWR:$src1, IntRegs:$src2), + (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldtnp0 PredRegs:$src1, IntRegs:$src2), + (V6_ldtnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldtnp0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldtnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrounduhub HvxVR:$src1, HvxVR:$src2), + (V6_vrounduhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrounduhub_128B HvxVR:$src1, HvxVR:$src2), + (V6_vrounduhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadduhw_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldcp0 PredRegs:$src1, IntRegs:$src2), + (V6_ldcp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldcp0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldcp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadduwsat HvxVR:$src1, HvxVR:$src2), + (V6_vadduwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadduwsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadduwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldtnpnt0 PredRegs:$src1, IntRegs:$src2), + (V6_ldtnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldtnpnt0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldtnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddbsat HvxVR:$src1, HvxVR:$src2), + (V6_vaddbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddbsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vandnqrt HvxQR:$src1, IntRegs:$src2), + (V6_vandnqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vandnqrt_128B HvxQR:$src1, IntRegs:$src2), + (V6_vandnqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyiwub_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmaxb HvxVR:$src1, HvxVR:$src2), + (V6_vmaxb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmaxb_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmaxb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vandvqv HvxQR:$src1, HvxVR:$src2), + (V6_vandvqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vandvqv_128B HvxQR:$src1, HvxVR:$src2), + (V6_vandvqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3), + (V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddcarry_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3), + (V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), + (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), + (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubuwsat HvxVR:$src1, HvxVR:$src2), + (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubuwsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddbsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldnp0 PredRegs:$src1, IntRegs:$src2), + (V6_ldnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldnp0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasruwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrounduwuh HvxVR:$src1, HvxVR:$src2), + (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrounduwuh_128B HvxVR:$src1, HvxVR:$src2), + (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvvb_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_pred_scalar2v2 IntRegs:$src1), + (V6_pred_scalar2v2 IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_pred_scalar2v2_128B IntRegs:$src1), + (V6_pred_scalar2v2 IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldp0 PredRegs:$src1, IntRegs:$src2), + (V6_ldp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldp0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddubh_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaddclbw HvxVR:$src1, HvxVR:$src2), + (V6_vaddclbw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddclbw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vaddclbw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldcpnt0 PredRegs:$src1, IntRegs:$src2), + (V6_ldcpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldcpnt0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldcpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2), + (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadduwsat_dv_128B HvxWR:$src1, HvxWR:$src2), + (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyiwub HvxVR:$src1, IntRegs:$src2), + (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyiwub_128B HvxVR:$src1, IntRegs:$src2), + (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2), + (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubububb_sat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_ldcnp0 PredRegs:$src1, IntRegs:$src2), + (V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_ldcnp0_128B PredRegs:$src1, IntRegs:$src2), + (V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), + (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvwh_oracci_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), + (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsubbsat HvxVR:$src1, HvxVR:$src2), + (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsubbsat_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>; + +// V65 HVX Instructions. + +def: Pat<(int_hexagon_V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasruhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2), + (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), + (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), + (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpahhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), + (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavguwrnd HvxVR:$src1, HvxVR:$src2), + (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavguwrnd_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vnavgb HvxVR:$src1, HvxVR:$src2), + (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vnavgb_128B HvxVR:$src1, HvxVR:$src2), + (V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), + (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpauhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), + (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), + (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), + (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavgb HvxVR:$src1, HvxVR:$src2), + (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavgb_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaslh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavguw HvxVR:$src1, HvxVR:$src2), + (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavguw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlut4 HvxVR:$src1, DoubleRegs:$src2), + (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlut4_128B HvxVR:$src1, DoubleRegs:$src2), + (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyuhe_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2), + (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyub_rtt_128B HvxVR:$src1, DoubleRegs:$src2), + (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), + (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpsuhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3), + (V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasruhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyuhe HvxVR:$src1, IntRegs:$src2), + (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyuhe_128B HvxVR:$src1, IntRegs:$src2), + (V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), + (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3), + (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasruwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpabuu_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), + (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vprefixqw HvxQR:$src1), + (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vprefixqw_128B HvxQR:$src1), + (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vprefixqh HvxQR:$src1), + (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vprefixqh_128B HvxQR:$src1), + (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vprefixqb HvxQR:$src1), + (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vprefixqb_128B HvxQR:$src1), + (V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabsb HvxVR:$src1), + (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabsb_128B HvxVR:$src1), + (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vavgbrnd HvxVR:$src1, HvxVR:$src2), + (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vavgbrnd_128B HvxVR:$src1, HvxVR:$src2), + (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdd0 ), + (V6_vdd0 )>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdd0_128B ), + (V6_vdd0 )>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpabuu HvxWR:$src1, IntRegs:$src2), + (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpabuu_128B HvxWR:$src1, IntRegs:$src2), + (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabsb_sat HvxVR:$src1), + (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabsb_sat_128B HvxVR:$src1), + (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX128B]>; + +// V66 HVX Instructions. + +def: Pat<(int_hexagon_V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3), + (V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV66, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vaddcarrysat_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3), + (V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV66, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV66, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasr_into_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV66, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsatdw HvxVR:$src1, HvxVR:$src2), + (V6_vsatdw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsatdw_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsatdw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrotr HvxVR:$src1, HvxVR:$src2), + (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrotr_128B HvxVR:$src1, HvxVR:$src2), + (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX128B]>; diff --git a/lib/Target/Hexagon/HexagonDepMappings.td b/lib/Target/Hexagon/HexagonDepMappings.td index 03c504ff0b084ec781049ccdd1cf25ad22b3b1d0..b3132d41b903d607a0111a74781cda51aa9016e4 100644 --- a/lib/Target/Hexagon/HexagonDepMappings.td +++ b/lib/Target/Hexagon/HexagonDepMappings.td @@ -1,4 +1,4 @@ -//===- HexagonDepMappings.td ----------------------------------------------===// +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -9,7 +9,6 @@ // Automatically generated file, please consult code owner before editing. //===----------------------------------------------------------------------===// - def A2_negAlias : InstAlias<"$Rd32 = neg($Rs32)", (A2_subri IntRegs:$Rd32, 0, IntRegs:$Rs32)>; def A2_notAlias : InstAlias<"$Rd32 = not($Rs32)", (A2_subri IntRegs:$Rd32, -1, IntRegs:$Rs32)>; def A2_tfrfAlias : InstAlias<"if (!$Pu4) $Rd32 = $Rs32", (A2_paddif IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>; @@ -252,6 +251,7 @@ def V6_vaslhv_altAlias : InstAlias<"$Vd32 = vaslh($Vu32,$Vv32)", (V6_vaslhv HvxV def V6_vaslw_acc_altAlias : InstAlias<"$Vx32 += vaslw($Vu32,$Rt32)", (V6_vaslw_acc HvxVR:$Vx32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; def V6_vaslw_altAlias : InstAlias<"$Vd32 = vaslw($Vu32,$Rt32)", (V6_vaslw HvxVR:$Vd32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; def V6_vaslwv_altAlias : InstAlias<"$Vd32 = vaslw($Vu32,$Vv32)", (V6_vaslwv HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>; +def V6_vasr_into_altAlias : InstAlias<"$Vxx32 = vasrinto($Vu32,$Vv32)", (V6_vasr_into HvxWR:$Vxx32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>; def V6_vasrh_acc_altAlias : InstAlias<"$Vx32 += vasrh($Vu32,$Rt32)", (V6_vasrh_acc HvxVR:$Vx32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; def V6_vasrh_altAlias : InstAlias<"$Vd32 = vasrh($Vu32,$Rt32)", (V6_vasrh HvxVR:$Vd32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; def V6_vasrhbrndsat_altAlias : InstAlias<"$Vd32 = vasrhb($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrhbrndsat HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8)>; @@ -402,6 +402,7 @@ def V6_vrmpyubi_acc_altAlias : InstAlias<"$Vxx32 += vrmpyub($Vuu32,$Rt32,#$Ii)", def V6_vrmpyubi_altAlias : InstAlias<"$Vdd32 = vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi HvxWR:$Vdd32, HvxWR:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>; def V6_vrmpyubv_acc_altAlias : InstAlias<"$Vx32 += vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv_acc HvxVR:$Vx32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>; def V6_vrmpyubv_altAlias : InstAlias<"$Vd32 = vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>; +def V6_vrotr_altAlias : InstAlias<"$Vd32 = vrotr($Vu32,$Vv32)", (V6_vrotr HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>; def V6_vroundhb_altAlias : InstAlias<"$Vd32 = vroundhb($Vu32,$Vv32):sat", (V6_vroundhb HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>; def V6_vroundhub_altAlias : InstAlias<"$Vd32 = vroundhub($Vu32,$Vv32):sat", (V6_vroundhub HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>; def V6_vrounduhub_altAlias : InstAlias<"$Vd32 = vrounduhub($Vu32,$Vv32):sat", (V6_vrounduhub HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>; @@ -473,4 +474,6 @@ def V6_vunpackub_altAlias : InstAlias<"$Vdd32 = vunpackub($Vu32)", (V6_vunpackub def V6_vunpackuh_altAlias : InstAlias<"$Vdd32 = vunpackuh($Vu32)", (V6_vunpackuh HvxWR:$Vdd32, HvxVR:$Vu32)>, Requires<[UseHVX]>; def V6_vzb_altAlias : InstAlias<"$Vdd32 = vzxtb($Vu32)", (V6_vzb HvxWR:$Vdd32, HvxVR:$Vu32)>, Requires<[UseHVX]>; def V6_vzh_altAlias : InstAlias<"$Vdd32 = vzxth($Vu32)", (V6_vzh HvxWR:$Vdd32, HvxVR:$Vu32)>, Requires<[UseHVX]>; +def V6_zld0Alias : InstAlias<"z = vmem($Rt32)", (V6_zLd_ai IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; +def V6_zldp0Alias : InstAlias<"if ($Pv4) z = vmem($Rt32)", (V6_zLd_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; def Y2_dcfetchAlias : InstAlias<"dcfetch($Rs32)", (Y2_dcfetchbo IntRegs:$Rs32, 0)>; diff --git a/lib/Target/Hexagon/HexagonDepOperands.td b/lib/Target/Hexagon/HexagonDepOperands.td index 9d960953f8f529627be4ebcf5b7dbef2e2e52736..ef2d4fa4570218fd8e5b5710366fcaf446ee1fe3 100644 --- a/lib/Target/Hexagon/HexagonDepOperands.td +++ b/lib/Target/Hexagon/HexagonDepOperands.td @@ -1,4 +1,4 @@ -//===- HexagonDepOperands.td ----------------------------------------------===// +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -9,16 +9,12 @@ // Automatically generated file, please consult code owner before editing. //===----------------------------------------------------------------------===// - def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; } def s4_0Imm : Operand { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; } def s4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 0>(N->getSExtValue());}]>; def s29_3ImmOperand : AsmOperandClass { let Name = "s29_3Imm"; let RenderMethod = "addSignedImmOperands"; } def s29_3Imm : Operand { let ParserMatchClass = s29_3ImmOperand; let DecoderMethod = "s29_3ImmDecoder"; } def s29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 3>(N->getSExtValue());}]>; -def s10_6ImmOperand : AsmOperandClass { let Name = "s10_6Imm"; let RenderMethod = "addSignedImmOperands"; } -def s10_6Imm : Operand { let ParserMatchClass = s10_6ImmOperand; let DecoderMethod = "s10_6ImmDecoder"; } -def s10_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<10, 6>(N->getSExtValue());}]>; def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; } def u6_0Imm : Operand { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } def u6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>; @@ -130,6 +126,3 @@ def u8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<8, 0>(N->getSExtVal def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; } def u30_2Imm : Operand { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } def u30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>; -def s10_0ImmOperand : AsmOperandClass { let Name = "s10_0Imm"; let RenderMethod = "addSignedImmOperands"; } -def s10_0Imm : Operand { let ParserMatchClass = s10_0ImmOperand; let DecoderMethod = "s10_0ImmDecoder"; } -def s10_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<10, 0>(N->getSExtValue());}]>; diff --git a/lib/Target/Hexagon/HexagonDepTimingClasses.h b/lib/Target/Hexagon/HexagonDepTimingClasses.h index 656c83f2d0c4ef0a1d1eb74d96d7b8377bcf1b6c..0fd55e8b7997a970e2f0bd9d3d9509558040b24b 100644 --- a/lib/Target/Hexagon/HexagonDepTimingClasses.h +++ b/lib/Target/Hexagon/HexagonDepTimingClasses.h @@ -1,4 +1,4 @@ -//===- HexagonDepTimingClasses.h ------------------------------------------===// +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -10,7 +10,6 @@ //===----------------------------------------------------------------------===// - #ifndef TARGET_HEXAGON_HEXAGON_DEP_TIMING_CLASSES_H #define TARGET_HEXAGON_HEXAGON_DEP_TIMING_CLASSES_H @@ -20,19 +19,25 @@ namespace llvm { inline bool is_TC3x(unsigned SchedClass) { switch (SchedClass) { - case Hexagon::Sched::tc_16d0d8d5: - case Hexagon::Sched::tc_1853ea6d: - case Hexagon::Sched::tc_60571023: - case Hexagon::Sched::tc_7934b9df: - case Hexagon::Sched::tc_8fd5f294: - case Hexagon::Sched::tc_b9c0b731: - case Hexagon::Sched::tc_bcc96cee: - case Hexagon::Sched::tc_c6ce9b3f: - case Hexagon::Sched::tc_c6ebf8dd: - case Hexagon::Sched::tc_c82dc1ff: - case Hexagon::Sched::tc_caaebcba: - case Hexagon::Sched::tc_cf59f215: - case Hexagon::Sched::tc_e913dc32: + case Hexagon::Sched::tc_05d3a09b: + case Hexagon::Sched::tc_0d8f5752: + case Hexagon::Sched::tc_13bfbcf9: + case Hexagon::Sched::tc_174516e8: + case Hexagon::Sched::tc_1a2fd869: + case Hexagon::Sched::tc_1c4528a2: + case Hexagon::Sched::tc_32779c6f: + case Hexagon::Sched::tc_5b54b33f: + case Hexagon::Sched::tc_6b25e783: + case Hexagon::Sched::tc_76851da1: + case Hexagon::Sched::tc_9debc299: + case Hexagon::Sched::tc_a9d88b22: + case Hexagon::Sched::tc_bafaade3: + case Hexagon::Sched::tc_bcf98408: + case Hexagon::Sched::tc_bdceeac1: + case Hexagon::Sched::tc_c8ce0b5c: + case Hexagon::Sched::tc_d1aa9eaa: + case Hexagon::Sched::tc_d773585a: + case Hexagon::Sched::tc_df3319ed: return true; default: return false; @@ -41,8 +46,8 @@ inline bool is_TC3x(unsigned SchedClass) { inline bool is_TC2early(unsigned SchedClass) { switch (SchedClass) { - case Hexagon::Sched::tc_14cd4cfa: - case Hexagon::Sched::tc_2a160009: + case Hexagon::Sched::tc_b4407292: + case Hexagon::Sched::tc_fc3999b4: return true; default: return false; @@ -51,12 +56,13 @@ inline bool is_TC2early(unsigned SchedClass) { inline bool is_TC4x(unsigned SchedClass) { switch (SchedClass) { - case Hexagon::Sched::tc_038a1342: - case Hexagon::Sched::tc_4d99bca9: - case Hexagon::Sched::tc_6792d5ff: - case Hexagon::Sched::tc_9c00ce8d: - case Hexagon::Sched::tc_d580173f: - case Hexagon::Sched::tc_f3eaa14b: + case Hexagon::Sched::tc_2f7c551d: + case Hexagon::Sched::tc_2ff964b4: + case Hexagon::Sched::tc_3a867367: + case Hexagon::Sched::tc_3b470976: + case Hexagon::Sched::tc_4560740b: + case Hexagon::Sched::tc_a58fd5cc: + case Hexagon::Sched::tc_b8bffe55: return true; default: return false; @@ -65,23 +71,27 @@ inline bool is_TC4x(unsigned SchedClass) { inline bool is_TC2(unsigned SchedClass) { switch (SchedClass) { - case Hexagon::Sched::tc_00afc57e: - case Hexagon::Sched::tc_1b9c9ee5: - case Hexagon::Sched::tc_234a11a5: - case Hexagon::Sched::tc_2b6f77c6: - case Hexagon::Sched::tc_41d5298e: - case Hexagon::Sched::tc_5ba5997d: - case Hexagon::Sched::tc_84df2cd3: - case Hexagon::Sched::tc_87735c3b: - case Hexagon::Sched::tc_897d1a9d: - case Hexagon::Sched::tc_976ddc4f: - case Hexagon::Sched::tc_b44c6e2a: - case Hexagon::Sched::tc_b9c4623f: - case Hexagon::Sched::tc_c2f7d806: - case Hexagon::Sched::tc_c74f796f: - case Hexagon::Sched::tc_d088982c: - case Hexagon::Sched::tc_ef84f62f: - case Hexagon::Sched::tc_f49e76f4: + case Hexagon::Sched::tc_002cb246: + case Hexagon::Sched::tc_14b5c689: + case Hexagon::Sched::tc_1c80410a: + case Hexagon::Sched::tc_4414d8b1: + case Hexagon::Sched::tc_6132ba3d: + case Hexagon::Sched::tc_61830035: + case Hexagon::Sched::tc_679309b8: + case Hexagon::Sched::tc_703e822c: + case Hexagon::Sched::tc_779080bf: + case Hexagon::Sched::tc_784490da: + case Hexagon::Sched::tc_88b4f13d: + case Hexagon::Sched::tc_9461ff31: + case Hexagon::Sched::tc_9e313203: + case Hexagon::Sched::tc_a813cf9a: + case Hexagon::Sched::tc_bfec0f01: + case Hexagon::Sched::tc_cf8126ae: + case Hexagon::Sched::tc_d08ee0f4: + case Hexagon::Sched::tc_e4a7f9f0: + case Hexagon::Sched::tc_f429765c: + case Hexagon::Sched::tc_f675fee8: + case Hexagon::Sched::tc_f9058dd7: return true; default: return false; @@ -90,45 +100,43 @@ inline bool is_TC2(unsigned SchedClass) { inline bool is_TC1(unsigned SchedClass) { switch (SchedClass) { - case Hexagon::Sched::tc_181af5d0: - case Hexagon::Sched::tc_1b82a277: - case Hexagon::Sched::tc_1e856f58: - case Hexagon::Sched::tc_351fed2d: - case Hexagon::Sched::tc_3669266a: - case Hexagon::Sched::tc_3cb8ea06: - case Hexagon::Sched::tc_452f85af: - case Hexagon::Sched::tc_481e5e5c: - case Hexagon::Sched::tc_49eb22c8: - case Hexagon::Sched::tc_523fcf30: - case Hexagon::Sched::tc_52d7bbea: - case Hexagon::Sched::tc_53bc8a6a: - case Hexagon::Sched::tc_540fdfbc: - case Hexagon::Sched::tc_55050d58: - case Hexagon::Sched::tc_609d2efe: - case Hexagon::Sched::tc_68cb12ce: - case Hexagon::Sched::tc_6ebb4a12: - case Hexagon::Sched::tc_6efc556e: - case Hexagon::Sched::tc_73043bf4: - case Hexagon::Sched::tc_7a830544: - case Hexagon::Sched::tc_855b0b61: - case Hexagon::Sched::tc_8fe6b782: - case Hexagon::Sched::tc_90f3e30c: - case Hexagon::Sched::tc_97743097: - case Hexagon::Sched::tc_99be14ca: - case Hexagon::Sched::tc_9faf76ae: - case Hexagon::Sched::tc_a46f0df5: - case Hexagon::Sched::tc_a904d137: - case Hexagon::Sched::tc_b9488031: - case Hexagon::Sched::tc_be706f30: - case Hexagon::Sched::tc_c6aa82f7: - case Hexagon::Sched::tc_cde8b071: - case Hexagon::Sched::tc_d6bf0472: - case Hexagon::Sched::tc_dbdffe3d: - case Hexagon::Sched::tc_e0739b8c: - case Hexagon::Sched::tc_e1e99bfa: - case Hexagon::Sched::tc_e9fae2d6: - case Hexagon::Sched::tc_f2704b9a: - case Hexagon::Sched::tc_f8eeed7a: + case Hexagon::Sched::tc_0663f615: + case Hexagon::Sched::tc_0a705168: + case Hexagon::Sched::tc_0ae0825c: + case Hexagon::Sched::tc_1b6f7cec: + case Hexagon::Sched::tc_1fc97744: + case Hexagon::Sched::tc_20cdee80: + case Hexagon::Sched::tc_2332b92e: + case Hexagon::Sched::tc_2eabeebe: + case Hexagon::Sched::tc_3d495a39: + case Hexagon::Sched::tc_4c5ba658: + case Hexagon::Sched::tc_56336eb0: + case Hexagon::Sched::tc_56f114f4: + case Hexagon::Sched::tc_57890846: + case Hexagon::Sched::tc_5a2711e5: + case Hexagon::Sched::tc_5b7c0967: + case Hexagon::Sched::tc_640086b5: + case Hexagon::Sched::tc_643b4717: + case Hexagon::Sched::tc_85c9c08f: + case Hexagon::Sched::tc_85d5d03f: + case Hexagon::Sched::tc_862b3e70: + case Hexagon::Sched::tc_946df596: + case Hexagon::Sched::tc_9c3ecd83: + case Hexagon::Sched::tc_9fc3dae0: + case Hexagon::Sched::tc_a1123dda: + case Hexagon::Sched::tc_a1c00888: + case Hexagon::Sched::tc_ae53734a: + case Hexagon::Sched::tc_b31c2e97: + case Hexagon::Sched::tc_b4b5c03a: + case Hexagon::Sched::tc_b51dc29a: + case Hexagon::Sched::tc_cd374165: + case Hexagon::Sched::tc_cfd8378a: + case Hexagon::Sched::tc_d5b7b0c1: + case Hexagon::Sched::tc_d9d43ecb: + case Hexagon::Sched::tc_db2bce9c: + case Hexagon::Sched::tc_de4df740: + case Hexagon::Sched::tc_de554571: + case Hexagon::Sched::tc_e78647bd: return true; default: return false; @@ -136,4 +144,4 @@ inline bool is_TC1(unsigned SchedClass) { } } // namespace llvm -#endif +#endif \ No newline at end of file diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 2f3e18c99c54465ec8280b307f864e873b0880bd..f5736546a87cca1d7ea38ad48c628e116c66441b 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -550,6 +550,37 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF, } } +/// Returns true if the target can safely skip saving callee-saved registers +/// for noreturn nounwind functions. +bool HexagonFrameLowering::enableCalleeSaveSkip( + const MachineFunction &MF) const { + const auto &F = MF.getFunction(); + assert(F.hasFnAttribute(Attribute::NoReturn) && + F.getFunction().hasFnAttribute(Attribute::NoUnwind) && + !F.getFunction().hasFnAttribute(Attribute::UWTable)); + (void)F; + + // No need to save callee saved registers if the function does not return. + return MF.getSubtarget().noreturnStackElim(); +} + +// Helper function used to determine when to eliminate the stack frame for +// functions marked as noreturn and when the noreturn-stack-elim options are +// specified. When both these conditions are true, then a FP may not be needed +// if the function makes a call. It is very similar to enableCalleeSaveSkip, +// but it used to check if the allocframe can be eliminated as well. +static bool enableAllocFrameElim(const MachineFunction &MF) { + const auto &F = MF.getFunction(); + const auto &MFI = MF.getFrameInfo(); + const auto &HST = MF.getSubtarget(); + assert(!MFI.hasVarSizedObjects() && + !HST.getRegisterInfo()->needsStackRealignment(MF)); + return F.hasFnAttribute(Attribute::NoReturn) && + F.hasFnAttribute(Attribute::NoUnwind) && + !F.hasFnAttribute(Attribute::UWTable) && HST.noreturnStackElim() && + MFI.getStackSize() == 0; +} + void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB, bool PrologueStubs) const { MachineFunction &MF = *MBB.getParent(); @@ -994,7 +1025,7 @@ bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { } const auto &HMFI = *MF.getInfo(); - if (MFI.hasCalls() || HMFI.hasClobberLR()) + if ((MFI.hasCalls() && !enableAllocFrameElim(MF)) || HMFI.hasClobberLR()) return true; return false; diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index 988718860c5b874436ed73fe0b6dc1145434f37f..d65d870750f871dc1cd60da4af61c5faf28a064d 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -41,6 +41,8 @@ public: void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override {} + bool enableCalleeSaveSkip(const MachineFunction &MF) const override; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const override { diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index 637fb3ace6a427ea62c2a5018193b03ce943c6de..b796e442d4faf4c020c04ef63cd1b91d74e779d1 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -120,7 +120,7 @@ struct Coloring { return Color == ColorKind::Red ? ColorKind::Black : ColorKind::Red; } - void dump() const; + LLVM_DUMP_METHOD void dump() const; private: ArrayRef Order; @@ -267,7 +267,7 @@ bool Coloring::color() { return true; } -LLVM_DUMP_METHOD +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Coloring::dump() const { dbgs() << "{ Order: {"; for (unsigned I = 0; I != Order.size(); ++I) { @@ -309,6 +309,7 @@ void Coloring::dump() const { dbgs() << " " << C.first << " -> " << ColorKindToName(C.second) << "\n"; dbgs() << " }\n}\n"; } +#endif namespace { // Base class of for reordering networks. They don't strictly need to be @@ -651,6 +652,7 @@ struct OpRef { IndexBits = 28, }; + LLVM_DUMP_METHOD void print(raw_ostream &OS, const SelectionDAG &G) const; private: @@ -663,7 +665,7 @@ struct NodeTemplate { MVT Ty = MVT::Other; std::vector Ops; - void print(raw_ostream &OS, const SelectionDAG &G) const; + LLVM_DUMP_METHOD void print(raw_ostream &OS, const SelectionDAG &G) const; }; struct ResultStack { @@ -699,10 +701,12 @@ struct ResultStack { BaseType List; + LLVM_DUMP_METHOD void print(raw_ostream &OS, const SelectionDAG &G) const; }; } // namespace +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void OpRef::print(raw_ostream &OS, const SelectionDAG &G) const { if (isValue()) { OpV.getNode()->print(OS, &G); @@ -752,6 +756,7 @@ void ResultStack::print(raw_ostream &OS, const SelectionDAG &G) const { OS << '\n'; } } +#endif namespace { struct ShuffleMask { diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 755a8539be7f1d12b65356b6c37990f0eb30be84..b5be507bb97d67984868e024a9a867681450ac5a 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1505,13 +1505,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); - // Subtarget-specific operation actions. - // - if (Subtarget.hasV60Ops()) { - setOperationAction(ISD::ROTL, MVT::i32, Custom); - setOperationAction(ISD::ROTL, MVT::i64, Custom); - } - // V5+. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FADD, MVT::f64, Expand); @@ -1542,6 +1535,17 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setIndexedStoreAction(ISD::POST_INC, VT, Legal); } + // Subtarget-specific operation actions. + // + if (Subtarget.hasV60Ops()) { + setOperationAction(ISD::ROTL, MVT::i32, Custom); + setOperationAction(ISD::ROTL, MVT::i64, Custom); + } + if (Subtarget.hasV66Ops()) { + setOperationAction(ISD::FADD, MVT::f64, Legal); + setOperationAction(ISD::FSUB, MVT::f64, Legal); + } + if (Subtarget.useHVXOps()) initializeHVXLowering(); @@ -3082,6 +3086,10 @@ HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, bool HexagonTargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { + // TODO: This may be worth removing. Check regression tests for diffs. + if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT)) + return false; + auto *L = cast(Load); std::pair BO = getBaseAndOffset(L->getBasePtr()); // Small-data object, do not shrink. diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td index a1082e7a77760db4d87461ca5a1de264e61737fe..2236140d5dd7bfe71cabd52bff26adcd2b25a976 100644 --- a/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/lib/Target/Hexagon/HexagonInstrFormats.td @@ -69,101 +69,101 @@ class InstHexagon pattern, // Instruction type according to the ISA. IType Type = type; - let TSFlags{5-0} = Type.Value; + let TSFlags{6-0} = Type.Value; // Solo instructions, i.e., those that cannot be in a packet with others. bits<1> isSolo = 0; - let TSFlags{6} = isSolo; + let TSFlags{7} = isSolo; // Packed only with A or X-type instructions. bits<1> isSoloAX = 0; - let TSFlags{7} = isSoloAX; + let TSFlags{8} = isSoloAX; // Restricts slot 1 to ALU-only instructions. bits<1> isRestrictSlot1AOK = 0; - let TSFlags{8} = isRestrictSlot1AOK; + let TSFlags{9} = isRestrictSlot1AOK; // Predicated instructions. bits<1> isPredicated = 0; - let TSFlags{9} = isPredicated; + let TSFlags{10} = isPredicated; bits<1> isPredicatedFalse = 0; - let TSFlags{10} = isPredicatedFalse; + let TSFlags{11} = isPredicatedFalse; bits<1> isPredicatedNew = 0; - let TSFlags{11} = isPredicatedNew; + let TSFlags{12} = isPredicatedNew; bits<1> isPredicateLate = 0; - let TSFlags{12} = isPredicateLate; // Late predicate producer insn. + let TSFlags{13} = isPredicateLate; // Late predicate producer insn. // New-value insn helper fields. bits<1> isNewValue = 0; - let TSFlags{13} = isNewValue; // New-value consumer insn. + let TSFlags{14} = isNewValue; // New-value consumer insn. bits<1> hasNewValue = 0; - let TSFlags{14} = hasNewValue; // New-value producer insn. + let TSFlags{15} = hasNewValue; // New-value producer insn. bits<3> opNewValue = 0; - let TSFlags{17-15} = opNewValue; // New-value produced operand. + let TSFlags{18-16} = opNewValue; // New-value produced operand. bits<1> isNVStorable = 0; - let TSFlags{18} = isNVStorable; // Store that can become new-value store. + let TSFlags{19} = isNVStorable; // Store that can become new-value store. bits<1> isNVStore = 0; - let TSFlags{19} = isNVStore; // New-value store insn. + let TSFlags{20} = isNVStore; // New-value store insn. bits<1> isCVLoadable = 0; - let TSFlags{20} = isCVLoadable; // Load that can become cur-value load. + let TSFlags{21} = isCVLoadable; // Load that can become cur-value load. bits<1> isCVLoad = 0; - let TSFlags{21} = isCVLoad; // Cur-value load insn. + let TSFlags{22} = isCVLoad; // Cur-value load insn. // Immediate extender helper fields. bits<1> isExtendable = 0; - let TSFlags{22} = isExtendable; // Insn may be extended. + let TSFlags{23} = isExtendable; // Insn may be extended. bits<1> isExtended = 0; - let TSFlags{23} = isExtended; // Insn must be extended. + let TSFlags{24} = isExtended; // Insn must be extended. bits<3> opExtendable = 0; - let TSFlags{26-24} = opExtendable; // Which operand may be extended. + let TSFlags{27-25} = opExtendable; // Which operand may be extended. bits<1> isExtentSigned = 0; - let TSFlags{27} = isExtentSigned; // Signed or unsigned range. + let TSFlags{28} = isExtentSigned; // Signed or unsigned range. bits<5> opExtentBits = 0; - let TSFlags{32-28} = opExtentBits; //Number of bits of range before extending. + let TSFlags{33-29} = opExtentBits; //Number of bits of range before extending. bits<2> opExtentAlign = 0; - let TSFlags{34-33} = opExtentAlign; // Alignment exponent before extending. + let TSFlags{35-34} = opExtentAlign; // Alignment exponent before extending. bit cofMax1 = 0; - let TSFlags{35} = cofMax1; + let TSFlags{36} = cofMax1; bit cofRelax1 = 0; - let TSFlags{36} = cofRelax1; + let TSFlags{37} = cofRelax1; bit cofRelax2 = 0; - let TSFlags{37} = cofRelax2; + let TSFlags{38} = cofRelax2; bit isRestrictNoSlot1Store = 0; - let TSFlags{38} = isRestrictNoSlot1Store; + let TSFlags{39} = isRestrictNoSlot1Store; // Addressing mode for load/store instructions. AddrModeType addrMode = NoAddrMode; - let TSFlags{43-41} = addrMode.Value; + let TSFlags{44-42} = addrMode.Value; // Memory access size for mem access instructions (load/store) MemAccessSize accessSize = NoMemAccess; - let TSFlags{47-44} = accessSize.Value; + let TSFlags{48-45} = accessSize.Value; bits<1> isTaken = 0; - let TSFlags {48} = isTaken; // Branch prediction. + let TSFlags {49} = isTaken; // Branch prediction. bits<1> isFP = 0; - let TSFlags {49} = isFP; // Floating-point. + let TSFlags {50} = isFP; // Floating-point. bits<1> isSomeOK = 0; - let TSFlags {50} = isSomeOK; // Relax some grouping constraints. + let TSFlags {51} = isSomeOK; // Relax some grouping constraints. bits<1> hasNewValue2 = 0; - let TSFlags{51} = hasNewValue2; // Second New-value producer insn. + let TSFlags{52} = hasNewValue2; // Second New-value producer insn. bits<3> opNewValue2 = 0; - let TSFlags{54-52} = opNewValue2; // Second New-value produced operand. + let TSFlags{55-53} = opNewValue2; // Second New-value produced operand. bits<1> isAccumulator = 0; - let TSFlags{55} = isAccumulator; + let TSFlags{56} = isAccumulator; bits<1> prefersSlot3 = 0; - let TSFlags{56} = prefersSlot3; // Complex XU + let TSFlags{57} = prefersSlot3; // Complex XU bits<1> hasTmpDst = 0; - let TSFlags{59} = hasTmpDst; // v65 : 'fake" register VTMP is set + let TSFlags{60} = hasTmpDst; // v65 : 'fake" register VTMP is set bit CVINew = 0; - let TSFlags{61} = CVINew; + let TSFlags{62} = CVINew; // Fields used for relation models. bit isNonTemporal = 0; diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV5.td b/lib/Target/Hexagon/HexagonInstrFormatsV5.td index 482688ab90aa72f6c940a28af61659e309c94d12..c8de5cbcc1e0ccd5525a143fe8393ff95bc2f579 100644 --- a/lib/Target/Hexagon/HexagonInstrFormatsV5.td +++ b/lib/Target/Hexagon/HexagonInstrFormatsV5.td @@ -49,39 +49,39 @@ class InstDuplex iClass, list pattern = [], // *** Must match MCTargetDesc/HexagonBaseInfo.h *** - let TSFlags{5-0} = Type.Value; + let TSFlags{6-0} = Type.Value; // Predicated instructions. bits<1> isPredicated = 0; - let TSFlags{6} = isPredicated; + let TSFlags{7} = isPredicated; bits<1> isPredicatedFalse = 0; - let TSFlags{7} = isPredicatedFalse; + let TSFlags{8} = isPredicatedFalse; bits<1> isPredicatedNew = 0; - let TSFlags{8} = isPredicatedNew; + let TSFlags{9} = isPredicatedNew; // New-value insn helper fields. bits<1> isNewValue = 0; - let TSFlags{9} = isNewValue; // New-value consumer insn. + let TSFlags{10} = isNewValue; // New-value consumer insn. bits<1> hasNewValue = 0; - let TSFlags{10} = hasNewValue; // New-value producer insn. + let TSFlags{11} = hasNewValue; // New-value producer insn. bits<3> opNewValue = 0; - let TSFlags{13-11} = opNewValue; // New-value produced operand. + let TSFlags{14-12} = opNewValue; // New-value produced operand. bits<1> isNVStorable = 0; - let TSFlags{14} = isNVStorable; // Store that can become new-value store. + let TSFlags{15} = isNVStorable; // Store that can become new-value store. bits<1> isNVStore = 0; - let TSFlags{15} = isNVStore; // New-value store insn. + let TSFlags{16} = isNVStore; // New-value store insn. // Immediate extender helper fields. bits<1> isExtendable = 0; - let TSFlags{16} = isExtendable; // Insn may be extended. + let TSFlags{17} = isExtendable; // Insn may be extended. bits<1> isExtended = 0; - let TSFlags{17} = isExtended; // Insn must be extended. + let TSFlags{18} = isExtended; // Insn must be extended. bits<3> opExtendable = 0; - let TSFlags{20-18} = opExtendable; // Which operand may be extended. + let TSFlags{21-19} = opExtendable; // Which operand may be extended. bits<1> isExtentSigned = 0; - let TSFlags{21} = isExtentSigned; // Signed or unsigned range. + let TSFlags{22} = isExtentSigned; // Signed or unsigned range. bits<5> opExtentBits = 0; - let TSFlags{26-22} = opExtentBits; //Number of bits of range before extending. + let TSFlags{27-23} = opExtentBits; //Number of bits of range before extending. bits<2> opExtentAlign = 0; - let TSFlags{28-27} = opExtentAlign; // Alignment exponent before extending. + let TSFlags{29-28} = opExtentAlign; // Alignment exponent before extending. } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index a3c160d01f88f8c97e88afd0e374d9766e1a25ce..de0d6c4d9e4e781200b184ed8adb93faea2bd342 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1293,7 +1293,6 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine)) .add(Op0) .addReg(PReg, S) - .add(Op1) .addReg(SrcHi) .addReg(SrcLo); if (IsDestLive) @@ -2894,14 +2893,15 @@ bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1, } /// Get the base register and byte offset of a load/store instr. -bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, - unsigned &BaseReg, int64_t &Offset, const TargetRegisterInfo *TRI) - const { +bool HexagonInstrInfo::getMemOperandWithOffset( + MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset, + const TargetRegisterInfo *TRI) const { unsigned AccessSize = 0; - int OffsetVal = 0; - BaseReg = getBaseAndOffset(LdSt, OffsetVal, AccessSize); - Offset = OffsetVal; - return BaseReg != 0; + BaseOp = getBaseAndOffset(LdSt, Offset, AccessSize); + assert((!BaseOp || BaseOp->isReg()) && + "getMemOperandWithOffset only supports base " + "operands of type register."); + return BaseOp != nullptr; } /// Can these instructions execute at the same time in a bundle. @@ -3108,21 +3108,22 @@ unsigned HexagonInstrInfo::getAddrMode(const MachineInstr &MI) const { // Returns the base register in a memory access (load/store). The offset is // returned in Offset and the access size is returned in AccessSize. -// If the base register has a subregister or the offset field does not contain -// an immediate value, return 0. -unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr &MI, - int &Offset, unsigned &AccessSize) const { +// If the base operand has a subregister or the offset field does not contain +// an immediate value, return nullptr. +MachineOperand *HexagonInstrInfo::getBaseAndOffset(const MachineInstr &MI, + int64_t &Offset, + unsigned &AccessSize) const { // Return if it is not a base+offset type instruction or a MemOp. if (getAddrMode(MI) != HexagonII::BaseImmOffset && getAddrMode(MI) != HexagonII::BaseLongOffset && !isMemOp(MI) && !isPostIncrement(MI)) - return 0; + return nullptr; AccessSize = getMemAccessSize(MI); unsigned BasePos = 0, OffsetPos = 0; if (!getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) - return 0; + return nullptr; // Post increment updates its EA after the mem access, // so we need to treat its offset as zero. @@ -3131,14 +3132,14 @@ unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr &MI, } else { const MachineOperand &OffsetOp = MI.getOperand(OffsetPos); if (!OffsetOp.isImm()) - return 0; + return nullptr; Offset = OffsetOp.getImm(); } const MachineOperand &BaseOp = MI.getOperand(BasePos); if (BaseOp.getSubReg() != 0) - return 0; - return BaseOp.getReg(); + return nullptr; + return &const_cast(BaseOp); } /// Return the position of the base and offset operands for this instruction. diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index fe4a2f3662e3d5fb9f3a1eda148595014e3012f2..9b840762e88ae8dff8aa60ffd04d0130f1d9fe57 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -216,9 +216,9 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; /// Get the base register and byte offset of a load/store instr. - bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, - int64_t &Offset, - const TargetRegisterInfo *TRI) const override; + bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp, + int64_t &Offset, + const TargetRegisterInfo *TRI) const override; /// Reverses the branch condition of the specified condition list, /// returning false on success and true if it cannot be reversed. @@ -436,8 +436,8 @@ public: bool predOpcodeHasNot(ArrayRef Cond) const; unsigned getAddrMode(const MachineInstr &MI) const; - unsigned getBaseAndOffset(const MachineInstr &MI, int &Offset, - unsigned &AccessSize) const; + MachineOperand *getBaseAndOffset(const MachineInstr &MI, int64_t &Offset, + unsigned &AccessSize) const; SmallVector getBranchingInstrs(MachineBasicBlock& MBB) const; unsigned getCExtOpNum(const MachineInstr &MI) const; HexagonII::CompoundGroup diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td index 206e74983d203aba49b4bf3f3f2ab95d57d2751c..9cab5748bef2292d75e2c4c782064af6c7b4e0e8 100644 --- a/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -6,726 +6,78 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// This is populated based on the following specs: -// Hexagon V2 Architecture -// Application-Level Specification -// 80-V9418-8 Rev. B -// March 4, 2008 -//===----------------------------------------------------------------------===// -class T_I_pat - : Pat <(IntID imm:$Is), - (MI imm:$Is)>; +// These intrinsic patterns are not auto-generated. class T_R_pat : Pat <(IntID I32:$Rs), (MI I32:$Rs)>; -class T_P_pat - : Pat <(IntID I64:$Rs), - (MI I64:$Rs)>; - -class T_II_pat - : Pat<(IntID Imm1:$Is, Imm2:$It), - (MI Imm1:$Is, Imm2:$It)>; - -class T_RI_pat > - : Pat<(IntID I32:$Rs, ImmPred:$It), - (MI I32:$Rs, ImmPred:$It)>; - -class T_IR_pat > - : Pat<(IntID ImmPred:$Is, I32:$Rt), - (MI ImmPred:$Is, I32:$Rt)>; - -class T_PI_pat - : Pat<(IntID I64:$Rs, imm:$It), - (MI I64:$Rs, imm:$It)>; - -class T_RP_pat - : Pat<(IntID I32:$Rs, I64:$Rt), - (MI I32:$Rs, I64:$Rt)>; - class T_RR_pat : Pat <(IntID I32:$Rs, I32:$Rt), (MI I32:$Rs, I32:$Rt)>; -class T_PP_pat - : Pat <(IntID I64:$Rs, I64:$Rt), - (MI I64:$Rs, I64:$Rt)>; - -class T_QQ_pat - : Pat <(IntID I32:$Rs, I32:$Rt), - (MI (C2_tfrrp I32:$Rs), (C2_tfrrp I32:$Rt))>; - -class T_QII_pat - : Pat <(IntID I32:$Rp, Imm1:$Is, Imm2:$It), - (MI (C2_tfrrp I32:$Rp), Imm1:$Is, Imm2:$It)>; - -class T_QRR_pat - : Pat <(IntID I32:$Rp, I32:$Rs, I32:$Rt), - (MI (C2_tfrrp I32:$Rp), I32:$Rs, I32:$Rt)>; - -class T_QRI_pat - : Pat <(IntID I32:$Rp, I32:$Rs, ImmPred:$Is), - (MI (C2_tfrrp I32:$Rp), I32:$Rs, ImmPred:$Is)>; - -class T_QIR_pat - : Pat <(IntID I32:$Rp, ImmPred:$Is, I32:$Rs), - (MI (C2_tfrrp I32:$Rp), ImmPred:$Is, I32:$Rs)>; - -class T_QPP_pat - : Pat <(IntID I32:$Rp, I64:$Rs, I64:$Rt), - (MI (C2_tfrrp I32:$Rp), I64:$Rs, I64:$Rt)>; - -class T_RRI_pat - : Pat <(IntID I32:$Rs, I32:$Rt, imm:$Iu), - (MI I32:$Rs, I32:$Rt, imm:$Iu)>; - -class T_RII_pat - : Pat <(IntID I32:$Rs, imm:$It, imm:$Iu), - (MI I32:$Rs, imm:$It, imm:$Iu)>; - -class T_IRI_pat - : Pat <(IntID imm:$It, I32:$Rs, imm:$Iu), - (MI imm:$It, I32:$Rs, imm:$Iu)>; - -class T_IRR_pat - : Pat <(IntID imm:$Is, I32:$Rs, I32:$Rt), - (MI imm:$Is, I32:$Rs, I32:$Rt)>; - -class T_RIR_pat - : Pat <(IntID I32:$Rs, imm:$Is, I32:$Rt), - (MI I32:$Rs, imm:$Is, I32:$Rt)>; - -class T_RRR_pat - : Pat <(IntID I32:$Rs, I32:$Rt, I32:$Ru), - (MI I32:$Rs, I32:$Rt, I32:$Ru)>; - -class T_PPI_pat - : Pat <(IntID I64:$Rs, I64:$Rt, imm:$Iu), - (MI I64:$Rs, I64:$Rt, imm:$Iu)>; - -class T_PII_pat - : Pat <(IntID I64:$Rs, imm:$It, imm:$Iu), - (MI I64:$Rs, imm:$It, imm:$Iu)>; - -class T_PPP_pat - : Pat <(IntID I64:$Rs, I64:$Rt, I64:$Ru), - (MI I64:$Rs, I64:$Rt, I64:$Ru)>; - -class T_PPR_pat - : Pat <(IntID I64:$Rs, I64:$Rt, I32:$Ru), - (MI I64:$Rs, I64:$Rt, I32:$Ru)>; - -class T_PRR_pat - : Pat <(IntID I64:$Rs, I32:$Rt, I32:$Ru), - (MI I64:$Rs, I32:$Rt, I32:$Ru)>; - -class T_PPQ_pat - : Pat <(IntID I64:$Rs, I64:$Rt, I32:$Rp), - (MI I64:$Rs, I64:$Rt, (C2_tfrrp I32:$Rp))>; - -class T_PR_pat - : Pat <(IntID I64:$Rs, I32:$Rt), - (MI I64:$Rs, I32:$Rt)>; - -class T_D_pat - : Pat<(IntID (F64:$Rs)), - (MI (F64:$Rs))>; - -class T_DI_pat > - : Pat<(IntID F64:$Rs, ImmPred:$It), - (MI F64:$Rs, ImmPred:$It)>; - -class T_F_pat - : Pat<(IntID F32:$Rs), - (MI F32:$Rs)>; - -class T_FI_pat > - : Pat<(IntID F32:$Rs, ImmPred:$It), - (MI F32:$Rs, ImmPred:$It)>; - -class T_FF_pat - : Pat<(IntID F32:$Rs, F32:$Rt), - (MI F32:$Rs, F32:$Rt)>; - -class T_DD_pat - : Pat<(IntID F64:$Rs, F64:$Rt), - (MI F64:$Rs, F64:$Rt)>; - -class T_FFF_pat - : Pat<(IntID F32:$Rs, F32:$Rt, F32:$Ru), - (MI F32:$Rs, F32:$Rt, F32:$Ru)>; - -class T_FFFQ_pat - : Pat <(IntID F32:$Rs, F32:$Rt, F32:$Ru, I32:$Rp), - (MI F32:$Rs, F32:$Rt, F32:$Ru, (C2_tfrrp I32:$Rp))>; - -class T_Q_RI_pat > - : Pat<(IntID I32:$Rs, ImmPred:$It), - (C2_tfrpr (MI I32:$Rs, ImmPred:$It))>; - -class T_Q_RR_pat - : Pat <(IntID I32:$Rs, I32:$Rt), - (C2_tfrpr (MI I32:$Rs, I32:$Rt))>; - -class T_Q_RP_pat +class T_RP_pat : Pat <(IntID I32:$Rs, I64:$Rt), - (C2_tfrpr (MI I32:$Rs, I64:$Rt))>; - -class T_Q_PR_pat - : Pat <(IntID I64:$Rs, I32:$Rt), - (C2_tfrpr (MI I64:$Rs, I32:$Rt))>; - -class T_Q_PI_pat - : Pat<(IntID I64:$Rs, imm:$It), - (C2_tfrpr (MI I64:$Rs, imm:$It))>; - -class T_Q_PP_pat - : Pat <(IntID I64:$Rs, I64:$Rt), - (C2_tfrpr (MI I64:$Rs, I64:$Rt))>; - -class T_Q_Q_pat - : Pat <(IntID I32:$Rp), - (C2_tfrpr (MI (C2_tfrrp I32:$Rp)))>; - -class T_Q_QQ_pat - : Pat <(IntID I32:$Rp, I32:$Rq), - (C2_tfrpr (MI (C2_tfrrp I32:$Rp), (C2_tfrrp I32:$Rq)))>; - -class T_Q_FF_pat - : Pat<(IntID F32:$Rs, F32:$Rt), - (C2_tfrpr (MI F32:$Rs, F32:$Rt))>; - -class T_Q_DD_pat - : Pat<(IntID F64:$Rs, F64:$Rt), - (C2_tfrpr (MI F64:$Rs, F64:$Rt))>; - -class T_Q_FI_pat - : Pat<(IntID F32:$Rs, imm:$It), - (C2_tfrpr (MI F32:$Rs, imm:$It))>; - -class T_Q_DI_pat - : Pat<(IntID F64:$Rs, imm:$It), - (C2_tfrpr (MI F64:$Rs, imm:$It))>; - -class T_Q_QQQ_pat - : Pat <(IntID I32:$Rp, I32:$Rq, I32:$Rs), - (C2_tfrpr (MI (C2_tfrrp I32:$Rp), (C2_tfrrp I32:$Rq), - (C2_tfrrp I32:$Rs)))>; - -//===----------------------------------------------------------------------===// -// MPYS / Multipy signed/unsigned halfwords -//Rd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat] -//===----------------------------------------------------------------------===// - -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - - -//===----------------------------------------------------------------------===// -// MPYS / Multipy signed/unsigned halfwords and add/subtract the -// result from the accumulator. -//Rx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat] -//===----------------------------------------------------------------------===// - -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; - -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; - -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; - -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; - -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; - -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; - - -//===----------------------------------------------------------------------===// -// Multiply signed/unsigned halfwords with and without saturation and rounding -// into a 64-bits destination register. -//===----------------------------------------------------------------------===// - -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -//===----------------------------------------------------------------------===// -// MPYS / Multipy signed/unsigned halfwords and add/subtract the -// result from the 64-bit destination register. -//Rxx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat] -//===----------------------------------------------------------------------===// - -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -// Vector complex multiply imaginary: Rdd=vcmpyi(Rss,Rtt)[:<<1]:sat -def : T_PP_pat ; -def : T_PP_pat ; - -// Vector complex multiply real: Rdd=vcmpyr(Rss,Rtt)[:<<1]:sat -def : T_PP_pat ; -def : T_PP_pat ; - -// Vector dual multiply: Rdd=vdmpy(Rss,Rtt)[:<<1]:sat -def : T_PP_pat ; -def : T_PP_pat ; - -// Vector multiply even halfwords: Rdd=vmpyeh(Rss,Rtt)[:<<1]:sat -def : T_PP_pat ; -def : T_PP_pat ; - -//Rdd=vmpywoh(Rss,Rtt)[:<<1][:rnd]:sat -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -//Rdd=vmpyweh(Rss,Rtt)[:<<1][:rnd]:sat -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -//Rdd=vmpywouh(Rss,Rtt)[:<<1][:rnd]:sat -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -//Rdd=vmpyweuh(Rss,Rtt)[:<<1][:rnd]:sat -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -// Vector reduce add unsigned bytes: Rdd32[+]=vrmpybu(Rss32,Rtt32) -def : T_PP_pat ; -def : T_PPP_pat ; - -// Vector sum of absolute differences unsigned bytes: Rdd=vrsadub(Rss,Rtt) -def : T_PP_pat ; -def : T_PPP_pat ; - -// Vector absolute difference: Rdd=vabsdiffh(Rtt,Rss) -def : T_PP_pat ; - -// Vector absolute difference words: Rdd=vabsdiffw(Rtt,Rss) -def : T_PP_pat ; - -// Vector reduce complex multiply real or imaginary: -// Rdd[+]=vrcmpy[ir](Rss,Rtt[*]) -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; - -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; - -// Vector reduce halfwords -// Rdd[+]=vrmpyh(Rss,Rtt) -def : T_PP_pat ; -def : T_PPP_pat ; - -//===----------------------------------------------------------------------===// -// Vector Multipy with accumulation -//===----------------------------------------------------------------------===// - -// Vector multiply word by signed half with accumulation -// Rxx+=vmpyw[eo]h(Rss,Rtt)[:<<1][:rnd]:sat -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; - -// Vector multiply word by unsigned half with accumulation -// Rxx+=vmpyw[eo]uh(Rss,Rtt)[:<<1][:rnd]:sat -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; - -// Vector multiply even halfwords with accumulation -// Rxx+=vmpyeh(Rss,Rtt)[:<<1][:sat] -def : T_PPP_pat ; -def : T_PPP_pat ; -def : T_PPP_pat ; - -// Vector dual multiply with accumulation -// Rxx+=vdmpy(Rss,Rtt)[:sat] -def : T_PPP_pat ; -def : T_PPP_pat ; - -// Vector complex multiply real or imaginary with accumulation -// Rxx+=vcmpy[ir](Rss,Rtt):sat -def : T_PPP_pat ; -def : T_PPP_pat ; - -//===----------------------------------------------------------------------===// -// Add/Subtract halfword -// Rd=add(Rt.L,Rs.[HL])[:sat] -// Rd=sub(Rt.L,Rs.[HL])[:sat] -// Rd=add(Rt.[LH],Rs.[HL])[:sat][:<16] -// Rd=sub(Rt.[LH],Rs.[HL])[:sat][:<16] -//===----------------------------------------------------------------------===// - -//Rd=add(Rt.L,Rs.[LH]) -def : T_RR_pat ; -def : T_RR_pat ; - -//Rd=add(Rt.L,Rs.[LH]):sat -def : T_RR_pat ; -def : T_RR_pat ; - -//Rd=sub(Rt.L,Rs.[LH]) -def : T_RR_pat ; -def : T_RR_pat ; - -//Rd=sub(Rt.L,Rs.[LH]):sat -def : T_RR_pat ; -def : T_RR_pat ; - -//Rd=add(Rt.[LH],Rs.[LH]):<<16 -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -//Rd=sub(Rt.[LH],Rs.[LH]):<<16 -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -//Rd=add(Rt.[LH],Rs.[LH]):sat:<<16 -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -//Rd=sub(Rt.[LH],Rs.[LH]):sat:<<16 -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -// ALU64 / ALU / min max -def : T_RR_pat; -def : T_RR_pat; -def : T_RR_pat; -def : T_RR_pat; - -// Shift and accumulate -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; - -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; - -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; - -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; - -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; - -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; - -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; - -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; - -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; - -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; - -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; - -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; -def : T_PPI_pat ; - -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; - -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRR_pat ; - -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; - -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; -def : T_PPR_pat ; - -//******************************************************************* -// ALU32/ALU -//******************************************************************* -def : T_RR_pat; -def : T_RI_pat; -def : T_RR_pat; -def : T_IR_pat; -def : T_RR_pat; -def : T_RI_pat; -def : T_RR_pat; -def : T_RI_pat; -def : T_RR_pat; -def : T_RR_pat; + (MI I32:$Rs, I64:$Rt)>; + +def: Pat<(int_hexagon_A2_add IntRegs:$Rs, IntRegs:$Rt), + (A2_add IntRegs:$Rs, IntRegs:$Rt)>; +def: Pat<(int_hexagon_A2_addi IntRegs:$Rs, imm:$s16), + (A2_addi IntRegs:$Rs, imm:$s16)>; +def: Pat<(int_hexagon_A2_addp DoubleRegs:$Rs, DoubleRegs:$Rt), + (A2_addp DoubleRegs:$Rs, DoubleRegs:$Rt)>; + +def: Pat<(int_hexagon_A2_sub IntRegs:$Rs, IntRegs:$Rt), + (A2_sub IntRegs:$Rs, IntRegs:$Rt)>; +def: Pat<(int_hexagon_A2_subri imm:$s10, IntRegs:$Rs), + (A2_subri imm:$s10, IntRegs:$Rs)>; +def: Pat<(int_hexagon_A2_subp DoubleRegs:$Rs, DoubleRegs:$Rt), + (A2_subp DoubleRegs:$Rs, DoubleRegs:$Rt)>; + +def: Pat<(int_hexagon_M2_mpyi IntRegs:$Rs, IntRegs:$Rt), + (M2_mpyi IntRegs:$Rs, IntRegs:$Rt)>; +def: Pat<(int_hexagon_M2_mpyui IntRegs:$Rs, IntRegs:$Rt), // Same as M2_mpyi + (M2_mpyi IntRegs:$Rs, IntRegs:$Rt)>; +def: Pat<(int_hexagon_M2_mpysmi IntRegs:$Rs, imm:$s9), + (M2_mpysmi IntRegs:$Rs, imm:$s9)>; +def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$Rs, IntRegs:$Rt), + (M2_dpmpyss_s0 IntRegs:$Rs, IntRegs:$Rt)>; +def: Pat<(int_hexagon_M2_dpmpyuu_s0 IntRegs:$Rs, IntRegs:$Rt), + (M2_dpmpyuu_s0 IntRegs:$Rs, IntRegs:$Rt)>; + +def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$Rs, imm:$u5), + (S2_asl_i_r IntRegs:$Rs, imm:$u5)>; +def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$Rs, imm:$u5), + (S2_lsr_i_r IntRegs:$Rs, imm:$u5)>; +def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$Rs, imm:$u5), + (S2_asr_i_r IntRegs:$Rs, imm:$u5)>; +def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$Rs, imm:$u6), + (S2_asl_i_p DoubleRegs:$Rs, imm:$u6)>; +def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$Rs, imm:$u6), + (S2_lsr_i_p DoubleRegs:$Rs, imm:$u6)>; +def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$Rs, imm:$u6), + (S2_asr_i_p DoubleRegs:$Rs, imm:$u6)>; + +def: Pat<(int_hexagon_A2_and IntRegs:$Rs, IntRegs:$Rt), + (A2_and IntRegs:$Rs, IntRegs:$Rt)>; +def: Pat<(int_hexagon_A2_andir IntRegs:$Rs, imm:$s10), + (A2_andir IntRegs:$Rs, imm:$s10)>; +def: Pat<(int_hexagon_A2_or IntRegs:$Rs, IntRegs:$Rt), + (A2_or IntRegs:$Rs, IntRegs:$Rt)>; +def: Pat<(int_hexagon_A2_orir IntRegs:$Rs, imm:$s10), + (A2_orir IntRegs:$Rs, imm:$s10)>; +def: Pat<(int_hexagon_A2_xor IntRegs:$Rs, IntRegs:$Rt), + (A2_xor IntRegs:$Rs, IntRegs:$Rt)>; + +def: Pat<(int_hexagon_A2_sxtb IntRegs:$Rs), + (A2_sxtb IntRegs:$Rs)>; +def: Pat<(int_hexagon_A2_sxth IntRegs:$Rs), + (A2_sxth IntRegs:$Rs)>; +def: Pat<(int_hexagon_A2_zxtb IntRegs:$Rs), + (A2_zxtb IntRegs:$Rs)>; +def: Pat<(int_hexagon_A2_zxth IntRegs:$Rs), + (A2_zxth IntRegs:$Rs)>; // Assembler mapped from Rd32=not(Rs32) to Rd32=sub(#-1,Rs32) def : Pat <(int_hexagon_A2_not I32:$Rs), @@ -757,16 +109,6 @@ def : Pat <(int_hexagon_S5_vasrhrnd_goodsyntax I64:$Rs, u4_0ImmPred:$imm), def : Pat <(int_hexagon_S5_asrhub_rnd_sat_goodsyntax I64:$Rs, u4_0ImmPred:$imm), (S5_asrhub_rnd_sat I64:$Rs, (UDEC1 u4_0ImmPred:$imm))>; -// Transfer immediate -def : Pat <(int_hexagon_A2_tfril I32:$Rs, u16_0ImmPred:$Is), - (A2_tfril I32:$Rs, u16_0ImmPred:$Is)>; -def : Pat <(int_hexagon_A2_tfrih I32:$Rs, u16_0ImmPred:$Is), - (A2_tfrih I32:$Rs, u16_0ImmPred:$Is)>; - -// Transfer Register/immediate. -def : T_R_pat ; -def : T_I_pat ; - def ImmExt64: SDNodeXFormgetSExtValue(); return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i64); @@ -783,49 +125,6 @@ def ImmExt64: SDNodeXForm; -// Assembler mapped from Rdd32=Rss32 to Rdd32=combine(Rss.H32,Rss.L32) -def : Pat<(int_hexagon_A2_tfrp I64:$src), - (A2_combinew (HiReg I64:$src), (LoReg I64:$src))>; - -//******************************************************************* -// ALU32/PERM -//******************************************************************* -// Combine -def: T_RR_pat; -def: T_RR_pat; -def: T_RR_pat; -def: T_RR_pat; - -def: T_II_pat; - -// Mux -def : T_QRR_pat; -def : T_QRI_pat; -def : T_QIR_pat; -def : T_QII_pat; - -// Shift halfword -def : T_R_pat; -def : T_R_pat; - -// Sign/zero extend -def : T_R_pat; -def : T_R_pat; -def : T_R_pat; -def : T_R_pat; - -//******************************************************************* -// ALU32/PRED -//******************************************************************* -// Compare -def : T_Q_RR_pat; -def : T_Q_RR_pat; -def : T_Q_RR_pat; - -def : T_Q_RI_pat; -def : T_Q_RI_pat; -def : T_Q_RI_pat; - def : Pat <(int_hexagon_C2_cmpgei I32:$src1, s32_0ImmPred:$src2), (C2_tfrpr (C2_cmpgti I32:$src1, (SDEC1 s32_0ImmPred:$src2)))>; @@ -839,420 +138,6 @@ def : Pat <(int_hexagon_C2_cmplt I32:$src1, I32:$src2), def : Pat <(int_hexagon_C2_cmpltu I32:$src1, I32:$src2), (C2_tfrpr (C2_cmpgtu I32:$src2, I32:$src1))>; -//******************************************************************* -// ALU32/VH -//******************************************************************* -// Vector add, subtract, average halfwords -def: T_RR_pat; -def: T_RR_pat; -def: T_RR_pat; - -def: T_RR_pat; -def: T_RR_pat; -def: T_RR_pat; - -def: T_RR_pat; -def: T_RR_pat; -def: T_RR_pat; - -//******************************************************************* -// ALU64/ALU -//******************************************************************* -def: T_RR_pat; -def: T_RR_pat; -def: T_PP_pat; -def: T_PP_pat; - -def: T_PP_pat; -def: T_PP_pat; -def: T_PP_pat; - -def: T_Q_PP_pat; -def: T_Q_PP_pat; -def: T_Q_PP_pat; - -def: T_PP_pat; -def: T_RR_pat; - -//******************************************************************* -// ALU64/VB -//******************************************************************* -// ALU64 - Vector add -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -// ALU64 - Vector average -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -// ALU64 - Vector negative average -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -// ALU64 - Vector max -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -// ALU64 - Vector min -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -// ALU64 - Vector sub -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -// ALU64 - Vector compare bytes -def : T_Q_PP_pat ; -def : T_Q_PP_pat ; -def : T_Q_PP_pat ; - -// ALU64 - Vector compare halfwords -def : T_Q_PP_pat ; -def : T_Q_PP_pat ; -def : T_Q_PP_pat ; - -// ALU64 - Vector compare words -def : T_Q_PP_pat ; -def : T_Q_PP_pat ; -def : T_Q_PP_pat ; - -// ALU64 / VB / Vector mux. -def : T_QPP_pat ; - -// MPY - Multiply and use full result -// Rdd = mpy[u](Rs, Rt) -def : T_RR_pat ; -def : T_RR_pat ; - -// Complex multiply real or imaginary -def : T_RR_pat ; -def : T_RR_pat ; - -// Complex multiply -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -// Vector multiply halfwords -// Rdd=vmpyh(Rs,Rt)[:<<1]:sat -def : T_RR_pat ; -def : T_RR_pat ; - -// Rxx[+-]= mpy[u](Rs,Rt) -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -// Rxx[-+]=cmpy(Rs,Rt)[:<<1]:sat -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -// Rxx[-+]=cmpy(Rs,Rt*)[:<<1]:sat -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -// Rxx+=cmpy[ir](Rs,Rt) -def : T_PRR_pat ; -def : T_PRR_pat ; - -// Rxx+=vmpyh(Rs,Rt)[:<<1][:sat] -def : T_PRR_pat ; -def : T_PRR_pat ; -def : T_PRR_pat ; - -//******************************************************************* -// CR -//******************************************************************* -def: T_Q_Q_pat; -def: T_Q_Q_pat; -def: T_Q_Q_pat; -def: T_Q_Q_pat; - -def: T_Q_QQ_pat; -def: T_Q_QQ_pat; -def: T_Q_QQ_pat; -def: T_Q_QQ_pat; -def: T_Q_QQ_pat; - -// Multiply 32x32 and use lower result -def : T_RRI_pat ; -def : T_RRI_pat ; -def : T_RRR_pat ; - -// Subtract and accumulate -def : T_RRR_pat ; - -// Add and accumulate -def : T_RRR_pat ; -def : T_RRR_pat ; -def : T_RRI_pat ; -def : T_RRI_pat ; - -// XOR and XOR with destination -def : T_RRR_pat ; - -// Vector dual multiply with round and pack -def : T_PP_pat ; -def : T_PP_pat ; - -// Vector multiply halfwords with round and pack -def : T_RR_pat ; -def : T_RR_pat ; - -// Multiply and use lower result -def : T_RR_pat ; -def : T_RI_pat ; - -// Assembler mapped from Rd32=mpyui(Rs32,Rt32) to Rd32=mpyi(Rs32,Rt32) -def : T_RR_pat ; - -// Multiply and use upper result -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -// Complex multiply with round and pack -// Rxx32+=cmpy(Rs32,[*]Rt32:<<1]:rnd:sat -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -//******************************************************************* -// STYPE/ALU -//******************************************************************* -def : T_P_pat ; -def : T_P_pat ; -def : T_P_pat ; - -//******************************************************************* -// STYPE/BIT -//******************************************************************* - -// Count leading/trailing -def: T_R_pat; -def: T_P_pat; -def: T_R_pat; -def: T_P_pat; -def: T_R_pat; -def: T_P_pat; -def: T_R_pat; -def: T_R_pat; -def: T_R_pat; - -// Compare bit mask -def: T_RR_pat; -def: T_RI_pat; -def: T_RR_pat; - -// Vector shuffle -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; -def : T_PP_pat ; - -// Vector truncate -def : T_PP_pat ; -def : T_PP_pat ; - -// Linear feedback-shift Iteration. -def : T_PP_pat ; - -// Vector align -// Need custom lowering -def : T_PPQ_pat ; -def : T_PPI_pat ; - -// Vector splice -def : T_PPQ_pat ; -def : T_PPI_pat ; - -// Shift by immediate and add -def : T_RRI_pat; - -// Extract bitfield -def : T_PII_pat; -def : T_RII_pat; -def : T_RP_pat ; -def : T_PP_pat ; - -// Insert bitfield -def : Pat <(int_hexagon_S2_insert_rp I32:$src1, I32:$src2, I64:$src3), - (S2_insert_rp I32:$src1, I32:$src2, I64:$src3)>; - -def : Pat<(i64 (int_hexagon_S2_insertp_rp I64:$src1, I64:$src2, I64:$src3)), - (i64 (S2_insertp_rp I64:$src1, I64:$src2, I64:$src3))>; - -def : Pat<(int_hexagon_S2_insert I32:$src1, I32:$src2, - u5_0ImmPred:$src3, u5_0ImmPred:$src4), - (S2_insert I32:$src1, I32:$src2, - u5_0ImmPred:$src3, u5_0ImmPred:$src4)>; - -def : Pat<(i64 (int_hexagon_S2_insertp I64:$src1, I64:$src2, - u6_0ImmPred:$src3, u6_0ImmPred:$src4)), - (i64 (S2_insertp I64:$src1, I64:$src2, - u6_0ImmPred:$src3, u6_0ImmPred:$src4))>; - -// Innterleave/deinterleave -def : T_P_pat ; -def : T_P_pat ; - -// Set/Clear/Toggle Bit -def: T_RI_pat; -def: T_RI_pat; -def: T_RI_pat; - -def: T_RR_pat; -def: T_RR_pat; -def: T_RR_pat; - -// Test Bit -def: T_Q_RI_pat; -def: T_Q_RR_pat; - -//******************************************************************* -// STYPE/COMPLEX -//******************************************************************* -// Vector Complex conjugate -def : T_P_pat ; - -// Vector Complex rotate -def : T_PR_pat ; - -//******************************************************************* -// STYPE/PERM -//******************************************************************* - -// Vector saturate without pack -def : T_P_pat ; -def : T_P_pat ; -def : T_P_pat ; -def : T_P_pat ; - -//******************************************************************* -// STYPE/PRED -//******************************************************************* - -// Predicate transfer -def: Pat<(i32 (int_hexagon_C2_tfrpr I32:$Rs)), - (i32 (C2_tfrpr (C2_tfrrp I32:$Rs)))>; -def: Pat<(i32 (int_hexagon_C2_tfrrp I32:$Rs)), - (i32 (C2_tfrpr (C2_tfrrp I32:$Rs)))>; - -// Mask generate from predicate -def: Pat<(i64 (int_hexagon_C2_mask I32:$Rs)), - (i64 (C2_mask (C2_tfrrp I32:$Rs)))>; - -// Viterbi pack even and odd predicate bits -def: T_QQ_pat; - -//******************************************************************* -// STYPE/SHIFT -//******************************************************************* - -def : T_PI_pat ; -def : T_PI_pat ; -def : T_PI_pat ; - -def : T_PR_pat ; -def : T_PR_pat ; -def : T_PR_pat ; -def : T_PR_pat ; - -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; -def : T_RR_pat ; - -def : T_RR_pat ; -def : T_RR_pat ; - -def : T_R_pat ; -def : T_R_pat ; -def : T_R_pat ; -def : T_R_pat ; -def : T_R_pat ; -def : T_R_pat ; - -// Vector saturate and pack -def : T_R_pat ; -def : T_R_pat ; -def : T_P_pat ; -def : T_P_pat ; -def : T_P_pat ; -def : T_P_pat ; - -def : T_P_pat ; -def : T_P_pat ; -def : T_P_pat ; -def : T_P_pat ; -def : T_R_pat ; -def : T_R_pat ; - -def : T_R_pat ; -def : T_R_pat ; -def : T_R_pat ; - -def : T_R_pat ; - -def : T_P_pat ; -def : T_R_pat ; -def : T_R_pat ; -def : T_R_pat ; -def : T_R_pat ; - -// Vector arithmetic shift right by immediate with truncate and pack. -def : T_PI_pat; - -def : T_RI_pat ; -def : T_RI_pat ; -def : T_RI_pat ; -def : T_RI_pat ; -def : T_RI_pat ; - -// Shift left by immediate with saturation. -def : T_RI_pat ; - //===----------------------------------------------------------------------===// // Template 'def pat' to map tableidx[bhwd] intrinsics to :raw instructions. //===----------------------------------------------------------------------===// @@ -1277,11 +162,8 @@ def SDEC3 : SDNodeXForm; - +def : S2op_tableidx_pat ; def : S2op_tableidx_pat ; def : S2op_tableidx_pat ; -//******************************************************************* -// STYPE/VH -//******************************************************************* - -// Vector absolute value halfwords with and without saturation -// Rdd64=vabsh(Rss64)[:sat] -def : T_P_pat ; -def : T_P_pat ; - -// Vector shift halfwords by immediate -// Rdd64=[vaslh/vasrh/vlsrh](Rss64,u4) -def : T_PI_pat ; -def : T_PI_pat ; -def : T_PI_pat ; - -// Vector shift halfwords by register -// Rdd64=[vaslw/vasrw/vlslw/vlsrw](Rss64,Rt32) -def : T_PR_pat ; -def : T_PR_pat ; -def : T_PR_pat ; -def : T_PR_pat ; - -//******************************************************************* -// STYPE/VW -//******************************************************************* - -// Vector absolute value words with and without saturation -def : T_P_pat ; -def : T_P_pat ; - -// Vector shift words by immediate. -// Rdd64=[vasrw/vlsrw|vaslw](Rss64,u5) -def : T_PI_pat ; -def : T_PI_pat ; -def : T_PI_pat ; - -// Vector shift words by register. -// Rdd64=[vasrw/vlsrw|vaslw|vlslw](Rss64,Rt32) -def : T_PR_pat ; -def : T_PR_pat ; -def : T_PR_pat ; -def : T_PR_pat ; - -// Vector shift words with truncate and pack -def : T_PR_pat ; - // Load/store locked. def : T_R_pat; def : T_R_pat; @@ -1370,10 +206,13 @@ def: T_stc_pat; multiclass MaskedStore { def : Pat<(IntID HvxQR:$src1, IntRegs:$src2, HvxVR:$src3), - (MI HvxQR:$src1, IntRegs:$src2, #0, HvxVR:$src3)>; + (MI HvxQR:$src1, IntRegs:$src2, #0, HvxVR:$src3)>, + Requires<[UseHVX]>; + def : Pat<(!cast(IntID#"_128B") HvxQR:$src1, IntRegs:$src2, HvxVR:$src3), - (MI HvxQR:$src1, IntRegs:$src2, #0, HvxVR:$src3)>; + (MI HvxQR:$src1, IntRegs:$src2, #0, HvxVR:$src3)>, + Requires<[UseHVX]>; } defm : MaskedStore ; @@ -1398,5 +237,241 @@ def: T_R_pat; def: T_RR_pat; def: T_RP_pat; -include "HexagonIntrinsicsV5.td" -include "HexagonIntrinsicsV60.td" +// +// Patterns for optimizing code generations for HVX. + +def u3_64_ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)(64 - N->getSExtValue()); + return isUInt<3>(v); +}]>; + +def u3_128_ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)(128 - N->getSExtValue()); + return isUInt<3>(v); +}]>; + +def SUB_64_VAL : SDNodeXFormgetSExtValue(); + return CurDAG->getTargetConstant(64 - Imm, SDLoc(N), MVT::i32); +}]>; + +def SUB_128_VAL : SDNodeXFormgetSExtValue(); + return CurDAG->getTargetConstant(128 - Imm, SDLoc(N), MVT::i32); +}]>; + +let AddedComplexity = 100 in { +def : Pat <(v16i32 (int_hexagon_V6_lo (v32i32 HvxWR:$src1))), + (v16i32 (EXTRACT_SUBREG (v32i32 HvxWR:$src1), vsub_lo))>, + Requires<[UseHVX]>; + +def : Pat <(v16i32 (int_hexagon_V6_hi (v32i32 HvxWR:$src1))), + (v16i32 (EXTRACT_SUBREG (v32i32 HvxWR:$src1), vsub_hi))>, + Requires<[UseHVX]>; + +def : Pat <(v32i32 (int_hexagon_V6_lo_128B (v64i32 HvxWR:$src1))), + (v32i32 (EXTRACT_SUBREG (v64i32 HvxWR:$src1), vsub_lo))>, + Requires<[UseHVX]>; + +def : Pat <(v32i32 (int_hexagon_V6_hi_128B (v64i32 HvxWR:$src1))), + (v32i32 (EXTRACT_SUBREG (v64i32 HvxWR:$src1), vsub_hi))>, + Requires<[UseHVX]>; +} + +def : Pat <(v512i1 (bitconvert (v16i32 HvxVR:$src1))), + (v512i1 (V6_vandvrt (v16i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(v512i1 (bitconvert (v32i16 HvxVR:$src1))), + (v512i1 (V6_vandvrt (v32i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(v512i1 (bitconvert (v64i8 HvxVR:$src1))), + (v512i1 (V6_vandvrt (v64i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(v16i32 (bitconvert (v512i1 HvxQR:$src1))), + (v16i32 (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(v32i16 (bitconvert (v512i1 HvxQR:$src1))), + (v32i16 (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(v64i8 (bitconvert (v512i1 HvxQR:$src1))), + (v64i8 (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(v1024i1 (bitconvert (v32i32 HvxVR:$src1))), + (v1024i1 (V6_vandvrt (v32i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(v1024i1 (bitconvert (v64i16 HvxVR:$src1))), + (v1024i1 (V6_vandvrt (v64i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(v1024i1 (bitconvert (v128i8 HvxVR:$src1))), + (v1024i1 (V6_vandvrt (v128i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(v32i32 (bitconvert (v1024i1 HvxQR:$src1))), + (v32i32 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(v64i16 (bitconvert (v1024i1 HvxQR:$src1))), + (v64i16 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(v128i8 (bitconvert (v1024i1 HvxQR:$src1))), + (v128i8 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +let AddedComplexity = 140 in { +def : Pat <(store (v512i1 HvxQR:$src1), (i32 IntRegs:$addr)), + (V6_vS32b_ai IntRegs:$addr, 0, + (v16i32 (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101))))>, + Requires<[UseHVX]>; + +def : Pat <(v512i1 (load (i32 IntRegs:$addr))), + (v512i1 (V6_vandvrt + (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; + +def : Pat <(store (v1024i1 HvxQR:$src1), (i32 IntRegs:$addr)), + (V6_vS32b_ai IntRegs:$addr, 0, + (v32i32 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101))))>, + Requires<[UseHVX]>; + +def : Pat <(v1024i1 (load (i32 IntRegs:$addr))), + (v1024i1 (V6_vandvrt + (v32i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVX]>; +} + +def: Pat<(v64i16 (trunc v64i32:$Vdd)), + (v64i16 (V6_vpackwh_sat + (v32i32 (V6_hi HvxWR:$Vdd)), + (v32i32 (V6_lo HvxWR:$Vdd))))>, + Requires<[UseHVX]>; + +def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, IntRegs:$src2), + (S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV55]>; + +multiclass T_VI_pat { + def: Pat<(IntID HvxVR:$src1, u3_0ImmPred:$src2), + (MI HvxVR:$src1, HvxVR:$src1, u3_0ImmPred:$src2)>, + Requires<[UseHVX]>; + + def: Pat<(!cast(IntID#"_128B") HvxVR:$src1, u3_0ImmPred:$src2), + (MI HvxVR:$src1, HvxVR:$src1, u3_0ImmPred:$src2)>, + Requires<[UseHVX]>; +} + +multiclass T_VI_inv_pat { + def: Pat<(IntID HvxVR:$src1, u3_64_ImmPred:$src2), + (MI HvxVR:$src1, HvxVR:$src1, + (SUB_64_VAL u3_64_ImmPred:$src2))>, + Requires<[UseHVX]>; + + def: Pat<(!cast(IntID#"_128B") HvxVR:$src1, u3_128_ImmPred:$src2), + (MI HvxVR:$src1, HvxVR:$src1, (SUB_128_VAL u3_128_ImmPred:$src2))>, + Requires<[UseHVX]>; +} + +multiclass T_VVI_pat { + def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), + (MI HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, + Requires<[UseHVX]>; + + def: Pat<(!cast(IntID#"_128B") HvxVR:$src1, HvxVR:$src2, + u3_0ImmPred:$src3), + (MI HvxVR:$src1, HvxVR:$src2, + u3_0ImmPred:$src3)>, + Requires<[UseHVX]>; +} + +multiclass T_VVI_inv_pat { + def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, u3_64_ImmPred:$src3), + (MI HvxVR:$src1, HvxVR:$src2, + (SUB_64_VAL u3_64_ImmPred:$src3))>, + Requires<[UseHVX]>; + + def: Pat<(!cast(IntID#"_128B") HvxVR:$src1, HvxVR:$src2, + u3_128_ImmPred:$src3), + (MI HvxVR:$src1, HvxVR:$src2, + (SUB_128_VAL u3_128_ImmPred:$src3))>, + Requires<[UseHVX]>; +} + +multiclass T_VVR_pat { + def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), + (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, + Requires<[UseHVX]>; + + def: Pat<(!cast(IntID#"_128B") HvxVR:$src1, HvxVR:$src2, + IntRegs:$src3), + (MI HvxVR:$src1, HvxVR:$src2, + IntRegs:$src3)>, + Requires<[UseHVX]>; +} + +defm : T_VI_pat ; +defm : T_VI_inv_pat ; + +defm : T_VVI_pat ; +defm : T_VVI_inv_pat ; +defm : T_VVI_inv_pat ; +defm : T_VVR_pat ; +defm : T_VVI_pat ; +defm : T_VVI_inv_pat ; +defm : T_VVI_inv_pat ; +defm : T_VVR_pat ; + +def: Pat<(int_hexagon_V6_vd0), + (V6_vd0)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vd0_128B ), + (V6_vd0)>, Requires<[HasV60, UseHVX128B]>; + +def: Pat<(int_hexagon_V6_vdd0), + (V6_vdd0)>, Requires<[HasV65, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdd0_128B), + (V6_vdd0)>, Requires<[HasV65, UseHVX128B]>; + +def: Pat<(int_hexagon_V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), + (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), + (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), + (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), + (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), + (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), + (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), + (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), + (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), + (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermw_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), + (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermh_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), + (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermw_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), + (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermh_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4), + (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), + (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermhq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5), + (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermhw_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), + (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermhw_add_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4), + (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX]>; +def: Pat<(int_hexagon_V6_vscattermhwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5), + (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX]>; + +include "HexagonDepMapAsm2Intrin.td" diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 29c044b3b7295761ed0f20bbb4276c98bfa7dc33..c3a5bd5d57bfeee7c32697458f8dd0035d349179 100644 --- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -502,7 +502,8 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, MIB.add(ImmOp); OpStart = 4; Changed = true; - } else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) { + } else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset && + OldMI->getOperand(2).isImm()) { short NewOpCode = HII->changeAddrMode_io_abs(*OldMI); assert(NewOpCode >= 0 && "Invalid New opcode\n"); MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode)) @@ -518,17 +519,19 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n"); LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n"); - } else if (ImmOpNum == 2 && OldMI->getOperand(3).getImm() == 0) { - short NewOpCode = HII->changeAddrMode_rr_io(*OldMI); - assert(NewOpCode >= 0 && "Invalid New opcode\n"); - MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode)); - MIB.add(OldMI->getOperand(0)); - MIB.add(OldMI->getOperand(1)); - MIB.add(ImmOp); - OpStart = 4; - Changed = true; - LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n"); - LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n"); + } else if (ImmOpNum == 2) { + if (OldMI->getOperand(3).isImm() && OldMI->getOperand(3).getImm() == 0) { + short NewOpCode = HII->changeAddrMode_rr_io(*OldMI); + assert(NewOpCode >= 0 && "Invalid New opcode\n"); + MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode)); + MIB.add(OldMI->getOperand(0)); + MIB.add(OldMI->getOperand(1)); + MIB.add(ImmOp); + OpStart = 4; + Changed = true; + LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n"); + LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n"); + } } if (Changed) @@ -758,11 +761,13 @@ bool HexagonOptAddrMode::processBlock(NodeAddr BA) { // This could happen, for example, when DefR = R4, but the used // register is D2. + // Change UseMI if replacement is possible. If any replacement failed, + // or wasn't attempted, make sure to keep the TFR. + bool Xformed = false; if (UseMOnum >= 0 && InstrEvalResult[UseMI]) - // Change UseMI if replacement is possible. - Changed |= xformUseMI(MI, UseMI, UseN, UseMOnum); - else - KeepTfr = true; + Xformed = xformUseMI(MI, UseMI, UseN, UseMOnum); + Changed |= Xformed; + KeepTfr |= !Xformed; } if (!KeepTfr) Deleted.insert(MI); diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index ddf5a9ca3645f958c1c11638078548f04e7b2a5a..311b5a702d5a7d5faaf0c980e2b402b995721be5 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -218,6 +218,8 @@ def I1toI32: OutPatFrag<(ops node:$Rs), (C2_muxii (i1 $Rs), 1, 0)>; def I32toI1: OutPatFrag<(ops node:$Rs), (i1 (C2_cmpgtui (i32 $Rs), (i32 0)))>; def ToZext64: OutPatFrag<(ops node:$Rs), (i64 (A4_combineir 0, (i32 $Rs)))>; def ToSext64: OutPatFrag<(ops node:$Rs), (i64 (A2_sxtw (i32 $Rs)))>; +def ToAext64: OutPatFrag<(ops node:$Rs), + (REG_SEQUENCE DoubleRegs, (i32 (IMPLICIT_DEF)), isub_hi, (i32 $Rs), isub_lo)>; def Combinew: OutPatFrag<(ops node:$Rs, node:$Rt), (REG_SEQUENCE DoubleRegs, $Rs, isub_hi, $Rt, isub_lo)>; @@ -246,6 +248,9 @@ def Aext64: PatFrag<(ops node:$Rs), (i64 (anyext node:$Rs))>; def Zext64: PatFrag<(ops node:$Rs), (i64 (zext node:$Rs))>; def Sext64: PatLeaf<(i64 Usxtw:$Rs)>; +def azext: PatFrags<(ops node:$Rs), [(zext node:$Rs), (anyext node:$Rs)]>; +def asext: PatFrags<(ops node:$Rs), [(sext node:$Rs), (anyext node:$Rs)]>; + def: Pat<(IsOrAdd (i32 AddrFI:$Rs), s32_0ImmPred:$off), (PS_fi (i32 AddrFI:$Rs), imm:$off)>; @@ -416,52 +421,48 @@ def: Pat<(sext_inreg I64:$Rs, i32), (A2_sxtw (LoReg $Rs))>; def: Pat<(sext_inreg I64:$Rs, i16), (A2_sxtw (A2_sxth (LoReg $Rs)))>; def: Pat<(sext_inreg I64:$Rs, i8), (A2_sxtw (A2_sxtb (LoReg $Rs)))>; -def: Pat<(i64 (sext I1:$Pu)), - (Combinew (C2_muxii PredRegs:$Pu, -1, 0), - (C2_muxii PredRegs:$Pu, -1, 0))>; - -def: Pat<(i32 (sext I1:$Pu)), (C2_muxii I1:$Pu, -1, 0)>; -def: Pat<(i32 (zext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>; -def: Pat<(i64 (zext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>; -def: Pat<(v2i16 (sext V2I1:$Pu)), (S2_vtrunehb (C2_mask V2I1:$Pu))>; -def: Pat<(v2i32 (sext V2I1:$Pu)), (C2_mask V2I1:$Pu)>; -def: Pat<(v4i8 (sext V4I1:$Pu)), (S2_vtrunehb (C2_mask V4I1:$Pu))>; -def: Pat<(v4i16 (sext V4I1:$Pu)), (C2_mask V4I1:$Pu)>; -def: Pat<(v8i8 (sext V8I1:$Pu)), (C2_mask V8I1:$Pu)>; - def: Pat<(i64 (sext I32:$Rs)), (A2_sxtw I32:$Rs)>; def: Pat<(Zext64 I32:$Rs), (ToZext64 $Rs)>; def: Pat<(Aext64 I32:$Rs), (ToZext64 $Rs)>; def: Pat<(i32 (trunc I64:$Rs)), (LoReg $Rs)>; -def: Pat<(i1 (trunc I64:$Rs)), (C2_tfrrp (LoReg $Rs))>; +def: Pat<(i1 (trunc I32:$Rs)), (S2_tstbit_i I32:$Rs, 0)>; +def: Pat<(i1 (trunc I64:$Rs)), (S2_tstbit_i (LoReg $Rs), 0)>; let AddedComplexity = 20 in { def: Pat<(and I32:$Rs, 255), (A2_zxtb I32:$Rs)>; def: Pat<(and I32:$Rs, 65535), (A2_zxth I32:$Rs)>; } -def: Pat<(i32 (anyext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>; -def: Pat<(i64 (anyext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>; +// Extensions from i1 or vectors of i1. +def: Pat<(i32 (azext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>; +def: Pat<(i64 (azext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>; +def: Pat<(i32 (sext I1:$Pu)), (C2_muxii I1:$Pu, -1, 0)>; +def: Pat<(i64 (sext I1:$Pu)), (Combinew (C2_muxii PredRegs:$Pu, -1, 0), + (C2_muxii PredRegs:$Pu, -1, 0))>; + +def: Pat<(v2i16 (sext V2I1:$Pu)), (S2_vtrunehb (C2_mask V2I1:$Pu))>; +def: Pat<(v2i32 (sext V2I1:$Pu)), (C2_mask V2I1:$Pu)>; +def: Pat<(v4i8 (sext V4I1:$Pu)), (S2_vtrunehb (C2_mask V4I1:$Pu))>; +def: Pat<(v4i16 (sext V4I1:$Pu)), (C2_mask V4I1:$Pu)>; +def: Pat<(v8i8 (sext V8I1:$Pu)), (C2_mask V8I1:$Pu)>; def Vsplatpi: OutPatFrag<(ops node:$V), (Combinew (A2_tfrsi $V), (A2_tfrsi $V))>; -def: Pat<(v8i8 (zext V8I1:$Pu)), - (A2_andp (C2_mask V8I1:$Pu), (Vsplatpi (i32 0x01010101)))>; -def: Pat<(v4i16 (zext V4I1:$Pu)), - (A2_andp (C2_mask V4I1:$Pu), (Vsplatpi (i32 0x00010001)))>; -def: Pat<(v2i32 (zext V2I1:$Pu)), - (A2_andp (C2_mask V2I1:$Pu), (A2_combineii (i32 1), (i32 1)))>; -def: Pat<(v4i8 (zext V4I1:$Pu)), - (A2_andir (LoReg (C2_mask V4I1:$Pu)), (i32 0x01010101))>; -def: Pat<(v2i16 (zext V2I1:$Pu)), +def: Pat<(v2i16 (azext V2I1:$Pu)), (A2_andir (LoReg (C2_mask V2I1:$Pu)), (i32 0x00010001))>; +def: Pat<(v2i32 (azext V2I1:$Pu)), + (A2_andp (C2_mask V2I1:$Pu), (A2_combineii (i32 1), (i32 1)))>; +def: Pat<(v4i8 (azext V4I1:$Pu)), + (A2_andir (LoReg (C2_mask V4I1:$Pu)), (i32 0x01010101))>; +def: Pat<(v4i16 (azext V4I1:$Pu)), + (A2_andp (C2_mask V4I1:$Pu), (Vsplatpi (i32 0x00010001)))>; +def: Pat<(v8i8 (azext V8I1:$Pu)), + (A2_andp (C2_mask V8I1:$Pu), (Vsplatpi (i32 0x01010101)))>; -def: Pat<(v4i16 (zext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>; -def: Pat<(v2i32 (zext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>; -def: Pat<(v4i16 (anyext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>; -def: Pat<(v2i32 (anyext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>; +def: Pat<(v4i16 (azext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>; +def: Pat<(v2i32 (azext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>; def: Pat<(v4i16 (sext V4I8:$Rs)), (S2_vsxtbh V4I8:$Rs)>; def: Pat<(v2i32 (sext V2I16:$Rs)), (S2_vsxthw V2I16:$Rs)>; @@ -812,9 +813,9 @@ def: Pat<(select (not I1:$Pu), F32:$Rt, f32ImmPred:$I), (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>; def: Pat<(select I1:$Pu, V4I8:$Rs, V4I8:$Rt), - (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>; + (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>; def: Pat<(select I1:$Pu, V2I16:$Rs, V2I16:$Rt), - (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>; + (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>; def: Pat<(select I1:$Pu, V2I32:$Rs, V2I32:$Rt), (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)), (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>; @@ -1169,6 +1170,18 @@ def: Pat<(srl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c))), def: Pat<(shl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c))), (S2_asl_i_vh V4I16:$b, imm:$c)>; +def: Pat<(HexagonVASR V2I16:$Rs, u4_0ImmPred:$S), + (LoReg (S2_asr_i_vh (ToAext64 $Rs), imm:$S))>; +def: Pat<(HexagonVASL V2I16:$Rs, u4_0ImmPred:$S), + (LoReg (S2_asl_i_vh (ToAext64 $Rs), imm:$S))>; +def: Pat<(HexagonVLSR V2I16:$Rs, u4_0ImmPred:$S), + (LoReg (S2_lsr_i_vh (ToAext64 $Rs), imm:$S))>; +def: Pat<(HexagonVASR V2I16:$Rs, I32:$Rt), + (LoReg (S2_asr_i_vh (ToAext64 $Rs), I32:$Rt))>; +def: Pat<(HexagonVASL V2I16:$Rs, I32:$Rt), + (LoReg (S2_asl_i_vh (ToAext64 $Rs), I32:$Rt))>; +def: Pat<(HexagonVLSR V2I16:$Rs, I32:$Rt), + (LoReg (S2_lsr_i_vh (ToAext64 $Rs), I32:$Rt))>; // --(9) Arithmetic/bitwise ---------------------------------------------- // @@ -1259,12 +1272,19 @@ def: OpR_RR_pat, f32, F32>; def: OpR_RR_pat, f32, F32>; def: OpR_RR_pat, f32, F32>; +let Predicates = [HasV66] in { + def: OpR_RR_pat, f64, F64>; + def: OpR_RR_pat, f64, F64>; +} + // In expressions like a0*b0 + a1*b1 + ..., prefer to generate multiply-add, // over add-add with individual multiplies as inputs. let AddedComplexity = 10 in { def: AccRRI_pat, I32, u32_0ImmPred>; def: AccRRI_pat, I32, u32_0ImmPred>; def: AccRRR_pat, I32, I32, I32>; + let Predicates = [HasV66] in + def: AccRRR_pat, I32, I32, I32>; } def: AccRRI_pat, I32, s32_0ImmPred>; @@ -1506,9 +1526,9 @@ def: Pat<(add V2I32:$Rx, (mul V2I32:$Rs, V2I32:$Rt)), // Add/subtract two v4i8: Hexagon does not have an insn for this one, so // we use the double add v8i8, and use only the low part of the result. def: Pat<(add V4I8:$Rs, V4I8:$Rt), - (LoReg (A2_vaddub (ToZext64 $Rs), (ToZext64 $Rt)))>; + (LoReg (A2_vaddub (ToAext64 $Rs), (ToAext64 $Rt)))>; def: Pat<(sub V4I8:$Rs, V4I8:$Rt), - (LoReg (A2_vsubub (ToZext64 $Rs), (ToZext64 $Rt)))>; + (LoReg (A2_vsubub (ToAext64 $Rs), (ToAext64 $Rt)))>; // Use M2_vmpy2s_s0 for half-word vector multiply. It multiplies two // half-words, and saturates the result to a 32-bit value, except the @@ -1857,10 +1877,10 @@ let AddedComplexity = 20 in { } let AddedComplexity = 30 in { - defm: Loadxim_pat; - defm: Loadxim_pat; - defm: Loadxim_pat; - defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; defm: Loadxim_pat; defm: Loadxim_pat; defm: Loadxim_pat; @@ -1895,13 +1915,13 @@ let AddedComplexity = 60 in { def: Loadxum_pat; def: Loadxum_pat; - def: Loadxum_pat; + def: Loadxum_pat; def: Loadxum_pat; def: Loadxum_pat; - def: Loadxum_pat; + def: Loadxum_pat; def: Loadxum_pat; def: Loadxum_pat; - def: Loadxum_pat; + def: Loadxum_pat; } let AddedComplexity = 40 in { @@ -1941,25 +1961,25 @@ let AddedComplexity = 20 in { } let AddedComplexity = 40 in { - def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; def: Loadxrm_shl_pat; def: Loadxrm_shl_pat; - def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; def: Loadxrm_shl_pat; def: Loadxrm_shl_pat; - def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; def: Loadxrm_shl_pat; def: Loadxrm_shl_pat; } let AddedComplexity = 20 in { - def: Loadxrm_add_pat; + def: Loadxrm_add_pat; def: Loadxrm_add_pat; def: Loadxrm_add_pat; - def: Loadxrm_add_pat; + def: Loadxrm_add_pat; def: Loadxrm_add_pat; def: Loadxrm_add_pat; - def: Loadxrm_add_pat; + def: Loadxrm_add_pat; def: Loadxrm_add_pat; def: Loadxrm_add_pat; } @@ -1991,13 +2011,13 @@ let AddedComplexity = 60 in { } let AddedComplexity = 30 in { - def: Loadam_pat; + def: Loadam_pat; def: Loadam_pat; def: Loadam_pat; - def: Loadam_pat; + def: Loadam_pat; def: Loadam_pat; def: Loadam_pat; - def: Loadam_pat; + def: Loadam_pat; def: Loadam_pat; def: Loadam_pat; @@ -2033,13 +2053,13 @@ let AddedComplexity = 100 in { } let AddedComplexity = 70 in { - def: Loadam_pat; + def: Loadam_pat; def: Loadam_pat; def: Loadam_pat; - def: Loadam_pat; + def: Loadam_pat; def: Loadam_pat; def: Loadam_pat; - def: Loadam_pat; + def: Loadam_pat; def: Loadam_pat; def: Loadam_pat; diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td index 6935e3b7bebca915214d196d2bdb25a7ae515037..b9748c7e189c303dccaa6b977645160fbe4ca12e 100644 --- a/lib/Target/Hexagon/HexagonPseudo.td +++ b/lib/Target/Hexagon/HexagonPseudo.td @@ -526,11 +526,11 @@ let isCodeGenOnly = 1, isPseudo = 1, Defs = [CS], Uses = [CS], addrMode = PostInc, accessSize = MS, hasSideEffects = 0 in { def NAME#_pci : LDInst<(outs RC:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Cs), - ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_4403ca65>; + ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_e93a3d71>; def NAME#_pcr : LDInst<(outs RC:$Rd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Cs), - ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_2fc0c436>; + ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_44d3da28>; } } @@ -547,11 +547,11 @@ let isCodeGenOnly = 1, isPseudo = 1, Defs = [CS], Uses = [CS], addrMode = PostInc, accessSize = MS, hasSideEffects = 0 in { def NAME#_pci : STInst<(outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, RC:$Rt32, IntRegs:$Cs), - ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_9fdb5406>; + ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_e86aa961>; def NAME#_pcr : STInst<(outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, RC:$Rt32, IntRegs:$Cs), - ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_f86c328a>; + ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_da97ee82>; } } diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 545def45a1c307d669f7da63ac80cddf4b4c19b5..9b8f4e07376fca35520aee0ceca56750ded42cd3 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -312,6 +312,7 @@ unsigned HexagonRegisterInfo::getHexagonSubRegIndex( static const unsigned ISub[] = { Hexagon::isub_lo, Hexagon::isub_hi }; static const unsigned VSub[] = { Hexagon::vsub_lo, Hexagon::vsub_hi }; + static const unsigned WSub[] = { Hexagon::wsub_lo, Hexagon::wsub_hi }; switch (RC.getID()) { case Hexagon::CtrRegs64RegClassID: @@ -319,6 +320,8 @@ unsigned HexagonRegisterInfo::getHexagonSubRegIndex( return ISub[GenIdx]; case Hexagon::HvxWRRegClassID: return VSub[GenIdx]; + case Hexagon::HvxVQRRegClassID: + return WSub[GenIdx]; } if (const TargetRegisterClass *SuperRC = *RC.getSuperClasses()) diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td index 1fe1ef4ac572c317083345aff0b7dfb76f7ee954..da90911e2c0514cba15f6b04ff1f2e5a8edf464d 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -82,13 +82,14 @@ let Namespace = "Hexagon" in { def isub_hi : SubRegIndex<32, 32>; def vsub_lo : SubRegIndex<512>; def vsub_hi : SubRegIndex<512, 512>; + def wsub_lo : SubRegIndex<1024>; + def wsub_hi : SubRegIndex<1024, 1024>; def subreg_overflow : SubRegIndex<1, 0>; // Integer registers. foreach i = 0-28 in { def R#i : Ri, DwarfRegNum<[i]>; } - def R29 : Ri<29, "r29", ["sp"]>, DwarfRegNum<[29]>; def R30 : Ri<30, "r30", ["fp"]>, DwarfRegNum<[30]>; def R31 : Ri<31, "r31", ["lr"]>, DwarfRegNum<[31]>; @@ -206,6 +207,18 @@ let Namespace = "Hexagon" in { def W15 : Rd<30, "v31:30", [V30, V31]>, DwarfRegNum<[129]>; } + // Aliases of the V* registers used to hold quad vec values. + let SubRegIndices = [wsub_lo, wsub_hi], CoveredBySubRegs = 1 in { + def VQ0 : Rd< 0, "v3:0", [W0, W1]>, DwarfRegNum<[252]>; + def VQ1 : Rd< 4, "v7:4", [W2, W3]>, DwarfRegNum<[253]>; + def VQ2 : Rd< 8, "v11:8", [W4, W5]>, DwarfRegNum<[254]>; + def VQ3 : Rd<12, "v15:12", [W6, W7]>, DwarfRegNum<[255]>; + def VQ4 : Rd<16, "v19:16", [W8, W9]>, DwarfRegNum<[256]>; + def VQ5 : Rd<20, "v23:20", [W10, W11]>, DwarfRegNum<[257]>; + def VQ6 : Rd<24, "v27:24", [W12, W13]>, DwarfRegNum<[258]>; + def VQ7 : Rd<28, "v31:28", [W14, W15]>, DwarfRegNum<[259]>; + } + // Vector Predicate registers. def Q0 : Rq<0, "q0">, DwarfRegNum<[131]>; def Q1 : Rq<1, "q1">, DwarfRegNum<[132]>; @@ -295,29 +308,6 @@ def VecQ32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], // HVX register classes -// Register classes. -// -// FIXME: the register order should be defined in terms of the preferred -// allocation order... -// -def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32, - (add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28), - R10, R11, R29, R30, R31)>; - -// Registers are listed in reverse order for allocation preference reasons. -def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32, - (add R23, R22, R21, R20, R19, R18, R17, R16, - R7, R6, R5, R4, R3, R2, R1, R0)>; - -def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32, - (add R7, R6, R5, R4, R3, R2, R1, R0)> ; - -def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64, - (add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>; - -def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64, - (add D11, D10, D9, D8, D3, D2, D1, D0)>; - def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32], 512, (add (sequence "V%u", 0, 31), VTMP)> { let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], @@ -336,6 +326,32 @@ def HvxQR : RegisterClass<"Hexagon", [VecI1, VecQ8, VecQ16, VecQ32], 512, [RegInfo<512,512,512>, RegInfo<1024,1024,1024>, RegInfo<512,512,512>]>; } +def HvxVQR : RegisterClass<"Hexagon", [untyped], 2048, + (add (sequence "VQ%u", 0, 7))> { + let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], + [RegInfo<2048,2048,2048>, RegInfo<4096,4096,4096>, RegInfo<2048,2048,2048>]>; +} + +// Core register classes + +def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32, + (add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28), + R10, R11, R29, R30, R31)>; + +// Registers are listed in reverse order for allocation preference reasons. +def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32, + (add R23, R22, R21, R20, R19, R18, R17, R16, + R7, R6, R5, R4, R3, R2, R1, R0)>; + +def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32, + (add R7, R6, R5, R4, R3, R2, R1, R0)> ; + +def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64, + (add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>; + +def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64, + (add D11, D10, D9, D8, D3, D2, D1, D0)>; + let Size = 32 in def PredRegs : RegisterClass<"Hexagon", [i1, v2i1, v4i1, v8i1, v4i8, v2i16, i32], 32, (add P0, P1, P2, P3)>; diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td index fa4f9ca639ccf209ef2e51716c55465c952c0517..1024198e9b3ff9f9b631d586aecb504b21cdd12a 100644 --- a/lib/Target/Hexagon/HexagonSchedule.td +++ b/lib/Target/Hexagon/HexagonSchedule.td @@ -27,6 +27,7 @@ def CVI_SHIFT : FuncUnit; def CVI_MPY0 : FuncUnit; def CVI_MPY1 : FuncUnit; def CVI_LD : FuncUnit; +def CVI_ZW : FuncUnit; // Z register write port // Combined functional units. def CVI_XLSHF : FuncUnit; @@ -84,3 +85,9 @@ include "HexagonScheduleV62.td" //===----------------------------------------------------------------------===// include "HexagonScheduleV65.td" + +//===----------------------------------------------------------------------===// +// V66 Machine Info + +//===----------------------------------------------------------------------===// + +include "HexagonScheduleV66.td" diff --git a/lib/Target/Hexagon/HexagonScheduleV60.td b/lib/Target/Hexagon/HexagonScheduleV60.td index a2544c92a72c05299b1fca4363bba1a2c0f95ab1..861a8d2b0339d7ceb960e10d7050b0e77c11fb00 100644 --- a/lib/Target/Hexagon/HexagonScheduleV60.td +++ b/lib/Target/Hexagon/HexagonScheduleV60.td @@ -65,7 +65,7 @@ def HexagonItinerariesV60 : ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL, - CVI_ALL_NOMEM], + CVI_ALL_NOMEM, CVI_ZW], [Hex_FWD, HVX_FWD], HexagonV60ItinList.ItinList>; def HexagonModelV60 : SchedMachineModel { diff --git a/lib/Target/Hexagon/HexagonScheduleV62.td b/lib/Target/Hexagon/HexagonScheduleV62.td index a0a8595f185fb0fdd9da145b90aeaa624b53ff75..1c274191277ca1dbede467254477d3c5ce48ecc0 100644 --- a/lib/Target/Hexagon/HexagonScheduleV62.td +++ b/lib/Target/Hexagon/HexagonScheduleV62.td @@ -21,7 +21,7 @@ def HexagonItinerariesV62 : ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL, - CVI_ALL_NOMEM], + CVI_ALL_NOMEM, CVI_ZW], [Hex_FWD, HVX_FWD], HexagonV62ItinList.ItinList>; def HexagonModelV62 : SchedMachineModel { diff --git a/lib/Target/Hexagon/HexagonScheduleV65.td b/lib/Target/Hexagon/HexagonScheduleV65.td index e3b1313923f5c7a68f9391f653d471abdb7cc2d3..46a79d521795cef9b050e706ff7f59c09604240c 100644 --- a/lib/Target/Hexagon/HexagonScheduleV65.td +++ b/lib/Target/Hexagon/HexagonScheduleV65.td @@ -23,7 +23,7 @@ def HexagonItinerariesV65 : ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL, - CVI_ALL_NOMEM], + CVI_ALL_NOMEM, CVI_ZW], [Hex_FWD, HVX_FWD], HexagonV65ItinList.ItinList>; diff --git a/lib/Target/Hexagon/HexagonScheduleV66.td b/lib/Target/Hexagon/HexagonScheduleV66.td new file mode 100644 index 0000000000000000000000000000000000000000..38e3d21d370174133bf010918b22cb1651b35565 --- /dev/null +++ b/lib/Target/Hexagon/HexagonScheduleV66.td @@ -0,0 +1,41 @@ +//=-HexagonScheduleV66.td - HexagonV66 Scheduling Definitions *- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// +// ScalarItin and HVXItin contain some old itineraries +// still used by a handful of instructions. Hopefully, we will be able +// to get rid of them soon. + +def HexagonV66ItinList : DepScalarItinV66, ScalarItin, + DepHVXItinV66, HVXItin, PseudoItin { + list ItinList = + !listconcat(DepScalarItinV66_list, ScalarItin_list, + DepHVXItinV66_list, HVXItin_list, PseudoItin_list); +} + +def HexagonItinerariesV66 : + ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, + CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, + CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL, + CVI_ALL_NOMEM, CVI_ZW], + [Hex_FWD, HVX_FWD], + HexagonV66ItinList.ItinList>; + +def HexagonModelV66 : SchedMachineModel { + // Max issue per cycle == bundle width. + let IssueWidth = 4; + let Itineraries = HexagonItinerariesV66; + let LoadLatency = 1; + let CompleteModel = 0; +} + +//===----------------------------------------------------------------------===// +// Hexagon V66 Resource Definitions - +//===----------------------------------------------------------------------===// + diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp index 68e276be0f691c15ce3831024e6eaf71afb83bda..9c77135c2f2f686580d2c12463cec38a7768a25e 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -98,6 +98,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { {"hexagonv60", Hexagon::ArchEnum::V60}, {"hexagonv62", Hexagon::ArchEnum::V62}, {"hexagonv65", Hexagon::ArchEnum::V65}, + {"hexagonv66", Hexagon::ArchEnum::V66}, }; auto FoundIt = CpuTable.find(CPUString); @@ -275,11 +276,11 @@ void HexagonSubtarget::BankConflictMutation::apply(ScheduleDAGInstrs *DAG) { if (!L0.mayLoad() || L0.mayStore() || HII.getAddrMode(L0) != HexagonII::BaseImmOffset) continue; - int Offset0; + int64_t Offset0; unsigned Size0; - unsigned Base0 = HII.getBaseAndOffset(L0, Offset0, Size0); + MachineOperand *BaseOp0 = HII.getBaseAndOffset(L0, Offset0, Size0); // Is the access size is longer than the L1 cache line, skip the check. - if (Base0 == 0 || Size0 >= 32) + if (BaseOp0 == nullptr || !BaseOp0->isReg() || Size0 >= 32) continue; // Scan only up to 32 instructions ahead (to avoid n^2 complexity). for (unsigned j = i+1, m = std::min(i+32, e); j != m; ++j) { @@ -288,10 +289,11 @@ void HexagonSubtarget::BankConflictMutation::apply(ScheduleDAGInstrs *DAG) { if (!L1.mayLoad() || L1.mayStore() || HII.getAddrMode(L1) != HexagonII::BaseImmOffset) continue; - int Offset1; + int64_t Offset1; unsigned Size1; - unsigned Base1 = HII.getBaseAndOffset(L1, Offset1, Size1); - if (Base1 == 0 || Size1 >= 32 || Base0 != Base1) + MachineOperand *BaseOp1 = HII.getBaseAndOffset(L1, Offset1, Size1); + if (BaseOp1 == nullptr || !BaseOp1->isReg() || Size1 >= 32 || + BaseOp0->getReg() != BaseOp1->getReg()) continue; // Check bits 3 and 4 of the offset: if they differ, a bank conflict // is unlikely. diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index eaae4db6ba90a6ce502d6abcbbc27eaecdd9b044..3a5acb53682cd2dee84e6f821745a12dea14b225 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -52,10 +52,12 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { bool UseNewValueJumps = false; bool UseNewValueStores = false; bool UseSmallData = false; + bool UseZRegOps = false; bool HasMemNoShuf = false; bool EnableDuplex = false; bool ReservedR19 = false; + bool NoreturnStackElim = false; public: Hexagon::ArchEnum HexagonArchVersion; @@ -150,6 +152,12 @@ public: bool hasV65OpsOnly() const { return getHexagonArchVersion() == Hexagon::ArchEnum::V65; } + bool hasV66Ops() const { + return getHexagonArchVersion() >= Hexagon::ArchEnum::V66; + } + bool hasV66OpsOnly() const { + return getHexagonArchVersion() == Hexagon::ArchEnum::V66; + } bool useLongCalls() const { return UseLongCalls; } bool useMemops() const { return UseMemops; } @@ -157,6 +165,7 @@ public: bool useNewValueJumps() const { return UseNewValueJumps; } bool useNewValueStores() const { return UseNewValueStores; } bool useSmallData() const { return UseSmallData; } + bool useZRegOps() const { return UseZRegOps; } bool useHVXOps() const { return HexagonHVXVersion > Hexagon::ArchEnum::NoArch; @@ -168,6 +177,8 @@ public: bool hasReservedR19() const { return ReservedR19; } bool usePredicatedCalls() const; + bool noreturnStackElim() const { return NoreturnStackElim; } + bool useBSBScheduling() const { return UseBSBScheduling; } bool enableMachineScheduler() const override; diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 184e5b774184fd502402f79bf4d3cca600e3348f..ddfda7e27793947dfa1df82d1565de16c58f2ea8 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -180,12 +180,6 @@ static Reloc::Model getEffectiveRelocModel(Optional RM) { return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - extern "C" void LLVMInitializeHexagonTarget() { // Register the target. RegisterTargetMachine X(getTheHexagonTarget()); @@ -222,7 +216,8 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, "i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-" "v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048", TT, CPU, FS, Options, getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM), (HexagonNoOpt ? CodeGenOpt::None : OL)), + getEffectiveCodeModel(CM, CodeModel::Small), + (HexagonNoOpt ? CodeGenOpt::None : OL)), TLOF(make_unique()) { initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry()); initAsmInfo(); diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 386cd14c827beaf3e6cb7e03dd8b332f5162a3cf..2185bf8eebc66d0fab86786a6e68bd5ae9748e7c 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -199,11 +199,10 @@ MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal( /// section. bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO, const TargetMachine &TM) const { - if (!isSmallDataEnabled(TM)) { - LLVM_DEBUG(dbgs() << "Small data is not available.\n"); - return false; - } - + bool HaveSData = isSmallDataEnabled(TM); + if (!HaveSData) + LLVM_DEBUG(dbgs() << "Small-data allocation is disabled, but symbols " + "may have explicit section assignments...\n"); // Only global variables, not functions. LLVM_DEBUG(dbgs() << "Checking if value is in small-data, -G" << SmallDataThreshold << ": \"" << GO->getName() << "\": "); @@ -223,6 +222,12 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO, return IsSmall; } + // If sdata is disabled, stop the checks here. + if (!HaveSData) { + LLVM_DEBUG(dbgs() << "no, small-data allocation is disabled\n"); + return false; + } + if (GVar->isConstant()) { LLVM_DEBUG(dbgs() << "no, is a constant\n"); return false; diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index cb504b5c3d5d9bbcfd0491095de8cb107ef464cf..6543d831390064346a9185984783878e5607226f 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -26,11 +26,9 @@ namespace llvm { /// instruction info tracks. namespace HexagonII { unsigned const TypeCVI_FIRST = TypeCVI_4SLOT_MPY; - unsigned const TypeCVI_LAST = TypeCVI_VX_LATE; + unsigned const TypeCVI_LAST = TypeCVI_ZW; enum SubTarget { - HasV4SubT = 0x3f, - HasV5SubT = 0x3e, HasV55SubT = 0x3c, HasV60SubT = 0x38, }; @@ -57,117 +55,117 @@ namespace HexagonII { // MCInstrDesc TSFlags // *** Must match HexagonInstrFormat*.td *** enum { - // This 5-bit field describes the insn type. - TypePos = 0, - TypeMask = 0x3f, + // This 7-bit field describes the insn type. + TypePos = 0, + TypeMask = 0x7f, // Solo instructions. - SoloPos = 6, + SoloPos = 7, SoloMask = 0x1, // Packed only with A or X-type instructions. - SoloAXPos = 7, + SoloAXPos = 8, SoloAXMask = 0x1, // Only A-type instruction in first slot or nothing. - RestrictSlot1AOKPos = 8, + RestrictSlot1AOKPos = 9, RestrictSlot1AOKMask = 0x1, // Predicated instructions. - PredicatedPos = 9, + PredicatedPos = 10, PredicatedMask = 0x1, - PredicatedFalsePos = 10, + PredicatedFalsePos = 11, PredicatedFalseMask = 0x1, - PredicatedNewPos = 11, + PredicatedNewPos = 12, PredicatedNewMask = 0x1, - PredicateLatePos = 12, + PredicateLatePos = 13, PredicateLateMask = 0x1, // New-Value consumer instructions. - NewValuePos = 13, + NewValuePos = 14, NewValueMask = 0x1, // New-Value producer instructions. - hasNewValuePos = 14, + hasNewValuePos = 15, hasNewValueMask = 0x1, // Which operand consumes or produces a new value. - NewValueOpPos = 15, + NewValueOpPos = 16, NewValueOpMask = 0x7, // Stores that can become new-value stores. - mayNVStorePos = 18, + mayNVStorePos = 19, mayNVStoreMask = 0x1, // New-value store instructions. - NVStorePos = 19, + NVStorePos = 20, NVStoreMask = 0x1, // Loads that can become current-value loads. - mayCVLoadPos = 20, + mayCVLoadPos = 21, mayCVLoadMask = 0x1, // Current-value load instructions. - CVLoadPos = 21, + CVLoadPos = 22, CVLoadMask = 0x1, // Extendable insns. - ExtendablePos = 22, + ExtendablePos = 23, ExtendableMask = 0x1, // Insns must be extended. - ExtendedPos = 23, + ExtendedPos = 24, ExtendedMask = 0x1, // Which operand may be extended. - ExtendableOpPos = 24, + ExtendableOpPos = 25, ExtendableOpMask = 0x7, // Signed or unsigned range. - ExtentSignedPos = 27, + ExtentSignedPos = 28, ExtentSignedMask = 0x1, // Number of bits of range before extending operand. - ExtentBitsPos = 28, + ExtentBitsPos = 29, ExtentBitsMask = 0x1f, // Alignment power-of-two before extending operand. - ExtentAlignPos = 33, + ExtentAlignPos = 34, ExtentAlignMask = 0x3, - CofMax1Pos = 35, + CofMax1Pos = 36, CofMax1Mask = 0x1, - CofRelax1Pos = 36, + CofRelax1Pos = 37, CofRelax1Mask = 0x1, - CofRelax2Pos = 37, + CofRelax2Pos = 38, CofRelax2Mask = 0x1, - RestrictNoSlot1StorePos = 38, + RestrictNoSlot1StorePos = 39, RestrictNoSlot1StoreMask = 0x1, // Addressing mode for load/store instructions. - AddrModePos = 41, + AddrModePos = 42, AddrModeMask = 0x7, // Access size for load/store instructions. - MemAccessSizePos = 44, + MemAccessSizePos = 45, MemAccesSizeMask = 0xf, // Branch predicted taken. - TakenPos = 48, + TakenPos = 49, TakenMask = 0x1, // Floating-point instructions. - FPPos = 49, + FPPos = 50, FPMask = 0x1, // New-Value producer-2 instructions. - hasNewValuePos2 = 51, + hasNewValuePos2 = 52, hasNewValueMask2 = 0x1, // Which operand consumes or produces a new value. - NewValueOpPos2 = 52, + NewValueOpPos2 = 53, NewValueOpMask2 = 0x7, // Accumulator instructions. - AccumulatorPos = 55, + AccumulatorPos = 56, AccumulatorMask = 0x1, // Complex XU, prevent xu competition by preferring slot3 - PrefersSlot3Pos = 56, + PrefersSlot3Pos = 57, PrefersSlot3Mask = 0x1, // v65 - HasTmpDstPos = 59, + HasTmpDstPos = 60, HasTmpDstMask = 0x1, - CVINewPos = 61, - CVINewMask = 0x1 + CVINewPos = 62, + CVINewMask = 0x1, }; // *** The code above must match HexagonInstrFormat*.td *** // @@ -176,7 +174,7 @@ namespace HexagonII { enum HexagonMOTargetFlagVal { // Hexagon-specific MachineOperand target flags. // - // When chaning these, make sure to update + // When changing these, make sure to update // getSerializableDirectMachineOperandTargetFlags and // getSerializableBitmaskMachineOperandTargetFlags if needed. MO_NO_FLAG, @@ -189,7 +187,8 @@ namespace HexagonII { MO_GOT, // Low or high part of a symbol. - MO_LO16, MO_HI16, + MO_LO16, + MO_HI16, // Offset from the base of the SDA. MO_GPREL, diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 8f3c09e7204f5407e40462f8c781dcbba6160543..92ce7345f3584c9de0d40ce065281f9537f4c5e2 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -71,6 +71,8 @@ cl::opt MV62("mv62", cl::Hidden, cl::desc("Build for Hexagon V62"), cl::init(false)); cl::opt MV65("mv65", cl::Hidden, cl::desc("Build for Hexagon V65"), cl::init(false)); +cl::opt MV66("mv66", cl::Hidden, cl::desc("Build for Hexagon V66"), + cl::init(false)); } // namespace cl::opt @@ -80,9 +82,10 @@ cl::opt clEnumValN(Hexagon::ArchEnum::V60, "v60", "Build for HVX v60"), clEnumValN(Hexagon::ArchEnum::V62, "v62", "Build for HVX v62"), clEnumValN(Hexagon::ArchEnum::V65, "v65", "Build for HVX v65"), - // Sentinal for no value specified + clEnumValN(Hexagon::ArchEnum::V66, "v66", "Build for HVX v66"), + // Sentinel for no value specified. clEnumValN(Hexagon::ArchEnum::Generic, "", "")), - // Sentinal for flag not present + // Sentinel for flag not present. cl::init(Hexagon::ArchEnum::NoArch), cl::ValueOptional); static cl::opt @@ -103,6 +106,8 @@ static StringRef HexagonGetArchVariant() { return "hexagonv62"; if (MV65) return "hexagonv65"; + if (MV66) + return "hexagonv66"; return ""; } @@ -289,11 +294,15 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) { case Hexagon::ArchEnum::V65: Result.push_back("+hvxv65"); break; + case Hexagon::ArchEnum::V66: + Result.push_back("+hvxv66"); + break; case Hexagon::ArchEnum::Generic:{ Result.push_back(StringSwitch(CPU) .Case("hexagonv60", "+hvxv60") .Case("hexagonv62", "+hvxv62") - .Case("hexagonv65", "+hvxv65")); + .Case("hexagonv65", "+hvxv65") + .Case("hexagonv66", "+hvxv66")); break; } case Hexagon::ArchEnum::NoArch: @@ -308,7 +317,7 @@ static bool isCPUValid(std::string CPU) { std::vector table { "generic", "hexagonv5", "hexagonv55", "hexagonv60", - "hexagonv62", "hexagonv65", + "hexagonv62", "hexagonv65", "hexagonv66", }; return std::find(table.begin(), table.end(), CPU) != table.end(); @@ -330,7 +339,7 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { // turns on hvxvNN, corresponding to the existing ArchVNN. FeatureBitset FB = S; unsigned CpuArch = ArchV5; - for (unsigned F : {ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) { + for (unsigned F : {ArchV66, ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) { if (!FB.test(F)) continue; CpuArch = F; @@ -344,7 +353,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { break; } bool HasHvxVer = false; - for (unsigned F : {ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65}) { + for (unsigned F : {ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, + ExtensionHVXV66}) { if (!FB.test(F)) continue; HasHvxVer = true; @@ -357,6 +367,9 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { // HasHvxVer is false, and UseHvx is true. switch (CpuArch) { + case ArchV66: + FB.set(ExtensionHVXV66); + LLVM_FALLTHROUGH; case ArchV65: FB.set(ExtensionHVXV65); LLVM_FALLTHROUGH; @@ -400,6 +413,7 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) { {"hexagonv60", ELF::EF_HEXAGON_MACH_V60}, {"hexagonv62", ELF::EF_HEXAGON_MACH_V62}, {"hexagonv65", ELF::EF_HEXAGON_MACH_V65}, + {"hexagonv66", ELF::EF_HEXAGON_MACH_V66}, }; auto F = ElfFlags.find(STI.getCPU()); diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index 59f3caa6af9407c72f2d2c3087d5c53310124670..f4ee2bbfaaaa548ba698e2fe06c9c546ddb19bdf 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -138,6 +138,8 @@ void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) { UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2); (*TUL)[HexagonII::TypeCVI_SCATTER_NEW_ST] = UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_4SLOT_MPY] = UnitsAndLanes(CVI_XLANE, 4); + (*TUL)[HexagonII::TypeCVI_ZW] = UnitsAndLanes(CVI_ZW, 1); } HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL, @@ -300,6 +302,7 @@ bool HexagonShuffler::check() { // Number of memory operations, loads, solo loads, stores, solo stores, single // stores. unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0; + unsigned NonZCVIloads = 0, AllCVIloads = 0, CVIstores = 0; // Number of duplex insns unsigned duplex = 0; unsigned pSlot3Cnt = 0; @@ -331,6 +334,11 @@ bool HexagonShuffler::check() { case HexagonII::TypeCVI_VM_TMP_LD: case HexagonII::TypeCVI_GATHER: case HexagonII::TypeCVI_GATHER_RST: + ++NonZCVIloads; + LLVM_FALLTHROUGH; + case HexagonII::TypeCVI_ZW: + ++AllCVIloads; + LLVM_FALLTHROUGH; case HexagonII::TypeLD: ++loads; ++memory; @@ -348,6 +356,8 @@ bool HexagonShuffler::check() { case HexagonII::TypeCVI_SCATTER_RST: case HexagonII::TypeCVI_SCATTER_NEW_RST: case HexagonII::TypeCVI_SCATTER_NEW_ST: + ++CVIstores; + LLVM_FALLTHROUGH; case HexagonII::TypeST: ++stores; ++memory; @@ -405,7 +415,11 @@ bool HexagonShuffler::check() { applySlotRestrictions(); // Check if the packet is legal. - if ((load0 > 1 || store0 > 1) || (duplex > 1 || (duplex && memory))) { + const unsigned ZCVIloads = AllCVIloads - NonZCVIloads; + const bool ValidHVXMem = + NonZCVIloads <= 1 && ZCVIloads <= 1 && CVIstores <= 1; + if ((load0 > 1 || store0 > 1 || !ValidHVXMem) || + (duplex > 1 || (duplex && memory))) { reportError(llvm::Twine("invalid instruction packet")); return false; } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h index 37f90bc46ac70cfa6903aee169a5ee7bc425561a..ef50c5bebbfb5e6dbe0e328dc3730a7c9e98be27 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h @@ -75,7 +75,8 @@ private: CVI_XLANE = 1 << 0, CVI_SHIFT = 1 << 1, CVI_MPY0 = 1 << 2, - CVI_MPY1 = 1 << 3 + CVI_MPY1 = 1 << 3, + CVI_ZW = 1 << 4 }; // Count of adjacent slots that the insn requires to be executed. diff --git a/lib/Target/Lanai/LanaiInstrInfo.cpp b/lib/Target/Lanai/LanaiInstrInfo.cpp index a18352738e132a2ad75471e4d884ec32ec3f505d..196768fdc56a3f01594c4621754329543a78febd 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -101,12 +101,12 @@ bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint( // the width doesn't overlap the offset of a higher memory access, // then the memory accesses are different. const TargetRegisterInfo *TRI = &getRegisterInfo(); - unsigned BaseRegA = 0, BaseRegB = 0; + MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; int64_t OffsetA = 0, OffsetB = 0; unsigned int WidthA = 0, WidthB = 0; - if (getMemOpBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) && - getMemOpBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) { - if (BaseRegA == BaseRegB) { + if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) && + getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) { + if (BaseOpA->isIdenticalTo(*BaseOpB)) { int LowOffset = std::min(OffsetA, OffsetB); int HighOffset = std::max(OffsetA, OffsetB); int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; @@ -755,9 +755,9 @@ unsigned LanaiInstrInfo::isStoreToStackSlot(const MachineInstr &MI, return 0; } -bool LanaiInstrInfo::getMemOpBaseRegImmOfsWidth( - MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width, - const TargetRegisterInfo * /*TRI*/) const { +bool LanaiInstrInfo::getMemOperandWithOffsetWidth( + MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset, + unsigned &Width, const TargetRegisterInfo * /*TRI*/) const { // Handle only loads/stores with base register followed by immediate offset // and with add as ALU op. if (LdSt.getNumOperands() != 4) @@ -787,14 +787,17 @@ bool LanaiInstrInfo::getMemOpBaseRegImmOfsWidth( break; } - BaseReg = LdSt.getOperand(1).getReg(); + BaseOp = &LdSt.getOperand(1); Offset = LdSt.getOperand(2).getImm(); + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } -bool LanaiInstrInfo::getMemOpBaseRegImmOfs( - MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, - const TargetRegisterInfo *TRI) const { +bool LanaiInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, + MachineOperand *&BaseOp, + int64_t &Offset, + const TargetRegisterInfo *TRI) const { switch (LdSt.getOpcode()) { default: return false; @@ -808,6 +811,6 @@ bool LanaiInstrInfo::getMemOpBaseRegImmOfs( case Lanai::LDBs_RI: case Lanai::LDBz_RI: unsigned Width; - return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI); + return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI); } } diff --git a/lib/Target/Lanai/LanaiInstrInfo.h b/lib/Target/Lanai/LanaiInstrInfo.h index fe22fde2470b76c14e39b9b109326414c407fc93..bdcf9a361b5f0a4158384d4aa5d78cfcb5ed06df 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.h +++ b/lib/Target/Lanai/LanaiInstrInfo.h @@ -68,13 +68,13 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; - bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, - int64_t &Offset, - const TargetRegisterInfo *TRI) const override; + bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp, + int64_t &Offset, + const TargetRegisterInfo *TRI) const override; - bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg, - int64_t &Offset, unsigned &Width, - const TargetRegisterInfo *TRI) const; + bool getMemOperandWithOffsetWidth(MachineInstr &LdSt, MachineOperand *&BaseOp, + int64_t &Offset, unsigned &Width, + const TargetRegisterInfo *TRI) const; std::pair decomposeMachineOperandsTargetFlags(unsigned TF) const override; diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp index 2c21a53b13bb5719886c1b6f6ad7e9b51d686738..10bd9e2c65d2f20437d5a62c0c5da0d6c790bf4c 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -53,12 +53,6 @@ static Reloc::Model getEffectiveRelocModel(Optional RM) { return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { - if (CM) - return *CM; - return CodeModel::Medium; -} - LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT, StringRef Cpu, StringRef FeatureString, const TargetOptions &Options, @@ -67,7 +61,8 @@ LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OptLevel, bool JIT) : LLVMTargetMachine(T, computeDataLayout(), TT, Cpu, FeatureString, Options, getEffectiveRelocModel(RM), - getEffectiveCodeModel(CodeModel), OptLevel), + getEffectiveCodeModel(CodeModel, CodeModel::Medium), + OptLevel), Subtarget(TT, Cpu, FeatureString, *this, Options, getCodeModel(), OptLevel), TLOF(new LanaiTargetObjectFile()) { diff --git a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index 3f7d1860e9a9388d88dedc1dab8aed6c7569d928..3cc6da2c21aaf8d6912853ff3c46ad152c4f9f27 100644 --- a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -30,7 +30,9 @@ #define DEBUG_TYPE "msp430-asm-parser" -namespace llvm { +using namespace llvm; + +namespace { /// Parses MSP430 assembly from a stream. class MSP430AsmParser : public MCTargetAsmParser { @@ -49,6 +51,7 @@ class MSP430AsmParser : public MCTargetAsmParser { SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; + bool ParseDirectiveRefSym(AsmToken DirectiveID); unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; @@ -244,6 +247,7 @@ public: } } }; +} // end anonymous namespace bool MSP430AsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode, OperandVector &Operands, @@ -404,6 +408,16 @@ bool MSP430AsmParser::ParseInstruction(ParseInstructionInfo &Info, return false; } +bool MSP430AsmParser::ParseDirectiveRefSym(AsmToken DirectiveID) { + StringRef Name; + if (getParser().parseIdentifier(Name)) + return TokError("expected identifier in directive"); + + MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + getStreamer().EmitSymbolAttribute(Sym, MCSA_Global); + return false; +} + bool MSP430AsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getIdentifier(); if (IDVal.lower() == ".long") { @@ -412,6 +426,8 @@ bool MSP430AsmParser::ParseDirective(AsmToken DirectiveID) { ParseLiteralValues(2, DirectiveID.getLoc()); } else if (IDVal.lower() == ".byte") { ParseLiteralValues(1, DirectiveID.getLoc()); + } else if (IDVal.lower() == ".refsym") { + return ParseDirectiveRefSym(DirectiveID); } return true; } @@ -558,5 +574,3 @@ unsigned MSP430AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, return Match_InvalidOperand; } - -} // end of namespace llvm diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp index 30d077b5b58831f1b200bc136364f1f395e2ef25..e47db2400a05345944bb9f6c97aaca4037a318d2 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp @@ -34,7 +34,7 @@ protected: // Translate fixup kind to ELF relocation type. switch ((unsigned)Fixup.getKind()) { case FK_Data_1: return ELF::R_MSP430_8; - case FK_Data_2: return ELF::R_MSP430_16; + case FK_Data_2: return ELF::R_MSP430_16_BYTE; case FK_Data_4: return ELF::R_MSP430_32; case MSP430::fixup_32: return ELF::R_MSP430_32; case MSP430::fixup_10_pcrel: return ELF::R_MSP430_10_PCREL; diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp index ba9f7d7a9a5e86ee6be15f2a8ed4f61fd5caa33a..adf2384f6e99515bdae8300b11b4516172e79bc1 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp @@ -91,12 +91,11 @@ void MSP430MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, Offset = 2; uint64_t BinaryOpCode = getBinaryCodeForInstr(MI, Fixups, STI); - const uint16_t *Words = reinterpret_cast(&BinaryOpCode); size_t WordCount = Size / 2; - for (size_t i = 0; i < WordCount; ++i) { - uint16_t Word = Words[i]; - support::endian::write(OS, Word, support::little); + while (WordCount--) { + support::endian::write(OS, (uint16_t)BinaryOpCode, support::little); + BinaryOpCode >>= 16; } } diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index ac93d7efc2b97a3ebeb833c45a2a67ead08d9ff4..73c8793aa01c08d337e8b4e7219aecc5fac403a1 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -217,8 +217,6 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, // { RTLIB::NEG_F64, "__mspabi_negd", ISD::SETCC_INVALID }, // { RTLIB::NEG_F32, "__mspabi_negf", ISD::SETCC_INVALID }, - // TODO: SLL/SRA/SRL are in libgcc, RLL isn't - // Universal Integer Operations - EABI Table 9 { RTLIB::SDIV_I16, "__mspabi_divi", ISD::SETCC_INVALID }, { RTLIB::SDIV_I32, "__mspabi_divli", ISD::SETCC_INVALID }, @@ -233,6 +231,13 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, { RTLIB::UREM_I32, "__mspabi_remul", ISD::SETCC_INVALID }, { RTLIB::UREM_I64, "__mspabi_remull", ISD::SETCC_INVALID }, + // Bitwise Operations - EABI Table 10 + // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc + { RTLIB::SRL_I32, "__mspabi_srll", ISD::SETCC_INVALID }, + { RTLIB::SRA_I32, "__mspabi_sral", ISD::SETCC_INVALID }, + { RTLIB::SHL_I32, "__mspabi_slll", ISD::SETCC_INVALID }, + // __mspabi_[srlll/srall/sllll/rlli/rlll] are NOT implemented in libgcc + }; for (const auto &LC : LibraryCalls) { @@ -945,10 +950,20 @@ SDValue MSP430TargetLowering::LowerShifts(SDValue Op, uint64_t ShiftAmount = cast(N->getOperand(1))->getZExtValue(); // Expand the stuff into sequence of shifts. - // FIXME: for some shift amounts this might be done better! - // E.g.: foo >> (8 + N) => sxt(swpb(foo)) >> N SDValue Victim = N->getOperand(0); + if ((Opc == ISD::SRA || Opc == ISD::SRL) && ShiftAmount >= 8) { + // foo >> (8 + N) => sxt(swpb(foo)) >> N + assert(VT == MVT::i16 && "Can not shift i8 by 8 and more"); + Victim = DAG.getNode(ISD::BSWAP, dl, VT, Victim); + if (Opc == ISD::SRA) + Victim = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Victim, + DAG.getValueType(MVT::i8)); + else + Victim = DAG.getZeroExtendInReg(Victim, dl, MVT::i8); + ShiftAmount -= 8; + } + if (Opc == ISD::SRL && ShiftAmount) { // Emit a special goodness here: // srl A, 1 => clrc; rrc A diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index 01f44e266d7bc10b0cc65ab7965821a648820ed2..9f6ebba75ec66d7f032eaeddee1c9514b20e243b 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -32,12 +32,6 @@ static Reloc::Model getEffectiveRelocModel(Optional RM) { return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - static std::string computeDataLayout(const Triple &TT, StringRef CPU, const TargetOptions &Options) { return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"; @@ -51,7 +45,7 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL, bool JIT) : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS, Options, getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM), OL), + getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 79e0c001a636cfdd6ca71cac52e015cbb1b58f87..78dfed68b9228fbab51a90e23ad92613017a2490 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -39,6 +39,7 @@ #include "llvm/MC/MCValue.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -64,6 +65,11 @@ class MCInstrInfo; } // end namespace llvm +static cl::opt +EmitJalrReloc("mips-jalr-reloc", cl::Hidden, + cl::desc("MIPS: Emit R_{MICRO}MIPS_JALR relocation with jalr"), + cl::init(true)); + namespace { class MipsAssemblerOptions { @@ -2065,9 +2071,21 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, JalrInst.addOperand(MCOperand::createReg(Mips::RA)); JalrInst.addOperand(MCOperand::createReg(Mips::T9)); - // FIXME: Add an R_(MICRO)MIPS_JALR relocation after the JALR. - // This relocation is supposed to be an optimization hint for the linker - // and is not necessary for correctness. + if (EmitJalrReloc) { + // As an optimization hint for the linker, before the JALR we add: + // .reloc tmplabel, R_{MICRO}MIPS_JALR, symbol + // tmplabel: + MCSymbol *TmpLabel = getContext().createTempSymbol(); + const MCExpr *TmpExpr = MCSymbolRefExpr::create(TmpLabel, getContext()); + const MCExpr *RelocJalrExpr = + MCSymbolRefExpr::create(JalSym, MCSymbolRefExpr::VK_None, + getContext(), IDLoc); + + TOut.getStreamer().EmitRelocDirective(*TmpExpr, + inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR", + RelocJalrExpr, IDLoc, *STI); + TOut.getStreamer().EmitLabel(TmpLabel); + } Inst = JalrInst; ExpandedJalSym = true; diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 63f9151da6babaa50cb606a1d1fe1f22352558b4..265d1141cb0b22837bd5576e28a886a603cd3bca 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -339,6 +339,8 @@ Optional MipsAsmBackend::getFixupKind(StringRef Name) const { (MCFixupKind)Mips::fixup_MICROMIPS_TLS_TPREL_HI16) .Case("R_MICROMIPS_TLS_TPREL_LO16", (MCFixupKind)Mips::fixup_MICROMIPS_TLS_TPREL_LO16) + .Case("R_MIPS_JALR", (MCFixupKind)Mips::fixup_Mips_JALR) + .Case("R_MICROMIPS_JALR", (MCFixupKind)Mips::fixup_MICROMIPS_JALR) .Default(MCAsmBackend::getFixupKind(Name)); } @@ -417,7 +419,9 @@ getFixupKindInfo(MCFixupKind Kind) const { { "fixup_MICROMIPS_TLS_TPREL_HI16", 0, 16, 0 }, { "fixup_MICROMIPS_TLS_TPREL_LO16", 0, 16, 0 }, { "fixup_Mips_SUB", 0, 64, 0 }, - { "fixup_MICROMIPS_SUB", 0, 64, 0 } + { "fixup_MICROMIPS_SUB", 0, 64, 0 }, + { "fixup_Mips_JALR", 0, 32, 0 }, + { "fixup_MICROMIPS_JALR", 0, 32, 0 } }; static_assert(array_lengthof(LittleEndianInfos) == Mips::NumTargetFixupKinds, "Not all MIPS little endian fixup kinds added!"); @@ -495,7 +499,9 @@ getFixupKindInfo(MCFixupKind Kind) const { { "fixup_MICROMIPS_TLS_TPREL_HI16", 16, 16, 0 }, { "fixup_MICROMIPS_TLS_TPREL_LO16", 16, 16, 0 }, { "fixup_Mips_SUB", 0, 64, 0 }, - { "fixup_MICROMIPS_SUB", 0, 64, 0 } + { "fixup_MICROMIPS_SUB", 0, 64, 0 }, + { "fixup_Mips_JALR", 0, 32, 0 }, + { "fixup_MICROMIPS_JALR", 0, 32, 0 } }; static_assert(array_lengthof(BigEndianInfos) == Mips::NumTargetFixupKinds, "Not all MIPS big endian fixup kinds added!"); @@ -553,6 +559,7 @@ bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm, case Mips::fixup_Mips_TLSLDM: case Mips::fixup_Mips_TPREL_HI: case Mips::fixup_Mips_TPREL_LO: + case Mips::fixup_Mips_JALR: case Mips::fixup_MICROMIPS_CALL16: case Mips::fixup_MICROMIPS_GOT_DISP: case Mips::fixup_MICROMIPS_GOT_PAGE: @@ -565,6 +572,7 @@ bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm, case Mips::fixup_MICROMIPS_TLS_LDM: case Mips::fixup_MICROMIPS_TLS_TPREL_HI16: case Mips::fixup_MICROMIPS_TLS_TPREL_LO16: + case Mips::fixup_MICROMIPS_JALR: return true; } } diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index c600382446a84c97169cdc54f705536db53e331f..0e8de0577a8a93361be17187cfb0cfaf5f410ae7 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -401,6 +401,10 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_MICROMIPS_HIGHER; case Mips::fixup_MICROMIPS_HIGHEST: return ELF::R_MICROMIPS_HIGHEST; + case Mips::fixup_Mips_JALR: + return ELF::R_MIPS_JALR; + case Mips::fixup_MICROMIPS_JALR: + return ELF::R_MICROMIPS_JALR; } llvm_unreachable("invalid fixup kind!"); diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index d7f6cf91db73fc0ad9b9920f3e859303a7cb7a38..eedad16dddc30686de2db04367adcbd13289329d 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -222,6 +222,10 @@ namespace Mips { fixup_Mips_SUB, fixup_MICROMIPS_SUB, + // resulting in - R_MIPS_JALR/R_MICROMIPS_JALR + fixup_Mips_JALR, + fixup_MICROMIPS_JALR, + // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp index 7bcda12051a515a7ea1c11b9d6b2dc1f38c4417c..1506b4a83649ea6fada9e31198324d9e8ec4cf09 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -49,25 +49,5 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) { ExceptionsType = ExceptionHandling::DwarfCFI; DwarfRegNumForCFI = true; HasMipsExpressions = true; - - // Enable IAS by default for O32. - if (TheTriple.isMIPS32()) - UseIntegratedAssembler = true; - - // Enable IAS by default for Debian mips64/mips64el. - if (TheTriple.getEnvironment() == Triple::GNUABI64) - UseIntegratedAssembler = true; - - // Enable IAS by default for Debian mipsn32/mipsn32el. - if (TheTriple.getEnvironment() == Triple::GNUABIN32) - UseIntegratedAssembler = true; - - // Enable IAS by default for Android mips64el that uses N64 ABI. - if (TheTriple.getArch() == Triple::mips64el && TheTriple.isAndroid()) - UseIntegratedAssembler = true; - - // Enable IAS by default for FreeBSD / OpenBSD mips64/mips64el. - if (TheTriple.isOSFreeBSD() || - TheTriple.isOSOpenBSD()) - UseIntegratedAssembler = true; + UseIntegratedAssembler = true; } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 8e880b635c15d03f3a77e887ebad24f72c410f4a..f43a4d980f92aac74211faa9ed6320f9e9ec3537 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -613,6 +613,9 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl &Fixups, case MipsMCExpr::MEK_Special: llvm_unreachable("Unhandled fixup kind!"); break; + case MipsMCExpr::MEK_DTPREL: + llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only"); + break; case MipsMCExpr::MEK_CALL_HI16: FixupKind = Mips::fixup_Mips_CALL_HI16; break; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp index 0bddba781453003840b36a23d68c7ad5289229e2..99857e083c6ca880964945fb67c0ad3ceeb6022a 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp @@ -43,6 +43,9 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { case MEK_Special: llvm_unreachable("MEK_None and MEK_Special are invalid"); break; + case MEK_DTPREL: + llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only"); + break; case MEK_CALL_HI16: OS << "%call_hi"; break; @@ -157,6 +160,8 @@ MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res, case MEK_None: case MEK_Special: llvm_unreachable("MEK_None and MEK_Special are invalid"); + case MEK_DTPREL: + llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only"); case MEK_DTPREL_HI: case MEK_DTPREL_LO: case MEK_GOT: @@ -244,6 +249,9 @@ void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { case MEK_Special: llvm_unreachable("MEK_None and MEK_Special are invalid"); break; + case MEK_DTPREL: + llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only"); + break; case MEK_CALL_HI16: case MEK_CALL_LO16: case MEK_GOT: diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h index 495d525ccff42d92f7c76a83737d144f7c18570e..bf3274ab5d173196bc1a1c01ae1f91d891c08137 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h @@ -22,6 +22,7 @@ public: MEK_None, MEK_CALL_HI16, MEK_CALL_LO16, + MEK_DTPREL, MEK_DTPREL_HI, MEK_DTPREL_LO, MEK_GOT, diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 16a2481a00d8996d1ac91c9a22482112a1f0695a..362431fd42a6c0a1910b07c0876290e1688c2577 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -1204,18 +1204,23 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI, // Emit .dtprelword or .dtpreldword directive // and value for debug thread local expression. -void MipsAsmPrinter::EmitDebugThreadLocal(const MCExpr *Value, - unsigned Size) const { - switch (Size) { - case 4: - OutStreamer->EmitDTPRel32Value(Value); - break; - case 8: - OutStreamer->EmitDTPRel64Value(Value); - break; - default: - llvm_unreachable("Unexpected size of expression value."); +void MipsAsmPrinter::EmitDebugValue(const MCExpr *Value, unsigned Size) const { + if (auto *MipsExpr = dyn_cast(Value)) { + if (MipsExpr && MipsExpr->getKind() == MipsMCExpr::MEK_DTPREL) { + switch (Size) { + case 4: + OutStreamer->EmitDTPRel32Value(MipsExpr->getSubExpr()); + break; + case 8: + OutStreamer->EmitDTPRel64Value(MipsExpr->getSubExpr()); + break; + default: + llvm_unreachable("Unexpected size of expression value."); + } + return; + } } + AsmPrinter::EmitDebugValue(Value, Size); } // Align all targets of indirect branches on bundle size. Used only if target diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h index 999b6f896baed110ceed3408c2192a76d03dd9f3..eb58234e3e77b6b15cbf7df22feac82ca6b4f4bc 100644 --- a/lib/Target/Mips/MipsAsmPrinter.h +++ b/lib/Target/Mips/MipsAsmPrinter.h @@ -160,7 +160,7 @@ public: void EmitStartOfAsmFile(Module &M) override; void EmitEndOfAsmFile(Module &M) override; void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - void EmitDebugThreadLocal(const MCExpr *Value, unsigned Size) const override; + void EmitDebugValue(const MCExpr *Value, unsigned Size) const override; }; } // end namespace llvm diff --git a/lib/Target/Mips/MipsInstructionSelector.cpp b/lib/Target/Mips/MipsInstructionSelector.cpp index fbc15649c8a3da62adcab66f77e03e5a27ec28fb..4964ca9cbe15048f4553029f149461a0dae96e0b 100644 --- a/lib/Target/Mips/MipsInstructionSelector.cpp +++ b/lib/Target/Mips/MipsInstructionSelector.cpp @@ -145,6 +145,33 @@ bool MipsInstructionSelector::select(MachineInstr &I, .addMemOperand(*I.memoperands_begin()); break; } + case G_UDIV: + case G_UREM: + case G_SDIV: + case G_SREM: { + unsigned HILOReg = MRI.createVirtualRegister(&Mips::ACC64RegClass); + bool IsSigned = I.getOpcode() == G_SREM || I.getOpcode() == G_SDIV; + bool IsDiv = I.getOpcode() == G_UDIV || I.getOpcode() == G_SDIV; + + MachineInstr *PseudoDIV, *PseudoMove; + PseudoDIV = BuildMI(MBB, I, I.getDebugLoc(), + TII.get(IsSigned ? Mips::PseudoSDIV : Mips::PseudoUDIV)) + .addDef(HILOReg) + .add(I.getOperand(1)) + .add(I.getOperand(2)); + if (!constrainSelectedInstRegOperands(*PseudoDIV, TII, TRI, RBI)) + return false; + + PseudoMove = BuildMI(MBB, I, I.getDebugLoc(), + TII.get(IsDiv ? Mips::PseudoMFLO : Mips::PseudoMFHI)) + .addDef(I.getOperand(0).getReg()) + .addUse(HILOReg); + if (!constrainSelectedInstRegOperands(*PseudoMove, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; + } case G_CONSTANT: { int Imm = I.getOperand(1).getCImm()->getValue().getLimitedValue(); unsigned LUiReg = MRI.createVirtualRegister(&Mips::GPR32RegClass); @@ -258,8 +285,8 @@ bool MipsInstructionSelector::select(MachineInstr &I, MachineIRBuilder B(I); for (const struct Instr &Instruction : Instructions) { - MachineInstrBuilder MIB = - B.buildInstr(Instruction.Opcode, Instruction.Def, Instruction.LHS); + MachineInstrBuilder MIB = B.buildInstr( + Instruction.Opcode, {Instruction.Def}, {Instruction.LHS}); if (Instruction.hasImm()) MIB.addImm(Instruction.RHS); diff --git a/lib/Target/Mips/MipsLegalizerInfo.cpp b/lib/Target/Mips/MipsLegalizerInfo.cpp index 02701f31e32947e4952280118350e2fd7058cece..0d80bd479d5eb7397e3fe592a45acc3863c5a9da 100644 --- a/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -20,29 +20,40 @@ using namespace llvm; MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { using namespace TargetOpcode; + const LLT s1 = LLT::scalar(1); const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); const LLT p0 = LLT::pointer(0, 32); getActionDefinitionsBuilder(G_ADD) .legalFor({s32}) - .minScalar(0, s32) - .customFor({s64}); + .clampScalar(0, s32, s32); + + getActionDefinitionsBuilder(G_UADDE) + .lowerFor({{s32, s1}}); getActionDefinitionsBuilder({G_LOAD, G_STORE}) .legalForCartesianProduct({p0, s32}, {p0}); - getActionDefinitionsBuilder({G_AND, G_OR, G_XOR, G_SHL, G_ASHR, G_LSHR}) + getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) + .legalFor({s32}) + .clampScalar(0, s32, s32); + + getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) .legalFor({s32}); + getActionDefinitionsBuilder({G_SDIV, G_SREM, G_UREM, G_UDIV}) + .legalFor({s32}) + .minScalar(0, s32) + .libcallFor({s64}); + getActionDefinitionsBuilder(G_ICMP) .legalFor({{s32, s32}}) .minScalar(0, s32); getActionDefinitionsBuilder(G_CONSTANT) .legalFor({s32}) - .minScalar(0, s32) - .customFor({s64}); + .clampScalar(0, s32, s32); getActionDefinitionsBuilder(G_GEP) .legalFor({{p0, s32}}); @@ -59,64 +70,12 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { bool MipsLegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { using namespace TargetOpcode; MIRBuilder.setInstr(MI); - switch (MI.getOpcode()) { - case G_ADD: { - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - - const LLT sHalf = LLT::scalar(Size / 2); - - unsigned RHSLow = MRI.createGenericVirtualRegister(sHalf); - unsigned RHSHigh = MRI.createGenericVirtualRegister(sHalf); - unsigned LHSLow = MRI.createGenericVirtualRegister(sHalf); - unsigned LHSHigh = MRI.createGenericVirtualRegister(sHalf); - unsigned ResLow = MRI.createGenericVirtualRegister(sHalf); - unsigned ResHigh = MRI.createGenericVirtualRegister(sHalf); - unsigned Carry = MRI.createGenericVirtualRegister(sHalf); - unsigned TmpResHigh = MRI.createGenericVirtualRegister(sHalf); - - MIRBuilder.buildUnmerge({RHSLow, RHSHigh}, MI.getOperand(2).getReg()); - MIRBuilder.buildUnmerge({LHSLow, LHSHigh}, MI.getOperand(1).getReg()); - - MIRBuilder.buildAdd(TmpResHigh, LHSHigh, RHSHigh); - MIRBuilder.buildAdd(ResLow, LHSLow, RHSLow); - MIRBuilder.buildICmp(CmpInst::ICMP_ULT, Carry, ResLow, LHSLow); - MIRBuilder.buildAdd(ResHigh, TmpResHigh, Carry); - - MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResLow, ResHigh}); - - MI.eraseFromParent(); - break; - } - case G_CONSTANT: { - - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - const LLT sHalf = LLT::scalar(Size / 2); - - const APInt &CImmValue = MI.getOperand(1).getCImm()->getValue(); - - unsigned ResLow = MRI.createGenericVirtualRegister(sHalf); - unsigned ResHigh = MRI.createGenericVirtualRegister(sHalf); - MIRBuilder.buildConstant( - ResLow, *ConstantInt::get(MI.getMF()->getFunction().getContext(), - CImmValue.trunc(Size / 2))); - MIRBuilder.buildConstant( - ResHigh, *ConstantInt::get(MI.getMF()->getFunction().getContext(), - CImmValue.lshr(Size / 2).trunc(Size / 2))); - - MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResLow, ResHigh}); - - MI.eraseFromParent(); - break; - } - default: - return false; - } - - return true; + return false; } diff --git a/lib/Target/Mips/MipsLegalizerInfo.h b/lib/Target/Mips/MipsLegalizerInfo.h index 96d95746ac879841a3161fd84c704812075df3de..75fadd6cf6139e179ce43163d46e2aeba0db390c 100644 --- a/lib/Target/Mips/MipsLegalizerInfo.h +++ b/lib/Target/Mips/MipsLegalizerInfo.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_MIPS_MIPSMACHINELEGALIZER_H #define LLVM_LIB_TARGET_MIPS_MIPSMACHINELEGALIZER_H +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" namespace llvm { @@ -26,7 +27,8 @@ public: MipsLegalizerInfo(const MipsSubtarget &ST); bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const override; + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const override; }; } // end namespace llvm #endif diff --git a/lib/Target/Mips/MipsRegisterBankInfo.cpp b/lib/Target/Mips/MipsRegisterBankInfo.cpp index a0c80048c024c735266fab33a6f97f1c3abc8d3c..53b9e42dc63a80070bc4047a6b575c301747ba88 100644 --- a/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -93,6 +93,10 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_SHL: case G_ASHR: case G_LSHR: + case G_SDIV: + case G_UDIV: + case G_SREM: + case G_UREM: OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx]; break; case G_CONSTANT: diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index d745ce00149011b7b850a8d523aea8d64f6160c7..a78e544c35f01b8a5a116f18b0e7967e09b084c3 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -2360,24 +2360,6 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op, } } -/// Check if the given BuildVectorSDNode is a splat. -/// This method currently relies on DAG nodes being reused when equivalent, -/// so it's possible for this to return false even when isConstantSplat returns -/// true. -static bool isSplatVector(const BuildVectorSDNode *N) { - unsigned int nOps = N->getNumOperands(); - assert(nOps > 1 && "isSplatVector has 0 or 1 sized build vector"); - - SDValue Operand0 = N->getOperand(0); - - for (unsigned int i = 1; i < nOps; ++i) { - if (N->getOperand(i) != Operand0) - return false; - } - - return true; -} - // Lower ISD::EXTRACT_VECTOR_ELT into MipsISD::VEXTRACT_SEXT_ELT. // // The non-value bits resulting from ISD::EXTRACT_VECTOR_ELT are undefined. We @@ -2488,7 +2470,7 @@ SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op, Result = DAG.getNode(ISD::BITCAST, SDLoc(Node), ResTy, Result); return Result; - } else if (isSplatVector(Node)) + } else if (DAG.isSplatValue(Op, /* AllowUndefs */ false)) return Op; else if (!isConstantOrUndefBUILD_VECTOR(Node)) { // Use INSERT_VECTOR_ELT operations rather than expand to stores. diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index c8df44d9f0b2c79479092d8d34a52d7a201b3617..9cc91d302893d94994e62e1f0d4be1bc1037b5bc 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -101,12 +101,6 @@ static Reloc::Model getEffectiveRelocModel(bool JIT, return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - // On function prologue, the stack is created by decrementing // its pointer. Once decremented, all references are done with positive // offset from the stack/frame pointer, using StackGrowsUp enables @@ -121,7 +115,7 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT, bool isLittle) : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT, CPU, FS, Options, getEffectiveRelocModel(JIT, RM), - getEffectiveCodeModel(CM), OL), + getEffectiveCodeModel(CM, CodeModel::Small), OL), isLittle(isLittle), TLOF(llvm::make_unique()), ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)), Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this, diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp index f767c83219883e03f90ea4bedcc57ec53885677c..f53ee0631b5e659448d994b3ee53b41846748984 100644 --- a/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -10,6 +10,7 @@ #include "MipsTargetObjectFile.h" #include "MipsSubtarget.h" #include "MipsTargetMachine.h" +#include "MCTargetDesc/MipsMCExpr.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -189,6 +190,7 @@ const MCExpr * MipsTargetObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const { const MCExpr *Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); - return MCBinaryExpr::createAdd( + Expr = MCBinaryExpr::createAdd( Expr, MCConstantExpr::create(0x8000, getContext()), getContext()); + return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL, Expr, getContext()); } diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index aec0d7db81a8c715052dc58e59837323bfdfca8f..63e227a12edc06b479d324e95c8cc1ace62d66e6 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -219,11 +219,12 @@ void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) { return; } + const NVPTXSubtarget &STI = MI->getMF()->getSubtarget(); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); MCOperand MCOp; - if (!nvptxSubtarget->hasImageHandles()) { + if (!STI.hasImageHandles()) { if (lowerImageHandleOperand(MI, i, MCOp)) { OutMI.addOperand(MCOp); continue; @@ -329,11 +330,12 @@ MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) { void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { const DataLayout &DL = getDataLayout(); - const TargetLowering *TLI = nvptxSubtarget->getTargetLowering(); + const NVPTXSubtarget &STI = TM.getSubtarget(*F); + const TargetLowering *TLI = STI.getTargetLowering(); Type *Ty = F->getReturnType(); - bool isABI = (nvptxSubtarget->getSmVersion() >= 20); + bool isABI = (STI.getSmVersion() >= 20); if (Ty->getTypeID() == Type::VoidTyID) return; @@ -474,7 +476,6 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { } bool NVPTXAsmPrinter::runOnMachineFunction(MachineFunction &F) { - nvptxSubtarget = &F.getSubtarget(); bool Result = AsmPrinter::runOnMachineFunction(F); // Emit closing brace for the body of function F. // The closing brace must be emitted here because we need to emit additional @@ -508,8 +509,9 @@ void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { OutStreamer->AddComment(Twine("implicit-def: ") + getVirtualRegisterName(RegNo)); } else { + const NVPTXSubtarget &STI = MI->getMF()->getSubtarget(); OutStreamer->AddComment(Twine("implicit-def: ") + - nvptxSubtarget->getRegisterInfo()->getName(RegNo)); + STI.getRegisterInfo()->getName(RegNo)); } OutStreamer->AddBlankLine(); } @@ -1431,12 +1433,14 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { const DataLayout &DL = getDataLayout(); const AttributeList &PAL = F->getAttributes(); - const TargetLowering *TLI = nvptxSubtarget->getTargetLowering(); + const NVPTXSubtarget &STI = TM.getSubtarget(*F); + const TargetLowering *TLI = STI.getTargetLowering(); Function::const_arg_iterator I, E; unsigned paramIndex = 0; bool first = true; bool isKernelFunc = isKernelFunction(*F); - bool isABI = (nvptxSubtarget->getSmVersion() >= 20); + bool isABI = (STI.getSmVersion() >= 20); + bool hasImageHandles = STI.hasImageHandles(); MVT thePointerTy = TLI->getPointerTy(DL); if (F->arg_empty()) { @@ -1460,7 +1464,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { if (isImage(*I)) { std::string sname = I->getName(); if (isImageWriteOnly(*I) || isImageReadWrite(*I)) { - if (nvptxSubtarget->hasImageHandles()) + if (hasImageHandles) O << "\t.param .u64 .ptr .surfref "; else O << "\t.param .surfref "; @@ -1468,7 +1472,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { O << "_param_" << paramIndex; } else { // Default image is read_only - if (nvptxSubtarget->hasImageHandles()) + if (hasImageHandles) O << "\t.param .u64 .ptr .texref "; else O << "\t.param .texref "; @@ -1476,7 +1480,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { O << "_param_" << paramIndex; } } else { - if (nvptxSubtarget->hasImageHandles()) + if (hasImageHandles) O << "\t.param .u64 .ptr .samplerref "; else O << "\t.param .samplerref "; diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index efe98003b1c8f80e2204852926128b7d78fd4b5a..44a09f5fe513039be490bbcd3f2b914b184d3a6b 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -258,9 +258,6 @@ private: typedef DenseMap VRegRCMap; VRegRCMap VRegMapping; - // Cache the subtarget here. - const NVPTXSubtarget *nvptxSubtarget; - // List of variables demoted to a function scope. std::map> localDecls; diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index c352b9b9c9dc66a9f4f76649056e2a1b87d1dcff..5c16c34e21d8551f1de742ebe6724260e9d0c190 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -180,6 +180,18 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, return; } + // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs. + if (StructType *STy = dyn_cast(Ty)) { + auto const *SL = DL.getStructLayout(STy); + auto ElementNum = 0; + for(auto *EI : STy->elements()) { + ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets, + StartingOffset + SL->getElementOffset(ElementNum)); + ++ElementNum; + } + return; + } + ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { EVT VT = TempVTs[i]; @@ -1649,7 +1661,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } - if (!Func) { + // Both indirect calls and libcalls have nullptr Func. In order to distinguish + // between them we must rely on the call site value which is valid for + // indirect calls but is always null for libcalls. + bool isIndirectCall = !Func && CS; + + if (isIndirectCall) { // This is indirect function call case : PTX requires a prototype of the // form // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); @@ -1673,7 +1690,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag }; // We model convergent calls as separate opcodes. - unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall; + unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; if (CLI.IsConvergent) Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni : NVPTXISD::PrintConvergentCall; @@ -1707,12 +1724,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue CallArgEndOps[] = { Chain, - DAG.getConstant(Func ? 1 : 0, dl, MVT::i32), + DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); InFlag = Chain.getValue(1); - if (!Func) { + if (isIndirectCall) { SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue PrototypeOps[] = { Chain, DAG.getConstant(uniqueCallSite, dl, MVT::i32), diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index f0fbe5033fa8fbfa70c799b012365ee5c84075cf..48db941db9b80a7489d833e670a9bf0f891e7b89 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1318,9 +1318,6 @@ def ROTR64reg_sw : // Create SDNodes so they can be used in the DAG code, e.g. // NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) -def SDTIntShiftDOp : - SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, - SDTCisInt<0>, SDTCisInt<3>]>; def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index b9cce28726dd826a7b82cf2f64ccf02c3097c9ee..8c009aed887706c1238c328c604d7a6cc09a9824 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -102,12 +102,6 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { return Ret; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -118,7 +112,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, // specified, as it is the only relocation model currently supported. : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT, CPU, FS, Options, Reloc::PIC_, - getEffectiveCodeModel(CM), OL), + getEffectiveCodeModel(CM, CodeModel::Small), OL), is64bit(is64bit), UseShortPointers(UseShortPointersOpt), TLOF(llvm::make_unique()), Subtarget(TT, CPU, FS, *this) { diff --git a/lib/Target/Nios2/LLVMBuild.txt b/lib/Target/Nios2/LLVMBuild.txt index 0125bbedea58f264fb7674096e31246c7bdffda3..46dd93308c0412da965ee365a6198500124af7df 100644 --- a/lib/Target/Nios2/LLVMBuild.txt +++ b/lib/Target/Nios2/LLVMBuild.txt @@ -53,6 +53,7 @@ required_libraries = AsmPrinter Core GlobalISel MC + Nios2AsmPrinter Nios2Desc Nios2Info SelectionDAG diff --git a/lib/Target/Nios2/Nios2TargetMachine.cpp b/lib/Target/Nios2/Nios2TargetMachine.cpp index b7594dde709d649773018582d3bdc289daedd75b..4f90db9e6f6a50e3d4bf6ed8482183765f16a2a1 100644 --- a/lib/Target/Nios2/Nios2TargetMachine.cpp +++ b/lib/Target/Nios2/Nios2TargetMachine.cpp @@ -37,23 +37,15 @@ static Reloc::Model getEffectiveRelocModel(Optional RM) { return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional CM, - Reloc::Model RM, bool JIT) { - if (CM) - return *CM; - return CodeModel::Small; -} - Nios2TargetMachine::Nios2TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional RM, Optional CM, CodeGenOpt::Level OL, bool JIT) - : LLVMTargetMachine( - T, computeDataLayout(), TT, CPU, FS, Options, - getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM, getEffectiveRelocModel(RM), JIT), OL), + : LLVMTargetMachine(T, computeDataLayout(), TT, CPU, FS, Options, + getEffectiveRelocModel(RM), + getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt index ff27768128452107a80f090a75dab5f8367e0c72..3130d10fa5ed6eb279841576c533feb74929668b 100644 --- a/lib/Target/PowerPC/CMakeLists.txt +++ b/lib/Target/PowerPC/CMakeLists.txt @@ -10,6 +10,7 @@ tablegen(LLVM PPCGenInstrInfo.inc -gen-instr-info) tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM PPCGenRegisterInfo.inc -gen-register-info) tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM PPCGenExegesis.inc -gen-exegesis) add_public_tablegen_target(PowerPCCommonTableGen) diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 7c0f5f0eb30c889bb68df2fd26ac4c0b26cbc631..8c15ade6f9c4fd33405c92465475bc0937d74e9c 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -13,18 +13,13 @@ #include "MCTargetDesc/PPCFixupKinds.h" #include "PPCInstrInfo.h" +#include "PPCMCCodeEmitter.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixup.h" -#include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" @@ -39,117 +34,6 @@ using namespace llvm; STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); -namespace { - -class PPCMCCodeEmitter : public MCCodeEmitter { - const MCInstrInfo &MCII; - const MCContext &CTX; - bool IsLittleEndian; - -public: - PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : MCII(mcii), CTX(ctx), - IsLittleEndian(ctx.getAsmInfo()->isLittleEndian()) {} - PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete; - void operator=(const PPCMCCodeEmitter &) = delete; - ~PPCMCCodeEmitter() override = default; - - unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getMemRIX16Encoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getSPE8DisEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getSPE4DisEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getSPE2DisEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getTLSCallEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMachineOpValue - Return binary encoding of operand. If the machine - /// operand requires relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - // getBinaryCodeForInstr - TableGen'erated function for getting the - // binary encoding for an instruction. - uint64_t getBinaryCodeForInstr(const MCInst &MI, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - void encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - - unsigned Opcode = MI.getOpcode(); - const MCInstrDesc &Desc = MCII.get(Opcode); - - uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); - - // Output the constant in big/little endian byte order. - unsigned Size = Desc.getSize(); - support::endianness E = IsLittleEndian ? support::little : support::big; - switch (Size) { - case 0: - break; - case 4: - support::endian::write(OS, Bits, E); - break; - case 8: - // If we emit a pair of instructions, the first one is - // always in the top 32 bits, even on little-endian. - support::endian::write(OS, Bits >> 32, E); - support::endian::write(OS, Bits, E); - break; - default: - llvm_unreachable("Invalid instruction size"); - } - - ++MCNumEmitted; // Keep track of the # of mi's emitted. - } - -private: - uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; - void verifyInstructionPredicates(const MCInst &MI, - uint64_t AvailableFeatures) const; -}; - -} // end anonymous namespace - MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx) { @@ -396,5 +280,42 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, return MO.getImm(); } +void PPCMCCodeEmitter::encodeInstruction( + const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + verifyInstructionPredicates(MI, + computeAvailableFeatures(STI.getFeatureBits())); + + uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); + + // Output the constant in big/little endian byte order. + unsigned Size = getInstSizeInBytes(MI); + support::endianness E = IsLittleEndian ? support::little : support::big; + switch (Size) { + case 0: + break; + case 4: + support::endian::write(OS, Bits, E); + break; + case 8: + // If we emit a pair of instructions, the first one is + // always in the top 32 bits, even on little-endian. + support::endian::write(OS, Bits >> 32, E); + support::endian::write(OS, Bits, E); + break; + default: + llvm_unreachable("Invalid instruction size"); + } + + ++MCNumEmitted; // Keep track of the # of mi's emitted. +} + +// Get the number of bytes used to encode the given MCInst. +unsigned PPCMCCodeEmitter::getInstSizeInBytes(const MCInst &MI) const { + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MCII.get(Opcode); + return Desc.getSize(); +} + #define ENABLE_INSTR_PREDICATE_VERIFIER #include "PPCGenMCCodeEmitter.inc" diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h new file mode 100644 index 0000000000000000000000000000000000000000..a4bcff4b9450c31e6e9882bf6baa1fdcee3b1037 --- /dev/null +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h @@ -0,0 +1,109 @@ +//===-- PPCMCCodeEmitter.h - Convert PPC code to machine code -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PPCMCCodeEmitter class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_PPC_MCCODEEMITTER_PPCCODEEMITTER_H +#define LLVM_LIB_TARGET_PPC_MCCODEEMITTER_PPCCODEEMITTER_H + +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" + +namespace llvm { + +class PPCMCCodeEmitter : public MCCodeEmitter { + const MCInstrInfo &MCII; + const MCContext &CTX; + bool IsLittleEndian; + +public: + PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : MCII(mcii), CTX(ctx), + IsLittleEndian(ctx.getAsmInfo()->isLittleEndian()) {} + PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete; + void operator=(const PPCMCCodeEmitter &) = delete; + ~PPCMCCodeEmitter() override = default; + + unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getMemRIX16Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getSPE8DisEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getSPE4DisEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getSPE2DisEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getTLSCallEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + // Get the number of bytes used to encode the given MCInst. + unsigned getInstSizeInBytes(const MCInst &MI) const; + +private: + uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; + void verifyInstructionPredicates(const MCInst &MI, + uint64_t AvailableFeatures) const; +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_PPC_MCCODEEMITTER_PPCCODEEMITTER_H diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td index c6cbb9037ede1f4867fa687b29eda25f86d41189..17c37964c5622ecebe3e546fbb8354c3ea5a43e6 100644 --- a/lib/Target/PowerPC/P9InstrResources.td +++ b/lib/Target/PowerPC/P9InstrResources.td @@ -111,11 +111,11 @@ def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C], (instregex "CNT(L|T)Z(D|W)(8)?(o)?$"), (instregex "POPCNT(D|W)$"), (instregex "CMPB(8)?$"), + (instregex "SETB(8)?$"), XSTDIVDP, XSTSQRTDP, XSXSIGDP, XSCVSPDPN, - SETB, BPERMD )>; diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td index 80ad4962a20f4adb8f9ccdf767823d8552f19cca..98e6e98e69744a9e04f564b4b4e7ed3b440cf443 100644 --- a/lib/Target/PowerPC/PPC.td +++ b/lib/Target/PowerPC/PPC.td @@ -305,11 +305,11 @@ def : Processor<"generic", G3Itineraries, [Directive32, FeatureHardFloat, FeatureMFTB]>; def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL, FeatureFRES, FeatureFRSQRTE, - FeatureICBT, FeatureBookE, + FeatureICBT, FeatureBookE, FeatureMSYNC, FeatureMFTB]>; def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL, FeatureFRES, FeatureFRSQRTE, - FeatureICBT, FeatureBookE, + FeatureICBT, FeatureBookE, FeatureMSYNC, FeatureMFTB]>; def : Processor<"601", G3Itineraries, [Directive601, FeatureFPU]>; def : Processor<"602", G3Itineraries, [Directive602, FeatureFPU, @@ -348,7 +348,7 @@ def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec, FeatureFRES, FeatureFRSQRTE, FeatureMFTB]>; def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec, - FeatureFRES, FeatureFRSQRTE, + FeatureFRES, FeatureFRSQRTE, FeatureMFTB]>; def : ProcessorModel<"970", G5Model, @@ -369,11 +369,11 @@ def : ProcessorModel<"e500", PPCE500Model, FeatureISEL, FeatureMFTB]>; def : ProcessorModel<"e500mc", PPCE500mcModel, [DirectiveE500mc, - FeatureSTFIWX, FeatureICBT, FeatureBookE, + FeatureSTFIWX, FeatureICBT, FeatureBookE, FeatureISEL, FeatureMFTB]>; def : ProcessorModel<"e5500", PPCE5500Model, [DirectiveE5500, FeatureMFOCRF, Feature64Bit, - FeatureSTFIWX, FeatureICBT, FeatureBookE, + FeatureSTFIWX, FeatureICBT, FeatureBookE, FeatureISEL, FeatureMFTB]>; def : ProcessorModel<"a2", PPCA2Model, [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF, @@ -428,7 +428,7 @@ def : ProcessorModel<"pwr6x", G5Model, FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.Power7FeatureList>; def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.Power8FeatureList>; -def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.Power9FeatureList>; +def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.Power9FeatureList>; def : Processor<"ppc", G3Itineraries, [Directive32, FeatureHardFloat, FeatureMFTB]>; def : Processor<"ppc32", G3Itineraries, [Directive32, FeatureHardFloat, @@ -478,3 +478,9 @@ def PPC : Target { let AssemblyParserVariants = [PPCAsmParserVariant]; let AllowRegisterRenaming = 1; } + +//===----------------------------------------------------------------------===// +// Pfm Counters +//===----------------------------------------------------------------------===// + +include "PPCPfmCounters.td" diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index a00f5bda2aab9c7ebb3addfccc2b7aca04ce817b..04aa3c9b1e22cd2d32cc8d4ac7f4d331bb2d2e15 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -327,7 +327,7 @@ MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) { } void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) { - SM.serializeToStackMapSection(); + emitStackMaps(SM); } void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) { diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index 668169839e78cc539ece36e3c38bb84e311b4890..aa55ac1f7acc428f63827295a46fa67978f85f2e 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -2354,7 +2354,8 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD)) return false; - MI->eraseFromParent(); + MachineBasicBlock::iterator I(MI); + removeDeadCode(I, std::next(I)); return true; } diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 84dacf39646248b41df64fc0a88ae2ae72ccd8c3..8263954994d278862855d54e620aeb36c4116e66 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -17,6 +17,7 @@ #include "PPCMachineFunctionInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -28,6 +29,16 @@ using namespace llvm; +#define DEBUG_TYPE "framelowering" +STATISTIC(NumNoNeedForFrame, "Number of functions without frames"); +STATISTIC(NumPESpillVSR, "Number of spills to vector in prologue"); +STATISTIC(NumPEReloadVSR, "Number of reloads from vector in epilogue"); + +static cl::opt +EnablePEVectorSpills("ppc-enable-pe-vector-spills", + cl::desc("Enable spills in prologue to vector registers."), + cl::init(false), cl::Hidden); + /// VRRegNo - Map from a numbered VR register to its enum value. /// static const MCPhysReg VRRegNo[] = { @@ -466,6 +477,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, // Check whether we can skip adjusting the stack pointer (by using red zone) if (!DisableRedZone && CanUseRedZone && FitsInRedZone) { + NumNoNeedForFrame++; // No need for frame if (UpdateMF) MFI.setStackSize(0); @@ -1213,11 +1225,20 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, continue; } - int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx()); - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(Reg, true), Offset)); - BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + if (CSI[I].isSpilledToReg()) { + unsigned SpilledReg = CSI[I].getDstReg(); + unsigned CFIRegister = MF.addFrameInst(MCCFIInstruction::createRegister( + nullptr, MRI->getDwarfRegNum(Reg, true), + MRI->getDwarfRegNum(SpilledReg, true))); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIRegister); + } else { + int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx()); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(Reg, true), Offset)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } } } } @@ -1822,17 +1843,19 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, // Move general register save area spill slots down, taking into account // the size of the Floating-point register save area. for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) { - int FI = GPRegs[i].getFrameIdx(); - - MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + if (!GPRegs[i].isSpilledToReg()) { + int FI = GPRegs[i].getFrameIdx(); + MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + } } // Move general register save area spill slots down, taking into account // the size of the Floating-point register save area. for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) { - int FI = G8Regs[i].getFrameIdx(); - - MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + if (!G8Regs[i].isSpilledToReg()) { + int FI = G8Regs[i].getFrameIdx(); + MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); + } } unsigned MinReg = @@ -1947,6 +1970,64 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, } } +// This function checks if a callee saved gpr can be spilled to a volatile +// vector register. This occurs for leaf functions when the option +// ppc-enable-pe-vector-spills is enabled. If there are any remaining registers +// which were not spilled to vectors, return false so the target independent +// code can handle them by assigning a FrameIdx to a stack slot. +bool PPCFrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector &CSI) const { + + if (CSI.empty()) + return true; // Early exit if no callee saved registers are modified! + + // Early exit if cannot spill gprs to volatile vector registers. + MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!EnablePEVectorSpills || MFI.hasCalls() || !Subtarget.hasP9Vector()) + return false; + + // Build a BitVector of VSRs that can be used for spilling GPRs. + BitVector BVAllocatable = TRI->getAllocatableSet(MF); + BitVector BVCalleeSaved(TRI->getNumRegs()); + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + for (unsigned i = 0; CSRegs[i]; ++i) + BVCalleeSaved.set(CSRegs[i]); + + for (unsigned Reg : BVAllocatable.set_bits()) { + // Set to 0 if the register is not a volatile VF/F8 register, or if it is + // used in the function. + if (BVCalleeSaved[Reg] || + (!PPC::F8RCRegClass.contains(Reg) && + !PPC::VFRCRegClass.contains(Reg)) || + (MF.getRegInfo().isPhysRegUsed(Reg))) + BVAllocatable.reset(Reg); + } + + bool AllSpilledToReg = true; + for (auto &CS : CSI) { + if (BVAllocatable.none()) + return false; + + unsigned Reg = CS.getReg(); + if (!PPC::G8RCRegClass.contains(Reg) && !PPC::GPRCRegClass.contains(Reg)) { + AllSpilledToReg = false; + continue; + } + + unsigned VolatileVFReg = BVAllocatable.find_first(); + if (VolatileVFReg < BVAllocatable.size()) { + CS.setDstReg(VolatileVFReg); + BVAllocatable.reset(VolatileVFReg); + } else { + AllSpilledToReg = false; + } + } + return AllSpilledToReg; +} + + bool PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -2012,12 +2093,18 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, CSI[i].getFrameIdx())); } } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - // Use !IsLiveIn for the kill flag. - // We do not want to kill registers that are live in this function - // before their use because they will become undefined registers. - TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, - CSI[i].getFrameIdx(), RC, TRI); + if (CSI[i].isSpilledToReg()) { + NumPESpillVSR++; + BuildMI(MBB, MI, DL, TII.get(PPC::MTVSRD), CSI[i].getDstReg()) + .addReg(Reg, getKillRegState(true)); + } else { + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + // Use !IsLiveIn for the kill flag. + // We do not want to kill registers that are live in this function + // before their use because they will become undefined registers. + TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, + CSI[i].getFrameIdx(), RC, TRI); + } } } return true; @@ -2157,13 +2244,19 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, CR2Spilled = CR3Spilled = CR4Spilled = false; } - // Default behavior for non-CR saves. - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), - RC, TRI); - assert(I != MBB.begin() && - "loadRegFromStackSlot didn't insert any code!"); + if (CSI[i].isSpilledToReg()) { + DebugLoc DL; + NumPEReloadVSR++; + BuildMI(MBB, I, DL, TII.get(PPC::MFVSRD), Reg) + .addReg(CSI[i].getDstReg(), getKillRegState(true)); + } else { + // Default behavior for non-CR saves. + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI); + assert(I != MBB.begin() && + "loadRegFromStackSlot didn't insert any code!"); } + } // Insert in reverse order. if (AtStart) diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index 01c155594c442f9a953915d9a91a0b9352fa9a2b..69bd1484d6e59f6d9609bf9a30495a77a5095d51 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -99,6 +99,13 @@ public: MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const override; + /// This function will assign callee saved gprs to volatile vector registers + /// for prologue spills when applicable. It returns false if there are any + /// registers which were not spilled to volatile vector registers. + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector &CSI) const override; MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index cf3547f3cfb04f538e86ea91a78450ed694caad3..5f6966cecd61a310dfb5cafc9c33c172467fb5bc 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -103,7 +103,7 @@ bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID, case PPC::Sched::IIC_LdStLHA: case PPC::Sched::IIC_LdStLHAU: case PPC::Sched::IIC_LdStLWA: - case PPC::Sched::IIC_LdStSTDU: + case PPC::Sched::IIC_LdStSTU: case PPC::Sched::IIC_LdStSTFDU: NSlots = 2; break; @@ -112,7 +112,7 @@ bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID, case PPC::Sched::IIC_LdStLHAUX: case PPC::Sched::IIC_LdStLWARX: case PPC::Sched::IIC_LdStLDARX: - case PPC::Sched::IIC_LdStSTDUX: + case PPC::Sched::IIC_LdStSTUX: case PPC::Sched::IIC_LdStSTDCX: case PPC::Sched::IIC_LdStSTWCX: case PPC::Sched::IIC_BrMCRX: // mtcr diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 8861de6f0d8953dbd30ada9c80cdaa4c56b2195c..4f05f50f6994fa06bb7781498a88abb6d677d6d3 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -81,6 +81,8 @@ STATISTIC(NumLogicOpsOnComparison, "Number of logical ops on i1 values calculated in GPR."); STATISTIC(OmittedForNonExtendUses, "Number of compares not eliminated as they have non-extending uses."); +STATISTIC(NumP9Setb, + "Number of compares lowered to setb."); // FIXME: Remove this once the bug has been fixed! cl::opt ANDIGlueBug("expose-ppc-andi-glue-bug", @@ -327,7 +329,6 @@ private: bool isOffsetMultipleOf(SDNode *N, unsigned Val) const; void transferMemOperands(SDNode *N, SDNode *Result); - MachineSDNode *flipSignBit(const SDValue &N, SDNode **SignBit = nullptr); }; } // end anonymous namespace @@ -4138,49 +4139,144 @@ void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { CurDAG->setNodeMemRefs(cast(Result), {MemOp}); } -/// This method returns a node after flipping the MSB of each element -/// of vector integer type. Additionally, if SignBitVec is non-null, -/// this method sets a node with one at MSB of all elements -/// and zero at other bits in SignBitVec. -MachineSDNode * -PPCDAGToDAGISel::flipSignBit(const SDValue &N, SDNode **SignBitVec) { - SDLoc dl(N); - EVT VecVT = N.getValueType(); - if (VecVT == MVT::v4i32) { - if (SignBitVec) { - SDNode *ZV = CurDAG->getMachineNode(PPC::V_SET0, dl, MVT::v4i32); - *SignBitVec = CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, - SDValue(ZV, 0)); - } - return CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, N); - } - else if (VecVT == MVT::v8i16) { - SDNode *Hi = CurDAG->getMachineNode(PPC::LIS, dl, MVT::i32, - getI32Imm(0x8000, dl)); - SDNode *ScaImm = CurDAG->getMachineNode(PPC::ORI, dl, MVT::i32, - SDValue(Hi, 0), - getI32Imm(0x8000, dl)); - SDNode *VecImm = CurDAG->getMachineNode(PPC::MTVSRWS, dl, VecVT, - SDValue(ScaImm, 0)); - /* - Alternatively, we can do this as follow to use VRF instead of GPR. - vspltish 5, 1 - vspltish 6, 15 - vslh 5, 6, 5 - */ - if (SignBitVec) *SignBitVec = VecImm; - return CurDAG->getMachineNode(PPC::VADDUHM, dl, VecVT, N, - SDValue(VecImm, 0)); - } - else if (VecVT == MVT::v16i8) { - SDNode *VecImm = CurDAG->getMachineNode(PPC::XXSPLTIB, dl, MVT::i32, - getI32Imm(0x80, dl)); - if (SignBitVec) *SignBitVec = VecImm; - return CurDAG->getMachineNode(PPC::VADDUBM, dl, VecVT, N, - SDValue(VecImm, 0)); +static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG, + bool &NeedSwapOps, bool &IsUnCmp) { + + assert(N->getOpcode() == ISD::SELECT_CC && "Expecting a SELECT_CC here."); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue TrueRes = N->getOperand(2); + SDValue FalseRes = N->getOperand(3); + ConstantSDNode *TrueConst = dyn_cast(TrueRes); + if (!TrueConst) + return false; + + assert((N->getSimpleValueType(0) == MVT::i64 || + N->getSimpleValueType(0) == MVT::i32) && + "Expecting either i64 or i32 here."); + + // We are looking for any of: + // (select_cc lhs, rhs, 1, (sext (setcc [lr]hs, [lr]hs, cc2)), cc1) + // (select_cc lhs, rhs, -1, (zext (setcc [lr]hs, [lr]hs, cc2)), cc1) + // (select_cc lhs, rhs, 0, (select_cc [lr]hs, [lr]hs, 1, -1, cc2), seteq) + // (select_cc lhs, rhs, 0, (select_cc [lr]hs, [lr]hs, -1, 1, cc2), seteq) + int64_t TrueResVal = TrueConst->getSExtValue(); + if ((TrueResVal < -1 || TrueResVal > 1) || + (TrueResVal == -1 && FalseRes.getOpcode() != ISD::ZERO_EXTEND) || + (TrueResVal == 1 && FalseRes.getOpcode() != ISD::SIGN_EXTEND) || + (TrueResVal == 0 && + (FalseRes.getOpcode() != ISD::SELECT_CC || CC != ISD::SETEQ))) + return false; + + bool InnerIsSel = FalseRes.getOpcode() == ISD::SELECT_CC; + SDValue SetOrSelCC = InnerIsSel ? FalseRes : FalseRes.getOperand(0); + if (SetOrSelCC.getOpcode() != ISD::SETCC && + SetOrSelCC.getOpcode() != ISD::SELECT_CC) + return false; + + // Without this setb optimization, the outer SELECT_CC will be manually + // selected to SELECT_CC_I4/SELECT_CC_I8 Pseudo, then expand-isel-pseudos pass + // transforms pseduo instruction to isel instruction. When there are more than + // one use for result like zext/sext, with current optimization we only see + // isel is replaced by setb but can't see any significant gain. Since + // setb has longer latency than original isel, we should avoid this. Another + // point is that setb requires comparison always kept, it can break the + // oppotunity to get the comparison away if we have in future. + if (!SetOrSelCC.hasOneUse() || (!InnerIsSel && !FalseRes.hasOneUse())) + return false; + + SDValue InnerLHS = SetOrSelCC.getOperand(0); + SDValue InnerRHS = SetOrSelCC.getOperand(1); + ISD::CondCode InnerCC = + cast(SetOrSelCC.getOperand(InnerIsSel ? 4 : 2))->get(); + // If the inner comparison is a select_cc, make sure the true/false values are + // 1/-1 and canonicalize it if needed. + if (InnerIsSel) { + ConstantSDNode *SelCCTrueConst = + dyn_cast(SetOrSelCC.getOperand(2)); + ConstantSDNode *SelCCFalseConst = + dyn_cast(SetOrSelCC.getOperand(3)); + if (!SelCCTrueConst || !SelCCFalseConst) + return false; + int64_t SelCCTVal = SelCCTrueConst->getSExtValue(); + int64_t SelCCFVal = SelCCFalseConst->getSExtValue(); + // The values must be -1/1 (requiring a swap) or 1/-1. + if (SelCCTVal == -1 && SelCCFVal == 1) { + std::swap(InnerLHS, InnerRHS); + } else if (SelCCTVal != 1 || SelCCFVal != -1) + return false; } - else - llvm_unreachable("Unsupported vector data type for flipSignBit"); + + // Canonicalize unsigned case + if (InnerCC == ISD::SETULT || InnerCC == ISD::SETUGT) { + IsUnCmp = true; + InnerCC = (InnerCC == ISD::SETULT) ? ISD::SETLT : ISD::SETGT; + } + + bool InnerSwapped = false; + if (LHS == InnerRHS && RHS == InnerLHS) + InnerSwapped = true; + else if (LHS != InnerLHS || RHS != InnerRHS) + return false; + + switch (CC) { + // (select_cc lhs, rhs, 0, \ + // (select_cc [lr]hs, [lr]hs, 1, -1, setlt/setgt), seteq) + case ISD::SETEQ: + if (!InnerIsSel) + return false; + if (InnerCC != ISD::SETLT && InnerCC != ISD::SETGT) + return false; + NeedSwapOps = (InnerCC == ISD::SETGT) ? InnerSwapped : !InnerSwapped; + break; + + // (select_cc lhs, rhs, -1, (zext (setcc [lr]hs, [lr]hs, setne)), setu?lt) + // (select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setgt)), setu?lt) + // (select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setlt)), setu?lt) + // (select_cc lhs, rhs, 1, (sext (setcc [lr]hs, [lr]hs, setne)), setu?lt) + // (select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setgt)), setu?lt) + // (select_cc lhs, rhs, 1, (sext (setcc rhs, lhs, setlt)), setu?lt) + case ISD::SETULT: + if (!IsUnCmp && InnerCC != ISD::SETNE) + return false; + IsUnCmp = true; + LLVM_FALLTHROUGH; + case ISD::SETLT: + if (InnerCC == ISD::SETNE || (InnerCC == ISD::SETGT && !InnerSwapped) || + (InnerCC == ISD::SETLT && InnerSwapped)) + NeedSwapOps = (TrueResVal == 1); + else + return false; + break; + + // (select_cc lhs, rhs, 1, (sext (setcc [lr]hs, [lr]hs, setne)), setu?gt) + // (select_cc lhs, rhs, 1, (sext (setcc lhs, rhs, setlt)), setu?gt) + // (select_cc lhs, rhs, 1, (sext (setcc rhs, lhs, setgt)), setu?gt) + // (select_cc lhs, rhs, -1, (zext (setcc [lr]hs, [lr]hs, setne)), setu?gt) + // (select_cc lhs, rhs, -1, (zext (setcc lhs, rhs, setlt)), setu?gt) + // (select_cc lhs, rhs, -1, (zext (setcc rhs, lhs, setgt)), setu?gt) + case ISD::SETUGT: + if (!IsUnCmp && InnerCC != ISD::SETNE) + return false; + IsUnCmp = true; + LLVM_FALLTHROUGH; + case ISD::SETGT: + if (InnerCC == ISD::SETNE || (InnerCC == ISD::SETLT && !InnerSwapped) || + (InnerCC == ISD::SETGT && InnerSwapped)) + NeedSwapOps = (TrueResVal == -1); + else + return false; + break; + + default: + return false; + } + + LLVM_DEBUG(dbgs() << "Found a node that can be lowered to a SETB: "); + LLVM_DEBUG(N->dump()); + + return true; } // Select - Convert the specified operand from a target-independent to a @@ -4645,6 +4741,31 @@ void PPCDAGToDAGISel::Select(SDNode *N) { N->getOperand(0).getValueType() == MVT::i1) break; + if (PPCSubTarget->isISA3_0() && PPCSubTarget->isPPC64()) { + bool NeedSwapOps = false; + bool IsUnCmp = false; + if (mayUseP9Setb(N, CC, CurDAG, NeedSwapOps, IsUnCmp)) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if (NeedSwapOps) + std::swap(LHS, RHS); + + // Make use of SelectCC to generate the comparison to set CR bits, for + // equality comparisons having one literal operand, SelectCC probably + // doesn't need to materialize the whole literal and just use xoris to + // check it first, it leads the following comparison result can't + // exactly represent GT/LT relationship. So to avoid this we specify + // SETGT/SETUGT here instead of SETEQ. + SDValue GenCC = + SelectCC(LHS, RHS, IsUnCmp ? ISD::SETUGT : ISD::SETGT, dl); + CurDAG->SelectNodeTo( + N, N->getSimpleValueType(0) == MVT::i64 ? PPC::SETB8 : PPC::SETB, + N->getValueType(0), GenCC); + NumP9Setb++; + return; + } + } + // Handle the setcc cases here. select_cc lhs, 0, 1, 0, cc if (!isPPC64) if (ConstantSDNode *N1C = dyn_cast(N->getOperand(1))) @@ -4736,14 +4857,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops); return; } - case ISD::VSELECT: - if (PPCSubTarget->hasVSX()) { - SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) }; - CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops); - return; - } - break; - case ISD::VECTOR_SHUFFLE: if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64)) { @@ -4840,6 +4953,15 @@ void PPCDAGToDAGISel::Select(SDNode *N) { case PPC::PRED_NE: Opc = PPC::CRXOR; Swap = false; break; } + // A signed comparison of i1 values produces the opposite result to an + // unsigned one if the condition code includes less-than or greater-than. + // This is because 1 is the most negative signed i1 number and the most + // positive unsigned i1 number. The CR-logical operations used for such + // comparisons are non-commutative so for signed comparisons vs. unsigned + // ones, the input operands just need to be swapped. + if (ISD::isSignedIntSetCC(CC)) + Swap = !Swap; + SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1, N->getOperand(Swap ? 3 : 2), N->getOperand(Swap ? 2 : 3)), 0); @@ -4896,9 +5018,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SDValue TOCbase = N->getOperand(1); SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64, TOCbase, GA); - - if (isa(GA) || isa(GA) || - CModel == CodeModel::Large) { + if (PPCLowering->isAccessedAsGotIndirect(GA)) { + // If it is access as got-indirect, we need an extra LD to load + // the address. SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, SDValue(Tmp, 0)); transferMemOperands(N, MN); @@ -4906,18 +5028,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { return; } - if (GlobalAddressSDNode *G = dyn_cast(GA)) { - const GlobalValue *GV = G->getGlobal(); - unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); - if (GVFlags & PPCII::MO_NLP_FLAG) { - SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, - SDValue(Tmp, 0)); - transferMemOperands(N, MN); - ReplaceNode(N, MN); - return; - } - } - + // Build the address relative to the TOC-pointer.. ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64, SDValue(Tmp, 0), GA)); return; @@ -5003,55 +5114,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { return; } } - case ISD::ABS: { - assert(PPCSubTarget->hasP9Vector() && "ABS is supported with P9 Vector"); - - // For vector absolute difference, we use VABSDUW instruction of POWER9. - // Since VABSDU instructions are for unsigned integers, we need adjustment - // for signed integers. - // For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000). - // Otherwise, abs(sub(-1, 0)) returns 0xFFFFFFFF(=-1) instead of 1. - // For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000). - EVT VecVT = N->getOperand(0).getValueType(); - SDNode *AbsOp = nullptr; - unsigned AbsOpcode; - - if (VecVT == MVT::v4i32) - AbsOpcode = PPC::VABSDUW; - else if (VecVT == MVT::v8i16) - AbsOpcode = PPC::VABSDUH; - else if (VecVT == MVT::v16i8) - AbsOpcode = PPC::VABSDUB; - else - llvm_unreachable("Unsupported vector data type for ISD::ABS"); - - // Even for signed integers, we can skip adjustment if all values are - // known to be positive (as signed integer) due to zero-extended inputs. - if (N->getOperand(0).getOpcode() == ISD::SUB && - N->getOperand(0)->getOperand(0).getOpcode() == ISD::ZERO_EXTEND && - N->getOperand(0)->getOperand(1).getOpcode() == ISD::ZERO_EXTEND) { - AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, - SDValue(N->getOperand(0)->getOperand(0)), - SDValue(N->getOperand(0)->getOperand(1))); - ReplaceNode(N, AbsOp); - return; - } - if (N->getOperand(0).getOpcode() == ISD::SUB) { - SDValue SubVal = N->getOperand(0); - SDNode *Op0 = flipSignBit(SubVal->getOperand(0)); - SDNode *Op1 = flipSignBit(SubVal->getOperand(1)); - AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, - SDValue(Op0, 0), SDValue(Op1, 0)); - } - else { - SDNode *Op1 = nullptr; - SDNode *Op0 = flipSignBit(N->getOperand(0), &Op1); - AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, SDValue(Op0, 0), - SDValue(Op1, 0)); - } - ReplaceNode(N, AbsOp); - return; - } } SelectCode(N); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index c6f0212ab404ded54197438490319b8cc6e3774c..92af82dc4b9f52cb1af665dfdbd7b532ab07a939 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -251,12 +251,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::UREM, MVT::i64, Expand); } - if (Subtarget.hasP9Vector()) { - setOperationAction(ISD::ABS, MVT::v4i32, Legal); - setOperationAction(ISD::ABS, MVT::v8i16, Legal); - setOperationAction(ISD::ABS, MVT::v16i8, Legal); - } - // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); @@ -323,12 +317,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // to speed up scalar BSWAP64. // CTPOP or CTTZ were introduced in P8/P9 respectively setOperationAction(ISD::BSWAP, MVT::i32 , Expand); - if (Subtarget.isISA3_0()) { + if (Subtarget.hasP9Vector()) setOperationAction(ISD::BSWAP, MVT::i64 , Custom); + else + setOperationAction(ISD::BSWAP, MVT::i64 , Expand); + if (Subtarget.isISA3_0()) { setOperationAction(ISD::CTTZ , MVT::i32 , Legal); setOperationAction(ISD::CTTZ , MVT::i64 , Legal); } else { - setOperationAction(ISD::BSWAP, MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Expand); } @@ -554,6 +550,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // add/sub are legal for all supported vector VT's. setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); + setOperationAction(ISD::ABS, VT, Custom); // Vector instructions introduced in P8 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { @@ -586,6 +583,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, AddPromotedToType (ISD::LOAD , VT, MVT::v4i32); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v4i32); + setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::SELECT_CC, VT, Promote); AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32); setOperationAction(ISD::STORE, VT, Promote); @@ -626,7 +624,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); @@ -659,6 +656,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + // Without hasP8Altivec set, v2i64 SMAX isn't available. + // But ABS custom lowering requires SMAX support. + if (!Subtarget.hasP8Altivec()) + setOperationAction(ISD::ABS, MVT::v2i64, Expand); + addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); @@ -727,12 +729,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v2f64, Legal); setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); - setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); - setOperationAction(ISD::VSELECT, MVT::v8i16, Legal); - setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); - setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); - // Share the Altivec comparison restrictions. setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand); @@ -1087,6 +1083,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setTargetDAGCombine(ISD::FSQRT); } + if (Subtarget.hasP9Altivec()) { + setTargetDAGCombine(ISD::ABS); + setTargetDAGCombine(ISD::VSELECT); + } + // Darwin long double math library functions have $LDBL128 appended. if (Subtarget.isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); @@ -1347,6 +1348,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::RFEBB: return "PPCISD::RFEBB"; case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; + case PPCISD::VABSD: return "PPCISD::VABSD"; case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; @@ -9007,35 +9009,6 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getRegister(PPC::R2, MVT::i32); } - // We are looking for absolute values here. - // The idea is to try to fit one of two patterns: - // max (a, (0-a)) OR max ((0-a), a) - if (Subtarget.hasP9Vector() && - (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw || - IntrinsicID == Intrinsic::ppc_altivec_vmaxsh || - IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) { - SDValue V1 = Op.getOperand(1); - SDValue V2 = Op.getOperand(2); - if (V1.getSimpleValueType() == V2.getSimpleValueType() && - (V1.getSimpleValueType() == MVT::v4i32 || - V1.getSimpleValueType() == MVT::v8i16 || - V1.getSimpleValueType() == MVT::v16i8)) { - if ( V1.getOpcode() == ISD::SUB && - ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && - V1.getOperand(1) == V2 ) { - // Generate the abs instruction with the operands - return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2); - } - - if ( V2.getOpcode() == ISD::SUB && - ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && - V2.getOperand(1) == V1 ) { - // Generate the abs instruction with the operands - return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1); - } - } - } - // If this is a lowered altivec predicate compare, CompareOpc is set to the // opcode number of the comparison. int CompareOpc; @@ -9576,6 +9549,44 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { } } +SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { + + assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS"); + + EVT VT = Op.getValueType(); + assert(VT.isVector() && + "Only set vector abs as custom, scalar abs shouldn't reach here!"); + assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || + VT == MVT::v16i8) && + "Unexpected vector element type!"); + assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) && + "Current subtarget doesn't support smax v2i64!"); + + // For vector abs, it can be lowered to: + // abs x + // ==> + // y = -x + // smax(x, y) + + SDLoc dl(Op); + SDValue X = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X); + + // SMAX patch https://reviews.llvm.org/D47332 + // hasn't landed yet, so use intrinsic first here. + // TODO: Should use SMAX directly once SMAX patch landed + Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw; + if (VT == MVT::v2i64) + BifID = Intrinsic::ppc_altivec_vmaxsd; + else if (VT == MVT::v8i16) + BifID = Intrinsic::ppc_altivec_vmaxsh; + else if (VT == MVT::v16i8) + BifID = Intrinsic::ppc_altivec_vmaxsb; + + return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT); +} + /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -9628,6 +9639,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::ABS: return LowerABS(Op, DAG); // For counter-based loop handling. case ISD::INTRINSIC_W_CHAIN: return SDValue(); @@ -12610,9 +12622,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, (Op1VT == MVT::i32 || Op1VT == MVT::i16 || (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) { - // STBRX can only handle simple types. + // STBRX can only handle simple types and it makes no sense to store less + // two bytes in byte-reversed order. EVT mVT = cast(N)->getMemoryVT(); - if (mVT.isExtended()) + if (mVT.isExtended() || mVT.getSizeInBits() < 16) break; SDValue BSwapOp = N->getOperand(1).getOperand(0); @@ -12988,6 +13001,39 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } } } + + // Combine vmaxsw/h/b(a, a's negation) to abs(a) + // Expose the vabsduw/h/b opportunity for down stream + if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() && + (IID == Intrinsic::ppc_altivec_vmaxsw || + IID == Intrinsic::ppc_altivec_vmaxsh || + IID == Intrinsic::ppc_altivec_vmaxsb)) { + SDValue V1 = N->getOperand(1); + SDValue V2 = N->getOperand(2); + if ((V1.getSimpleValueType() == MVT::v4i32 || + V1.getSimpleValueType() == MVT::v8i16 || + V1.getSimpleValueType() == MVT::v16i8) && + V1.getSimpleValueType() == V2.getSimpleValueType()) { + // (0-a, a) + if (V1.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && + V1.getOperand(1) == V2) { + return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2); + } + // (a, 0-a) + if (V2.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && + V2.getOperand(1) == V1) { + return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1); + } + // (x-y, y-x) + if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB && + V1.getOperand(0) == V2.getOperand(1) && + V1.getOperand(1) == V2.getOperand(0)) { + return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1); + } + } + } } break; @@ -13220,6 +13266,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); + case ISD::ABS: + return combineABS(N, DCI); + case ISD::VSELECT: + return combineVSelect(N, DCI); } return SDValue(); @@ -13713,6 +13763,35 @@ unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, report_fatal_error("Invalid register name global variable"); } +bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const { + // 32-bit SVR4 ABI access everything as got-indirect. + if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) + return true; + + CodeModel::Model CModel = getTargetMachine().getCodeModel(); + // If it is small or large code model, module locals are accessed + // indirectly by loading their address from .toc/.got. The difference + // is that for large code model we have ADDISTocHa + LDtocL and for + // small code model we simply have LDtoc. + if (CModel == CodeModel::Small || CModel == CodeModel::Large) + return true; + + // JumpTable and BlockAddress are accessed as got-indirect. + if (isa(GA) || isa(GA)) + return true; + + if (GlobalAddressSDNode *G = dyn_cast(GA)) { + const GlobalValue *GV = G->getGlobal(); + unsigned char GVFlags = Subtarget.classifyGlobalReference(GV); + // The NLP flag indicates that a global access has to use an + // extra indirection. + if (GVFlags & PPCII::MO_NLP_FLAG) + return true; + } + + return false; +} + bool PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The PowerPC target isn't yet aware of offsets. @@ -14477,3 +14556,109 @@ isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { // For non-constant masks, we can always use the record-form and. return true; } + +// Transform (abs (sub (zext a), (zext b))) to (vabsd a b 0) +// Transform (abs (sub (zext a), (zext_invec b))) to (vabsd a b 0) +// Transform (abs (sub (zext_invec a), (zext_invec b))) to (vabsd a b 0) +// Transform (abs (sub (zext_invec a), (zext b))) to (vabsd a b 0) +// Transform (abs (sub a, b) to (vabsd a b 1)) if a & b of type v4i32 +SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const { + assert((N->getOpcode() == ISD::ABS) && "Need ABS node here"); + assert(Subtarget.hasP9Altivec() && + "Only combine this when P9 altivec supported!"); + EVT VT = N->getValueType(0); + if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + if (N->getOperand(0).getOpcode() == ISD::SUB) { + // Even for signed integers, if it's known to be positive (as signed + // integer) due to zero-extended inputs. + unsigned SubOpcd0 = N->getOperand(0)->getOperand(0).getOpcode(); + unsigned SubOpcd1 = N->getOperand(0)->getOperand(1).getOpcode(); + if ((SubOpcd0 == ISD::ZERO_EXTEND || + SubOpcd0 == ISD::ZERO_EXTEND_VECTOR_INREG) && + (SubOpcd1 == ISD::ZERO_EXTEND || + SubOpcd1 == ISD::ZERO_EXTEND_VECTOR_INREG)) { + return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(), + N->getOperand(0)->getOperand(0), + N->getOperand(0)->getOperand(1), + DAG.getTargetConstant(0, dl, MVT::i32)); + } + + // For type v4i32, it can be optimized with xvnegsp + vabsduw + if (N->getOperand(0).getValueType() == MVT::v4i32 && + N->getOperand(0).hasOneUse()) { + return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(), + N->getOperand(0)->getOperand(0), + N->getOperand(0)->getOperand(1), + DAG.getTargetConstant(1, dl, MVT::i32)); + } + } + + return SDValue(); +} + +// For type v4i32/v8ii16/v16i8, transform +// from (vselect (setcc a, b, setugt), (sub a, b), (sub b, a)) to (vabsd a, b) +// from (vselect (setcc a, b, setuge), (sub a, b), (sub b, a)) to (vabsd a, b) +// from (vselect (setcc a, b, setult), (sub b, a), (sub a, b)) to (vabsd a, b) +// from (vselect (setcc a, b, setule), (sub b, a), (sub a, b)) to (vabsd a, b) +SDValue PPCTargetLowering::combineVSelect(SDNode *N, + DAGCombinerInfo &DCI) const { + assert((N->getOpcode() == ISD::VSELECT) && "Need VSELECT node here"); + assert(Subtarget.hasP9Altivec() && + "Only combine this when P9 altivec supported!"); + + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + SDValue Cond = N->getOperand(0); + SDValue TrueOpnd = N->getOperand(1); + SDValue FalseOpnd = N->getOperand(2); + EVT VT = N->getOperand(1).getValueType(); + + if (Cond.getOpcode() != ISD::SETCC || TrueOpnd.getOpcode() != ISD::SUB || + FalseOpnd.getOpcode() != ISD::SUB) + return SDValue(); + + // ABSD only available for type v4i32/v8i16/v16i8 + if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8) + return SDValue(); + + // At least to save one more dependent computation + if (!(Cond.hasOneUse() || TrueOpnd.hasOneUse() || FalseOpnd.hasOneUse())) + return SDValue(); + + ISD::CondCode CC = cast(Cond.getOperand(2))->get(); + + // Can only handle unsigned comparison here + switch (CC) { + default: + return SDValue(); + case ISD::SETUGT: + case ISD::SETUGE: + break; + case ISD::SETULT: + case ISD::SETULE: + std::swap(TrueOpnd, FalseOpnd); + break; + } + + SDValue CmpOpnd1 = Cond.getOperand(0); + SDValue CmpOpnd2 = Cond.getOperand(1); + + // SETCC CmpOpnd1 CmpOpnd2 cond + // TrueOpnd = CmpOpnd1 - CmpOpnd2 + // FalseOpnd = CmpOpnd2 - CmpOpnd1 + if (TrueOpnd.getOperand(0) == CmpOpnd1 && + TrueOpnd.getOperand(1) == CmpOpnd2 && + FalseOpnd.getOperand(0) == CmpOpnd2 && + FalseOpnd.getOperand(1) == CmpOpnd1) { + return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(1).getValueType(), + CmpOpnd1, CmpOpnd2, + DAG.getTargetConstant(0, dl, MVT::i32)); + } + + return SDValue(); +} diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 1020cab48c8c7bdbf2c29472a04ce81c3803acd1..09039cb77363d909561ccea7d3c6098d5c5e7d94 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -373,6 +373,21 @@ namespace llvm { /// An SDNode for swaps that are not associated with any loads/stores /// and thereby have no chain. SWAP_NO_CHAIN, + + /// An SDNode for Power9 vector absolute value difference. + /// operand #0 vector + /// operand #1 vector + /// operand #2 constant i32 0 or 1, to indicate whether needs to patch + /// the most significant bit for signed i32 + /// + /// Power9 VABSD* instructions are designed to support unsigned integer + /// vectors (byte/halfword/word), if we want to make use of them for signed + /// integer vectors, we have to flip their sign bits first. To flip sign bit + /// for byte/halfword integer vector would become inefficient, but for word + /// integer vector, we can leverage XVNEGSP to make it efficiently. eg: + /// abs(sub(a,b)) => VABSDUW(a+0x80000000, b+0x80000000) + /// => VABSDUW((XVNEGSP a), (XVNEGSP b)) + VABSD, /// QVFPERM = This corresponds to the QPX qvfperm instruction. QVFPERM, @@ -789,6 +804,9 @@ namespace llvm { return true; } + // Returns true if the address of the global is stored in TOC entry. + bool isAccessedAsGotIndirect(SDValue N) const; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, @@ -995,6 +1013,7 @@ namespace llvm { SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const; @@ -1098,6 +1117,8 @@ namespace llvm { SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineVSelect(SDNode *N, DAGCombinerInfo &DCI) const; /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces /// SETCC with integer subtraction when (1) there is a legal way of doing it diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index b533efd0ffa4433f253575eaf48cdc3bea91e5f0..2ce6ad3293eb1bc0644ecbef8b9176d7f01c5655 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -94,7 +94,7 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in { } let Defs = [LR8] in - def MovePCtoLR8 : Pseudo<(outs), (ins), "#MovePCtoLR8", []>, + def MovePCtoLR8 : PPCEmitTimePseudo<(outs), (ins), "#MovePCtoLR8", []>, PPC970_Unit_BRU; let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { @@ -199,47 +199,45 @@ def : Pat<(PPCcall_nop (i64 texternalsym:$dst)), // clean this up in PPCMIPeephole with calls to // PPCInstrInfo::convertToImmediateForm() but we should probably not emit them // in the first place. -let usesCustomInserter = 1 in { - let Defs = [CR0] in { - def ATOMIC_LOAD_ADD_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64", - [(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>; - def ATOMIC_LOAD_SUB_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64", - [(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>; - def ATOMIC_LOAD_OR_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64", - [(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>; - def ATOMIC_LOAD_XOR_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64", - [(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>; - def ATOMIC_LOAD_AND_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_i64", - [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>; - def ATOMIC_LOAD_NAND_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64", - [(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>; - def ATOMIC_LOAD_MIN_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MIN_I64", - [(set i64:$dst, (atomic_load_min_64 xoaddr:$ptr, i64:$incr))]>; - def ATOMIC_LOAD_MAX_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MAX_I64", - [(set i64:$dst, (atomic_load_max_64 xoaddr:$ptr, i64:$incr))]>; - def ATOMIC_LOAD_UMIN_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMIN_I64", - [(set i64:$dst, (atomic_load_umin_64 xoaddr:$ptr, i64:$incr))]>; - def ATOMIC_LOAD_UMAX_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMAX_I64", - [(set i64:$dst, (atomic_load_umax_64 xoaddr:$ptr, i64:$incr))]>; - - def ATOMIC_CMP_SWAP_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64", - [(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>; - - def ATOMIC_SWAP_I64 : Pseudo< - (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64", - [(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>; - } +let Defs = [CR0] in { + def ATOMIC_LOAD_ADD_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64", + [(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_SUB_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64", + [(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_OR_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64", + [(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_XOR_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64", + [(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_AND_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_i64", + [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_NAND_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64", + [(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_MIN_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MIN_I64", + [(set i64:$dst, (atomic_load_min_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_MAX_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MAX_I64", + [(set i64:$dst, (atomic_load_max_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_UMIN_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMIN_I64", + [(set i64:$dst, (atomic_load_umin_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_UMAX_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMAX_I64", + [(set i64:$dst, (atomic_load_umax_64 xoaddr:$ptr, i64:$incr))]>; + + def ATOMIC_CMP_SWAP_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64", + [(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>; + + def ATOMIC_SWAP_I64 : PPCCustomInserterPseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64", + [(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>; } // Instructions to support atomic operations @@ -269,18 +267,18 @@ def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC), let Interpretation64Bit = 1, isCodeGenOnly = 1 in { let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in -def TCRETURNdi8 :Pseudo< (outs), +def TCRETURNdi8 :PPCEmitTimePseudo< (outs), (ins calltarget:$dst, i32imm:$offset), "#TC_RETURNd8 $dst $offset", []>; let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in -def TCRETURNai8 :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset), +def TCRETURNai8 :PPCEmitTimePseudo<(outs), (ins abscalltarget:$func, i32imm:$offset), "#TC_RETURNa8 $func $offset", [(PPCtc_return (i64 imm:$func), imm:$offset)]>; let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in -def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset), +def TCRETURNri8 : PPCEmitTimePseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset), "#TC_RETURNr8 $dst $offset", []>; @@ -347,14 +345,19 @@ def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins), } // hasExtraSrcRegAllocReq = 1 } // hasSideEffects = 0 -let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { +// While longjmp is a control-flow barrier (fallthrough isn't allowed), setjmp +// is not. +let hasSideEffects = 1 in { let Defs = [CTR8] in - def EH_SjLj_SetJmp64 : Pseudo<(outs gprc:$dst), (ins memr:$buf), + def EH_SjLj_SetJmp64 : PPCCustomInserterPseudo<(outs gprc:$dst), (ins memr:$buf), "#EH_SJLJ_SETJMP64", [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>, Requires<[In64BitMode]>; +} + +let hasSideEffects = 1, isBarrier = 1 in { let isTerminator = 1 in - def EH_SjLj_LongJmp64 : Pseudo<(outs), (ins memr:$buf), + def EH_SjLj_LongJmp64 : PPCCustomInserterPseudo<(outs), (ins memr:$buf), "#EH_SJLJ_LONGJMP64", [(PPCeh_sjlj_longjmp addr:$buf)]>, Requires<[In64BitMode]>; @@ -396,10 +399,10 @@ def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins), // the POWER3. let Defs = [X1], Uses = [X1] in -def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8", +def DYNALLOC8 : PPCEmitTimePseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8", [(set i64:$result, (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>; -def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8", +def DYNAREAOFFSET8 : PPCEmitTimePseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8", [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>; let Defs = [LR8] in { @@ -774,8 +777,12 @@ def MADDHDU : VAForm_1a<49, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC) "maddhdu $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; def MADDLD : VAForm_1a<51, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; -def SETB : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA), - "setb $RT, $BFA", IIC_IntGeneral>, isPPC64; +def SETB : XForm_44<31, 128, (outs gprc:$RT), (ins crrc:$BFA), + "setb $RT, $BFA", IIC_IntGeneral>, isPPC64; +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { + def SETB8 : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA), + "setb $RT, $BFA", IIC_IntGeneral>, isPPC64; +} def DARN : XForm_45<31, 755, (outs g8rc:$RT), (ins i32imm:$L), "darn $RT, $L", IIC_LdStLD>, isPPC64; def ADDPCIS : DXForm<19, 2, (outs g8rc:$RT), (ins i32imm:$D), @@ -1019,19 +1026,19 @@ def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src), // The following four definitions are selected for small code model only. // Otherwise, we need to create two instructions to form a 32-bit offset, // so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select(). -def LDtoc: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), +def LDtoc: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), "#LDtoc", [(set i64:$rD, (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64; -def LDtocJTI: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), +def LDtocJTI: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), "#LDtocJTI", [(set i64:$rD, (PPCtoc_entry tjumptable:$disp, i64:$reg))]>, isPPC64; -def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), +def LDtocCPT: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), "#LDtocCPT", [(set i64:$rD, (PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64; -def LDtocBA: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), +def LDtocBA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), "#LDtocCPT", [(set i64:$rD, (PPCtoc_entry tblockaddress:$disp, i64:$reg))]>, isPPC64; @@ -1072,40 +1079,40 @@ def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src), // Support for medium and large code model. let hasSideEffects = 0 in { let isReMaterializable = 1 in { -def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), +def ADDIStocHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDIStocHA", []>, isPPC64; -def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), +def ADDItocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDItocL", []>, isPPC64; } let mayLoad = 1 in -def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), +def LDtocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), "#LDtocL", []>, isPPC64; } // Support for thread-local storage. -def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), +def ADDISgotTprelHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), "#ADDISgotTprelHA", [(set i64:$rD, (PPCaddisGotTprelHA i64:$reg, tglobaltlsaddr:$disp))]>, isPPC64; -def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg), +def LDgotTprelL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg), "#LDgotTprelL", [(set i64:$rD, (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>, isPPC64; -let isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in -def CFENCE8 : Pseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>; +let Defs = [CR7], Itinerary = IIC_LdStSync in +def CFENCE8 : PPCPostRAExpPseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>; def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g), (ADD8TLS $in, tglobaltlsaddr:$g)>; -def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), +def ADDIStlsgdHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), "#ADDIStlsgdHA", [(set i64:$rD, (PPCaddisTlsgdHA i64:$reg, tglobaltlsaddr:$disp))]>, isPPC64; -def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), +def ADDItlsgdL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), "#ADDItlsgdL", [(set i64:$rD, (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>, @@ -1116,7 +1123,7 @@ def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), // correct because the branch select pass is relying on it. let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Size = 8, Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in -def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), +def GETtlsADDR : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), "#GETtlsADDR", [(set i64:$rD, (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>, @@ -1126,7 +1133,7 @@ def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in -def ADDItlsgdLADDR : Pseudo<(outs g8rc:$rD), +def ADDItlsgdLADDR : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym), "#ADDItlsgdLADDR", [(set i64:$rD, @@ -1134,12 +1141,12 @@ def ADDItlsgdLADDR : Pseudo<(outs g8rc:$rD), tglobaltlsaddr:$disp, tglobaltlsaddr:$sym))]>, isPPC64; -def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), +def ADDIStlsldHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), "#ADDIStlsldHA", [(set i64:$rD, (PPCaddisTlsldHA i64:$reg, tglobaltlsaddr:$disp))]>, isPPC64; -def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), +def ADDItlsldL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), "#ADDItlsldL", [(set i64:$rD, (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>, @@ -1148,7 +1155,7 @@ def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), // explicitly defined when this op is created, so not mentioned here. let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in -def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), +def GETtlsldADDR : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), "#GETtlsldADDR", [(set i64:$rD, (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>, @@ -1158,7 +1165,7 @@ def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in -def ADDItlsldLADDR : Pseudo<(outs g8rc:$rD), +def ADDItlsldLADDR : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym), "#ADDItlsldLADDR", [(set i64:$rD, @@ -1166,13 +1173,13 @@ def ADDItlsldLADDR : Pseudo<(outs g8rc:$rD), tglobaltlsaddr:$disp, tglobaltlsaddr:$sym))]>, isPPC64; -def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), +def ADDISdtprelHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), "#ADDISdtprelHA", [(set i64:$rD, (PPCaddisDtprelHA i64:$reg, tglobaltlsaddr:$disp))]>, isPPC64; -def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), +def ADDIdtprelL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), "#ADDIdtprelL", [(set i64:$rD, (PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>, @@ -1222,30 +1229,30 @@ def STDBRX: XForm_8_memOp<31, 660, (outs), (ins g8rc:$rS, memrr:$dst), let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in { def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst), - "stbu $rS, $dst", IIC_LdStStoreUpd, []>, + "stbu $rS, $dst", IIC_LdStSTU, []>, RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst), - "sthu $rS, $dst", IIC_LdStStoreUpd, []>, + "sthu $rS, $dst", IIC_LdStSTU, []>, RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst), - "stwu $rS, $dst", IIC_LdStStoreUpd, []>, + "stwu $rS, $dst", IIC_LdStSTU, []>, RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; def STBUX8: XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), - "stbux $rS, $dst", IIC_LdStStoreUpd, []>, + "stbux $rS, $dst", IIC_LdStSTUX, []>, RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, PPC970_DGroup_Cracked; def STHUX8: XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), - "sthux $rS, $dst", IIC_LdStStoreUpd, []>, + "sthux $rS, $dst", IIC_LdStSTUX, []>, RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, PPC970_DGroup_Cracked; def STWUX8: XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), - "stwux $rS, $dst", IIC_LdStStoreUpd, []>, + "stwux $rS, $dst", IIC_LdStSTUX, []>, RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, PPC970_DGroup_Cracked; @@ -1253,13 +1260,13 @@ def STWUX8: XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res), def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrix:$dst), - "stdu $rS, $dst", IIC_LdStSTDU, []>, + "stdu $rS, $dst", IIC_LdStSTU, []>, RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">, isPPC64; def STDUX : XForm_8_memOp<31, 181, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), - "stdux $rS, $dst", IIC_LdStSTDUX, []>, + "stdux $rS, $dst", IIC_LdStSTUX, []>, RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, PPC970_DGroup_Cracked, isPPC64; diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 24969d7ef853773e75f4fea921f3a2f820cd8ca3..69b19e45c3e9ccb9c118d210d1557ae1f93435ce 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -1051,6 +1051,20 @@ def : Pat<(v4f32 (ftrunc v4f32:$vA)), def : Pat<(v4f32 (fnearbyint v4f32:$vA)), (VRFIN $vA)>; +// Vector selection +def : Pat<(v16i8 (vselect v16i8:$vA, v16i8:$vB, v16i8:$vC)), + (VSEL $vC, $vB, $vA)>; +def : Pat<(v8i16 (vselect v8i16:$vA, v8i16:$vB, v8i16:$vC)), + (VSEL $vC, $vB, $vA)>; +def : Pat<(v4i32 (vselect v4i32:$vA, v4i32:$vB, v4i32:$vC)), + (VSEL $vC, $vB, $vA)>; +def : Pat<(v2i64 (vselect v2i64:$vA, v2i64:$vB, v2i64:$vC)), + (VSEL $vC, $vB, $vA)>; +def : Pat<(v4f32 (vselect v4i32:$vA, v4f32:$vB, v4f32:$vC)), + (VSEL $vC, $vB, $vA)>; +def : Pat<(v2f64 (vselect v2i64:$vA, v2f64:$vB, v2f64:$vC)), + (VSEL $vC, $vB, $vA)>; + } // end HasAltivec def HasP8Altivec : Predicate<"PPCSubTarget->hasP8Altivec()">; diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index f5f4b46344cff8959e347a437cd49b894229b33c..2fe765dd99e1e90fdd104f6fcdbf737486cddfe1 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -2153,7 +2153,9 @@ class Z23Form_8 opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, } //===----------------------------------------------------------------------===// -class Pseudo pattern> +// EmitTimePseudo won't have encoding information for the [MC]CodeEmitter +// stuff +class PPCEmitTimePseudo pattern> : I<0, OOL, IOL, asmstr, NoItinerary> { let isCodeGenOnly = 1; let PPC64 = 0; @@ -2162,6 +2164,21 @@ class Pseudo pattern> let hasNoSchedulingInfo = 1; } +// Instruction that require custom insertion support +// a.k.a. ISelPseudos, however, these won't have isPseudo set +class PPCCustomInserterPseudo pattern> + : PPCEmitTimePseudo { + let usesCustomInserter = 1; +} + +// PostRAPseudo will be expanded in expandPostRAPseudo, isPseudo flag in td +// files is set only for PostRAPseudo +class PPCPostRAExpPseudo pattern> + : PPCEmitTimePseudo { + let isPseudo = 1; +} + class PseudoXFormMemOp pattern> - : Pseudo, XFormMemOp; + : PPCPostRAExpPseudo, XFormMemOp; diff --git a/lib/Target/PowerPC/PPCInstrHTM.td b/lib/Target/PowerPC/PPCInstrHTM.td index 6c4e2129087cabb135101cadff829482a94fb6c9..0efe797c765daa7107c0fdf5e1a2a80aa925c705 100644 --- a/lib/Target/PowerPC/PPCInstrHTM.td +++ b/lib/Target/PowerPC/PPCInstrHTM.td @@ -20,8 +20,8 @@ def HTM_get_imm : SDNodeXFormgetZExtValue(), SDLoc(N)); }]>; -let hasSideEffects = 1, usesCustomInserter = 1 in { -def TCHECK_RET : Pseudo<(outs crrc:$out), (ins), "#TCHECK_RET", []>; +let hasSideEffects = 1 in { +def TCHECK_RET : PPCCustomInserterPseudo<(outs crrc:$out), (ins), "#TCHECK_RET", []>; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 559ed59bec9f1f08f9f1ba02bcf639e935aeb7a9..d26af451303b6368e0e513033860d47ffa31b104 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1429,17 +1429,15 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, : (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR))); } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) { MI.setDesc(get(PPC::BCLR)); - MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addReg(Pred[1].getReg()); + MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]); } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) { MI.setDesc(get(PPC::BCLRn)); - MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addReg(Pred[1].getReg()); + MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]); } else { MI.setDesc(get(PPC::BCCLR)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) .addImm(Pred[0].getImm()) - .addReg(Pred[1].getReg()); + .add(Pred[1]); } return true; @@ -1454,7 +1452,7 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, MI.setDesc(get(PPC::BC)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addReg(Pred[1].getReg()) + .add(Pred[1]) .addMBB(MBB); } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) { MachineBasicBlock *MBB = MI.getOperand(0).getMBB(); @@ -1462,7 +1460,7 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, MI.setDesc(get(PPC::BCn)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addReg(Pred[1].getReg()) + .add(Pred[1]) .addMBB(MBB); } else { MachineBasicBlock *MBB = MI.getOperand(0).getMBB(); @@ -1471,13 +1469,13 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, MI.setDesc(get(PPC::BCC)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) .addImm(Pred[0].getImm()) - .addReg(Pred[1].getReg()) + .add(Pred[1]) .addMBB(MBB); } return true; - } else if (OpC == PPC::BCTR || OpC == PPC::BCTR8 || - OpC == PPC::BCTRL || OpC == PPC::BCTRL8) { + } else if (OpC == PPC::BCTR || OpC == PPC::BCTR8 || OpC == PPC::BCTRL || + OpC == PPC::BCTRL8) { if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) llvm_unreachable("Cannot predicate bctr[l] on the ctr register"); @@ -1487,14 +1485,12 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, if (Pred[0].getImm() == PPC::PRED_BIT_SET) { MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) : (setLR ? PPC::BCCTRL : PPC::BCCTR))); - MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addReg(Pred[1].getReg()); + MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]); return true; } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) { MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n) : (setLR ? PPC::BCCTRLn : PPC::BCCTRn))); - MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addReg(Pred[1].getReg()); + MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]); return true; } @@ -1502,7 +1498,7 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, : (setLR ? PPC::BCCCTRL : PPC::BCCCTR))); MachineInstrBuilder(*MI.getParent()->getParent(), MI) .addImm(Pred[0].getImm()) - .addReg(Pred[1].getReg()); + .add(Pred[1]); return true; } @@ -2984,8 +2980,10 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, if (PostRA) { if (isVFReg(MI.getOperand(0).getReg())) III.ImmOpcode = PPC::LXSSP; - else + else { III.ImmOpcode = PPC::LFS; + III.ImmMustBeMultipleOf = 1; + } break; } LLVM_FALLTHROUGH; @@ -2996,8 +2994,10 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, if (PostRA) { if (isVFReg(MI.getOperand(0).getReg())) III.ImmOpcode = PPC::LXSD; - else + else { III.ImmOpcode = PPC::LFD; + III.ImmMustBeMultipleOf = 1; + } break; } LLVM_FALLTHROUGH; @@ -3012,8 +3012,10 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, if (PostRA) { if (isVFReg(MI.getOperand(0).getReg())) III.ImmOpcode = PPC::STXSSP; - else + else { III.ImmOpcode = PPC::STFS; + III.ImmMustBeMultipleOf = 1; + } break; } LLVM_FALLTHROUGH; @@ -3024,8 +3026,10 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, if (PostRA) { if (isVFReg(MI.getOperand(0).getReg())) III.ImmOpcode = PPC::STXSD; - else + else { III.ImmOpcode = PPC::STFD; + III.ImmMustBeMultipleOf = 1; + } break; } LLVM_FALLTHROUGH; @@ -3484,6 +3488,7 @@ static bool isSignExtendingOp(const MachineInstr &MI) { Opcode == PPC::EXTSH || Opcode == PPC::EXTSHo || Opcode == PPC::EXTSB8 || Opcode == PPC::EXTSH8 || Opcode == PPC::EXTSW || Opcode == PPC::EXTSWo || + Opcode == PPC::SETB || Opcode == PPC::SETB8 || Opcode == PPC::EXTSH8_32_64 || Opcode == PPC::EXTSW_32_64 || Opcode == PPC::EXTSB8_32_64) return true; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index d7e32a5d89f12f13c014e26f0f5a3dfa3707397c..dd3f1ac790894961d9550ffd2380c8cbd4f52424 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -1195,77 +1195,76 @@ multiclass AForm_3r opcode, bits<5> xo, dag OOL, dag IOL, //===----------------------------------------------------------------------===// // PowerPC Instruction Definitions. -// Pseudo-instructions: +// Pseudo instructions: let hasCtrlDep = 1 in { let Defs = [R1], Uses = [R1] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), +def ADJCALLSTACKDOWN : PPCEmitTimePseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKDOWN $amt1 $amt2", [(callseq_start timm:$amt1, timm:$amt2)]>; -def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), +def ADJCALLSTACKUP : PPCEmitTimePseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", [(callseq_end timm:$amt1, timm:$amt2)]>; } -def UPDATE_VRSAVE : Pseudo<(outs gprc:$rD), (ins gprc:$rS), +def UPDATE_VRSAVE : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$rS), "UPDATE_VRSAVE $rD, $rS", []>; } let Defs = [R1], Uses = [R1] in -def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC", +def DYNALLOC : PPCEmitTimePseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC", [(set i32:$result, (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>; -def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET", +def DYNAREAOFFSET : PPCEmitTimePseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET", [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>; // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. -let usesCustomInserter = 1, // Expanded after instruction selection. - PPC970_Single = 1 in { +let PPC970_Single = 1 in { // Note that SELECT_CC_I4 and SELECT_CC_I8 use the no-r0 register classes // because either operand might become the first operand in an isel, and // that operand cannot be r0. - def SELECT_CC_I4 : Pseudo<(outs gprc:$dst), (ins crrc:$cond, + def SELECT_CC_I4 : PPCCustomInserterPseudo<(outs gprc:$dst), (ins crrc:$cond, gprc_nor0:$T, gprc_nor0:$F, i32imm:$BROPC), "#SELECT_CC_I4", []>; - def SELECT_CC_I8 : Pseudo<(outs g8rc:$dst), (ins crrc:$cond, + def SELECT_CC_I8 : PPCCustomInserterPseudo<(outs g8rc:$dst), (ins crrc:$cond, g8rc_nox0:$T, g8rc_nox0:$F, i32imm:$BROPC), "#SELECT_CC_I8", []>; - def SELECT_CC_F4 : Pseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F, + def SELECT_CC_F4 : PPCCustomInserterPseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F, i32imm:$BROPC), "#SELECT_CC_F4", []>; - def SELECT_CC_F8 : Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F, + def SELECT_CC_F8 : PPCCustomInserterPseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F, i32imm:$BROPC), "#SELECT_CC_F8", []>; - def SELECT_CC_F16 : Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F, + def SELECT_CC_F16 : PPCCustomInserterPseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F, i32imm:$BROPC), "#SELECT_CC_F16", []>; - def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F, + def SELECT_CC_VRRC: PPCCustomInserterPseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F, i32imm:$BROPC), "#SELECT_CC_VRRC", []>; // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition // register bit directly. - def SELECT_I4 : Pseudo<(outs gprc:$dst), (ins crbitrc:$cond, + def SELECT_I4 : PPCCustomInserterPseudo<(outs gprc:$dst), (ins crbitrc:$cond, gprc_nor0:$T, gprc_nor0:$F), "#SELECT_I4", [(set i32:$dst, (select i1:$cond, i32:$T, i32:$F))]>; - def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond, + def SELECT_I8 : PPCCustomInserterPseudo<(outs g8rc:$dst), (ins crbitrc:$cond, g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8", [(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>; let Predicates = [HasFPU] in { - def SELECT_F4 : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond, + def SELECT_F4 : PPCCustomInserterPseudo<(outs f4rc:$dst), (ins crbitrc:$cond, f4rc:$T, f4rc:$F), "#SELECT_F4", [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>; - def SELECT_F8 : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond, + def SELECT_F8 : PPCCustomInserterPseudo<(outs f8rc:$dst), (ins crbitrc:$cond, f8rc:$T, f8rc:$F), "#SELECT_F8", [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>; - def SELECT_F16 : Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond, + def SELECT_F16 : PPCCustomInserterPseudo<(outs vrrc:$dst), (ins crbitrc:$cond, vrrc:$T, vrrc:$F), "#SELECT_F16", [(set f128:$dst, (select i1:$cond, f128:$T, f128:$F))]>; } - def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond, + def SELECT_VRRC: PPCCustomInserterPseudo<(outs vrrc:$dst), (ins crbitrc:$cond, vrrc:$T, vrrc:$F), "#SELECT_VRRC", [(set v4i32:$dst, (select i1:$cond, v4i32:$T, v4i32:$F))]>; @@ -1274,18 +1273,18 @@ let Predicates = [HasFPU] in { // SPILL_CR - Indicate that we're dumping the CR register, so we'll need to // scavenge a register for it. let mayStore = 1 in { -def SPILL_CR : Pseudo<(outs), (ins crrc:$cond, memri:$F), +def SPILL_CR : PPCEmitTimePseudo<(outs), (ins crrc:$cond, memri:$F), "#SPILL_CR", []>; -def SPILL_CRBIT : Pseudo<(outs), (ins crbitrc:$cond, memri:$F), +def SPILL_CRBIT : PPCEmitTimePseudo<(outs), (ins crbitrc:$cond, memri:$F), "#SPILL_CRBIT", []>; } // RESTORE_CR - Indicate that we're restoring the CR register (previously // spilled), so we'll need to scavenge a register for it. let mayLoad = 1 in { -def RESTORE_CR : Pseudo<(outs crrc:$cond), (ins memri:$F), +def RESTORE_CR : PPCEmitTimePseudo<(outs crrc:$cond), (ins memri:$F), "#RESTORE_CR", []>; -def RESTORE_CRBIT : Pseudo<(outs crbitrc:$cond), (ins memri:$F), +def RESTORE_CRBIT : PPCEmitTimePseudo<(outs crbitrc:$cond), (ins memri:$F), "#RESTORE_CRBIT", []>; } @@ -1311,10 +1310,10 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in { } let Defs = [LR] in - def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>, + def MovePCtoLR : PPCEmitTimePseudo<(outs), (ins), "#MovePCtoLR", []>, PPC970_Unit_BRU; let Defs = [LR] in - def MoveGOTtoLR : Pseudo<(outs), (ins), "#MoveGOTtoLR", []>, + def MoveGOTtoLR : PPCEmitTimePseudo<(outs), (ins), "#MoveGOTtoLR", []>, PPC970_Unit_BRU; let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { @@ -1512,19 +1511,19 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { } let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in -def TCRETURNdi :Pseudo< (outs), +def TCRETURNdi :PPCEmitTimePseudo< (outs), (ins calltarget:$dst, i32imm:$offset), "#TC_RETURNd $dst $offset", []>; let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in -def TCRETURNai :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset), +def TCRETURNai :PPCEmitTimePseudo<(outs), (ins abscalltarget:$func, i32imm:$offset), "#TC_RETURNa $func $offset", [(PPCtc_return (i32 imm:$func), imm:$offset)]>; let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in -def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset), +def TCRETURNri : PPCEmitTimePseudo<(outs), (ins CTRRC:$dst, i32imm:$offset), "#TC_RETURNr $dst $offset", []>; @@ -1550,14 +1549,19 @@ def TAILBA : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst), } -let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { +// While longjmp is a control-flow barrier (fallthrough isn't allowed), setjmp +// is not. +let hasSideEffects = 1 in { let Defs = [CTR] in - def EH_SjLj_SetJmp32 : Pseudo<(outs gprc:$dst), (ins memr:$buf), + def EH_SjLj_SetJmp32 : PPCCustomInserterPseudo<(outs gprc:$dst), (ins memr:$buf), "#EH_SJLJ_SETJMP32", [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>, Requires<[In32BitMode]>; +} + +let hasSideEffects = 1, isBarrier = 1 in { let isTerminator = 1 in - def EH_SjLj_LongJmp32 : Pseudo<(outs), (ins memr:$buf), + def EH_SjLj_LongJmp32 : PPCCustomInserterPseudo<(outs), (ins memr:$buf), "#EH_SJLJ_LONGJMP32", [(PPCeh_sjlj_longjmp addr:$buf)]>, Requires<[In32BitMode]>; @@ -1567,7 +1571,7 @@ let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { // a terminator. Size is set to 0 to prevent the builtin assembler // from emitting it. let isBranch = 1, isTerminator = 1, Size = 0 in { - def EH_SjLj_Setup : Pseudo<(outs), (ins directbrtarget:$dst), + def EH_SjLj_Setup : PPCEmitTimePseudo<(outs), (ins directbrtarget:$dst), "#EH_SjLj_Setup\t$dst", []>; } @@ -1654,119 +1658,117 @@ def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)), // clean this up in PPCMIPeephole with calls to // PPCInstrInfo::convertToImmediateForm() but we should probably not emit them // in the first place. -let usesCustomInserter = 1 in { - let Defs = [CR0] in { - def ATOMIC_LOAD_ADD_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8", - [(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_SUB_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8", - [(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_AND_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8", - [(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_OR_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8", - [(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_XOR_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "ATOMIC_LOAD_XOR_I8", - [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_NAND_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8", - [(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_MIN_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I8", - [(set i32:$dst, (atomic_load_min_8 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_MAX_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I8", - [(set i32:$dst, (atomic_load_max_8 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_UMIN_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I8", - [(set i32:$dst, (atomic_load_umin_8 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_UMAX_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I8", - [(set i32:$dst, (atomic_load_umax_8 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_ADD_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16", - [(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_SUB_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16", - [(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_AND_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16", - [(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_OR_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16", - [(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_XOR_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16", - [(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_NAND_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16", - [(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_MIN_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I16", - [(set i32:$dst, (atomic_load_min_16 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_MAX_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I16", - [(set i32:$dst, (atomic_load_max_16 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_UMIN_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I16", - [(set i32:$dst, (atomic_load_umin_16 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_UMAX_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I16", - [(set i32:$dst, (atomic_load_umax_16 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_ADD_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32", - [(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_SUB_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32", - [(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_AND_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32", - [(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_OR_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32", - [(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_XOR_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32", - [(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_NAND_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32", - [(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_MIN_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I32", - [(set i32:$dst, (atomic_load_min_32 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_MAX_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I32", - [(set i32:$dst, (atomic_load_max_32 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_UMIN_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I32", - [(set i32:$dst, (atomic_load_umin_32 xoaddr:$ptr, i32:$incr))]>; - def ATOMIC_LOAD_UMAX_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I32", - [(set i32:$dst, (atomic_load_umax_32 xoaddr:$ptr, i32:$incr))]>; - - def ATOMIC_CMP_SWAP_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8", - [(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>; - def ATOMIC_CMP_SWAP_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new", - [(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>; - def ATOMIC_CMP_SWAP_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new", - [(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>; - - def ATOMIC_SWAP_I8 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_i8", - [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>; - def ATOMIC_SWAP_I16 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16", - [(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>; - def ATOMIC_SWAP_I32 : Pseudo< - (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32", - [(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>; - } +let Defs = [CR0] in { + def ATOMIC_LOAD_ADD_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8", + [(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_SUB_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8", + [(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_AND_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8", + [(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_OR_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8", + [(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_XOR_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "ATOMIC_LOAD_XOR_I8", + [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_NAND_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8", + [(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_MIN_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I8", + [(set i32:$dst, (atomic_load_min_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_MAX_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I8", + [(set i32:$dst, (atomic_load_max_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_UMIN_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I8", + [(set i32:$dst, (atomic_load_umin_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_UMAX_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I8", + [(set i32:$dst, (atomic_load_umax_8 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_ADD_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16", + [(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_SUB_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16", + [(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_AND_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16", + [(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_OR_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16", + [(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_XOR_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16", + [(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_NAND_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16", + [(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_MIN_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I16", + [(set i32:$dst, (atomic_load_min_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_MAX_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I16", + [(set i32:$dst, (atomic_load_max_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_UMIN_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I16", + [(set i32:$dst, (atomic_load_umin_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_UMAX_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I16", + [(set i32:$dst, (atomic_load_umax_16 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_ADD_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32", + [(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_SUB_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32", + [(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_AND_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32", + [(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_OR_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32", + [(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_XOR_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32", + [(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_NAND_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32", + [(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_MIN_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I32", + [(set i32:$dst, (atomic_load_min_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_MAX_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I32", + [(set i32:$dst, (atomic_load_max_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_UMIN_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I32", + [(set i32:$dst, (atomic_load_umin_32 xoaddr:$ptr, i32:$incr))]>; + def ATOMIC_LOAD_UMAX_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I32", + [(set i32:$dst, (atomic_load_umax_32 xoaddr:$ptr, i32:$incr))]>; + + def ATOMIC_CMP_SWAP_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8", + [(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>; + def ATOMIC_CMP_SWAP_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new", + [(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>; + def ATOMIC_CMP_SWAP_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new", + [(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>; + + def ATOMIC_SWAP_I8 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_i8", + [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>; + def ATOMIC_SWAP_I16 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16", + [(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>; + def ATOMIC_SWAP_I32 : PPCCustomInserterPseudo< + (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32", + [(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>; } def : Pat<(PPCatomicCmpSwap_8 xoaddr:$ptr, i32:$old, i32:$new), @@ -1994,15 +1996,15 @@ def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src), // Unindexed (r+i) Stores. let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { -def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$src), - "stb $rS, $src", IIC_LdStStore, - [(truncstorei8 i32:$rS, iaddr:$src)]>; -def STH : DForm_1<44, (outs), (ins gprc:$rS, memri:$src), - "sth $rS, $src", IIC_LdStStore, - [(truncstorei16 i32:$rS, iaddr:$src)]>; -def STW : DForm_1<36, (outs), (ins gprc:$rS, memri:$src), - "stw $rS, $src", IIC_LdStStore, - [(store i32:$rS, iaddr:$src)]>; +def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$dst), + "stb $rS, $dst", IIC_LdStStore, + [(truncstorei8 i32:$rS, iaddr:$dst)]>; +def STH : DForm_1<44, (outs), (ins gprc:$rS, memri:$dst), + "sth $rS, $dst", IIC_LdStStore, + [(truncstorei16 i32:$rS, iaddr:$dst)]>; +def STW : DForm_1<36, (outs), (ins gprc:$rS, memri:$dst), + "stw $rS, $dst", IIC_LdStStore, + [(store i32:$rS, iaddr:$dst)]>; let Predicates = [HasFPU] in { def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst), "stfs $rS, $dst", IIC_LdStSTFD, @@ -2016,13 +2018,13 @@ def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst), // Unindexed (r+i) Stores with Update (preinc). let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst), - "stbu $rS, $dst", IIC_LdStStoreUpd, []>, + "stbu $rS, $dst", IIC_LdStSTU, []>, RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; def STHU : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst), - "sthu $rS, $dst", IIC_LdStStoreUpd, []>, + "sthu $rS, $dst", IIC_LdStSTU, []>, RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; def STWU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst), - "stwu $rS, $dst", IIC_LdStStoreUpd, []>, + "stwu $rS, $dst", IIC_LdStSTU, []>, RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; let Predicates = [HasFPU] in { def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst), @@ -2090,19 +2092,19 @@ def STFDX : XForm_28_memOp<31, 727, (outs), (ins f8rc:$frS, memrr:$dst), let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { def STBUX : XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst), - "stbux $rS, $dst", IIC_LdStStoreUpd, []>, + "stbux $rS, $dst", IIC_LdStSTUX, []>, RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, PPC970_DGroup_Cracked; def STHUX : XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst), - "sthux $rS, $dst", IIC_LdStStoreUpd, []>, + "sthux $rS, $dst", IIC_LdStSTUX, []>, RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, PPC970_DGroup_Cracked; def STWUX : XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst), - "stwux $rS, $dst", IIC_LdStStoreUpd, []>, + "stwux $rS, $dst", IIC_LdStSTUX, []>, RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, PPC970_DGroup_Cracked; @@ -2549,8 +2551,8 @@ def MTPMR : XFXForm_1<31, 462, (outs), (ins i32imm:$SPR, gprc:$RT), // A pseudo-instruction used to implement the read of the 64-bit cycle counter // on a 32-bit target. -let hasSideEffects = 1, usesCustomInserter = 1 in -def ReadTB : Pseudo<(outs gprc:$lo, gprc:$hi), (ins), +let hasSideEffects = 1 in +def ReadTB : PPCCustomInserterPseudo<(outs gprc:$lo, gprc:$hi), (ins), "#ReadTB", []>; let Uses = [CTR] in { @@ -2609,13 +2611,13 @@ def : InstAlias<"mfvrsave $rS", (MFVRSAVE gprc:$rS)>; // SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register, // so we'll need to scavenge a register for it. let mayStore = 1 in -def SPILL_VRSAVE : Pseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F), +def SPILL_VRSAVE : PPCEmitTimePseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F), "#SPILL_VRSAVE", []>; // RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously // spilled), so we'll need to scavenge a register for it. let mayLoad = 1 in -def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F), +def RESTORE_VRSAVE : PPCEmitTimePseudo<(outs VRSAVERC:$vrsave), (ins memri:$F), "#RESTORE_VRSAVE", []>; let hasSideEffects = 0 in { @@ -2654,9 +2656,9 @@ def MCRXRX : X_BF3<31, 576, (outs crrc:$BF), (ins), } // hasSideEffects = 0 let Predicates = [HasFPU] in { -// Pseudo instruction to perform FADD in round-to-zero mode. -let usesCustomInserter = 1, Uses = [RM] in { - def FADDrtz: Pseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "", +// Custom inserter instruction to perform FADD in round-to-zero mode. +let Uses = [RM] in { + def FADDrtz: PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "", [(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>; } @@ -3028,23 +3030,23 @@ def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)), (ADDIS $in, tblockaddress:$g)>; // Support for thread-local storage. -def PPC32GOT: Pseudo<(outs gprc:$rD), (ins), "#PPC32GOT", +def PPC32GOT: PPCEmitTimePseudo<(outs gprc:$rD), (ins), "#PPC32GOT", [(set i32:$rD, (PPCppc32GOT))]>; // Get the _GLOBAL_OFFSET_TABLE_ in PIC mode. // This uses two output registers, the first as the real output, the second as a // temporary register, used internally in code generation. -def PPC32PICGOT: Pseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT", +def PPC32PICGOT: PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT", []>, NoEncode<"$rT">; -def LDgotTprelL32: Pseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg), +def LDgotTprelL32: PPCEmitTimePseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg), "#LDgotTprelL32", [(set i32:$rD, (PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>; def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g), (ADD4TLS $in, tglobaltlsaddr:$g)>; -def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), +def ADDItlsgdL32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), "#ADDItlsgdL32", [(set i32:$rD, (PPCaddiTlsgdL i32:$reg, tglobaltlsaddr:$disp))]>; @@ -3052,7 +3054,7 @@ def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), // explicitly defined when this op is created, so not mentioned here. let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in -def GETtlsADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym), +def GETtlsADDR32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym), "GETtlsADDR32", [(set i32:$rD, (PPCgetTlsAddr i32:$reg, tglobaltlsaddr:$sym))]>; @@ -3060,14 +3062,14 @@ def GETtlsADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym), // are true defines while the rest of the Defs are clobbers. let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in -def ADDItlsgdLADDR32 : Pseudo<(outs gprc:$rD), +def ADDItlsgdLADDR32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym), "#ADDItlsgdLADDR32", [(set i32:$rD, (PPCaddiTlsgdLAddr i32:$reg, tglobaltlsaddr:$disp, tglobaltlsaddr:$sym))]>; -def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), +def ADDItlsldL32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), "#ADDItlsldL32", [(set i32:$rD, (PPCaddiTlsldL i32:$reg, tglobaltlsaddr:$disp))]>; @@ -3075,7 +3077,7 @@ def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), // explicitly defined when this op is created, so not mentioned here. let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in -def GETtlsldADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym), +def GETtlsldADDR32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym), "GETtlsldADDR32", [(set i32:$rD, (PPCgetTlsldAddr i32:$reg, @@ -3084,31 +3086,31 @@ def GETtlsldADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym), // are true defines while the rest of the Defs are clobbers. let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in -def ADDItlsldLADDR32 : Pseudo<(outs gprc:$rD), +def ADDItlsldLADDR32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym), "#ADDItlsldLADDR32", [(set i32:$rD, (PPCaddiTlsldLAddr i32:$reg, tglobaltlsaddr:$disp, tglobaltlsaddr:$sym))]>; -def ADDIdtprelL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), +def ADDIdtprelL32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), "#ADDIdtprelL32", [(set i32:$rD, (PPCaddiDtprelL i32:$reg, tglobaltlsaddr:$disp))]>; -def ADDISdtprelHA32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), +def ADDISdtprelHA32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), "#ADDISdtprelHA32", [(set i32:$rD, (PPCaddisDtprelHA i32:$reg, tglobaltlsaddr:$disp))]>; // Support for Position-independent code -def LWZtoc : Pseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg), +def LWZtoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg), "#LWZtoc", [(set i32:$rD, (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>; // Get Global (GOT) Base Register offset, from the word immediately preceding // the function label. -def UpdateGBR : Pseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>; +def UpdateGBR : PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>; // Standard shifts. These are represented separately from the real shifts above @@ -3936,21 +3938,19 @@ def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGT)), def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETNE)), (SELECT_VRRC (CRXOR $lhs, $rhs), $tval, $fval)>; -let usesCustomInserter = 1 in { -def ANDIo_1_EQ_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in), +def ANDIo_1_EQ_BIT : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins gprc:$in), "#ANDIo_1_EQ_BIT", [(set i1:$dst, (trunc (not i32:$in)))]>; -def ANDIo_1_GT_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in), +def ANDIo_1_GT_BIT : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins gprc:$in), "#ANDIo_1_GT_BIT", [(set i1:$dst, (trunc i32:$in))]>; -def ANDIo_1_EQ_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in), +def ANDIo_1_EQ_BIT8 : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins g8rc:$in), "#ANDIo_1_EQ_BIT8", [(set i1:$dst, (trunc (not i64:$in)))]>; -def ANDIo_1_GT_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in), +def ANDIo_1_GT_BIT8 : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins g8rc:$in), "#ANDIo_1_GT_BIT8", [(set i1:$dst, (trunc i64:$in))]>; -} def : Pat<(i1 (not (trunc i32:$in))), (ANDIo_1_EQ_BIT $in)>; diff --git a/lib/Target/PowerPC/PPCInstrQPX.td b/lib/Target/PowerPC/PPCInstrQPX.td index c4bb02695b360ffb4989dd81c9a8159030d87856..ef589ad01fd7fa9e496178e7fc83be6bd887ff6f 100644 --- a/lib/Target/PowerPC/PPCInstrQPX.td +++ b/lib/Target/PowerPC/PPCInstrQPX.td @@ -245,32 +245,30 @@ let Uses = [RM] in { // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. - let usesCustomInserter = 1 in { - def SELECT_CC_QFRC: Pseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F, - i32imm:$BROPC), "#SELECT_CC_QFRC", - []>; - def SELECT_CC_QSRC: Pseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F, - i32imm:$BROPC), "#SELECT_CC_QSRC", - []>; - def SELECT_CC_QBRC: Pseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F, - i32imm:$BROPC), "#SELECT_CC_QBRC", - []>; - - // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition - // register bit directly. - def SELECT_QFRC: Pseudo<(outs qfrc:$dst), (ins crbitrc:$cond, - qfrc:$T, qfrc:$F), "#SELECT_QFRC", - [(set v4f64:$dst, - (select i1:$cond, v4f64:$T, v4f64:$F))]>; - def SELECT_QSRC: Pseudo<(outs qsrc:$dst), (ins crbitrc:$cond, - qsrc:$T, qsrc:$F), "#SELECT_QSRC", - [(set v4f32:$dst, - (select i1:$cond, v4f32:$T, v4f32:$F))]>; - def SELECT_QBRC: Pseudo<(outs qbrc:$dst), (ins crbitrc:$cond, - qbrc:$T, qbrc:$F), "#SELECT_QBRC", - [(set v4i1:$dst, - (select i1:$cond, v4i1:$T, v4i1:$F))]>; - } + def SELECT_CC_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F, + i32imm:$BROPC), "#SELECT_CC_QFRC", + []>; + def SELECT_CC_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F, + i32imm:$BROPC), "#SELECT_CC_QSRC", + []>; + def SELECT_CC_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F, + i32imm:$BROPC), "#SELECT_CC_QBRC", + []>; + + // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition + // register bit directly. + def SELECT_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crbitrc:$cond, + qfrc:$T, qfrc:$F), "#SELECT_QFRC", + [(set v4f64:$dst, + (select i1:$cond, v4f64:$T, v4f64:$F))]>; + def SELECT_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crbitrc:$cond, + qsrc:$T, qsrc:$F), "#SELECT_QSRC", + [(set v4f32:$dst, + (select i1:$cond, v4f32:$T, v4f32:$F))]>; + def SELECT_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crbitrc:$cond, + qbrc:$T, qbrc:$F), "#SELECT_QBRC", + [(set v4i1:$dst, + (select i1:$cond, v4i1:$T, v4i1:$F))]>; // Convert and Round Instructions def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>; diff --git a/lib/Target/PowerPC/PPCInstrSPE.td b/lib/Target/PowerPC/PPCInstrSPE.td index 96649efdc1bc7152d89ac3e3e99ba283a8fa9a76..9f5891a45f22346e6b4e63e8d91538c131bd4cc8 100644 --- a/lib/Target/PowerPC/PPCInstrSPE.td +++ b/lib/Target/PowerPC/PPCInstrSPE.td @@ -831,22 +831,20 @@ def : Pat<(f64 (fpextend f32:$src)), } let Predicates = [HasSPE] in { - let usesCustomInserter = 1 in { -def SELECT_CC_SPE4 : Pseudo<(outs spe4rc:$dst), +def SELECT_CC_SPE4 : PPCCustomInserterPseudo<(outs spe4rc:$dst), (ins crrc:$cond, spe4rc:$T, spe4rc:$F, i32imm:$BROPC), "#SELECT_CC_SPE4", []>; -def SELECT_CC_SPE : Pseudo<(outs sperc:$dst), +def SELECT_CC_SPE : PPCCustomInserterPseudo<(outs sperc:$dst), (ins crrc:$cond, sperc:$T, sperc:$F, i32imm:$BROPC), "#SELECT_CC_SPE", []>; -def SELECT_SPE4 : Pseudo<(outs spe4rc:$dst), (ins crbitrc:$cond, +def SELECT_SPE4 : PPCCustomInserterPseudo<(outs spe4rc:$dst), (ins crbitrc:$cond, spe4rc:$T, spe4rc:$F), "#SELECT_SPE4", [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>; -def SELECT_SPE : Pseudo<(outs sperc:$dst), (ins crbitrc:$cond, +def SELECT_SPE : PPCCustomInserterPseudo<(outs sperc:$dst), (ins crbitrc:$cond, sperc:$T, sperc:$F), "#SELECT_SPE", [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>; - } def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)), (SELECT_SPE4 (CRANDC $lhs, $rhs), $tval, $fval)>; diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 9d462df6fef5f41deb24716b38fdbad8b3ec8b47..8321d7f2ecf06d0640d7147316f56cfb3cbc3941 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -67,6 +67,10 @@ def SDT_PPCxxswapd : SDTypeProfile<1, 1, [ def SDTVecConv : SDTypeProfile<1, 2, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2> ]>; +def SDTVabsd : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32> +]>; + def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; @@ -79,6 +83,7 @@ def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>; def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>; def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>; def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>; +def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>; multiclass XX3Form_Rcr opcode, bits<7> xo, string asmbase, string asmstr, InstrItinClass itin, Intrinsic Int, @@ -132,7 +137,7 @@ let Uses = [RM] in { []>; // Pseudo instruction XFLOADf64 will be expanded to LXSDX or LFDX later - let isPseudo = 1, CodeSize = 3 in + let CodeSize = 3 in def XFLOADf64 : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src), "#XFLOADf64", [(set f64:$XT, (load xoaddr:$src))]>; @@ -163,7 +168,7 @@ let Uses = [RM] in { []>; // Pseudo instruction XFSTOREf64 will be expanded to STXSDX or STFDX later - let isPseudo = 1, CodeSize = 3 in + let CodeSize = 3 in def XFSTOREf64 : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst), "#XFSTOREf64", [(store f64:$XT, xoaddr:$dst)]>; @@ -898,37 +903,36 @@ let Uses = [RM] in { // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. -let usesCustomInserter = 1, // Expanded after instruction selection. - PPC970_Single = 1 in { +let PPC970_Single = 1 in { - def SELECT_CC_VSRC: Pseudo<(outs vsrc:$dst), + def SELECT_CC_VSRC: PPCCustomInserterPseudo<(outs vsrc:$dst), (ins crrc:$cond, vsrc:$T, vsrc:$F, i32imm:$BROPC), "#SELECT_CC_VSRC", []>; - def SELECT_VSRC: Pseudo<(outs vsrc:$dst), + def SELECT_VSRC: PPCCustomInserterPseudo<(outs vsrc:$dst), (ins crbitrc:$cond, vsrc:$T, vsrc:$F), "#SELECT_VSRC", [(set v2f64:$dst, (select i1:$cond, v2f64:$T, v2f64:$F))]>; - def SELECT_CC_VSFRC: Pseudo<(outs f8rc:$dst), + def SELECT_CC_VSFRC: PPCCustomInserterPseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F, i32imm:$BROPC), "#SELECT_CC_VSFRC", []>; - def SELECT_VSFRC: Pseudo<(outs f8rc:$dst), + def SELECT_VSFRC: PPCCustomInserterPseudo<(outs f8rc:$dst), (ins crbitrc:$cond, f8rc:$T, f8rc:$F), "#SELECT_VSFRC", [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>; - def SELECT_CC_VSSRC: Pseudo<(outs f4rc:$dst), + def SELECT_CC_VSSRC: PPCCustomInserterPseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F, i32imm:$BROPC), "#SELECT_CC_VSSRC", []>; - def SELECT_VSSRC: Pseudo<(outs f4rc:$dst), + def SELECT_VSSRC: PPCCustomInserterPseudo<(outs f4rc:$dst), (ins crbitrc:$cond, f4rc:$T, f4rc:$F), "#SELECT_VSSRC", [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>; -} // usesCustomInserter +} } // AddedComplexity def : InstAlias<"xvmovdp $XT, $XB", @@ -1152,6 +1156,26 @@ def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A), def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A), (XVRSQRTEDP $A)>; +// Vector selection +def : Pat<(v16i8 (vselect v16i8:$vA, v16i8:$vB, v16i8:$vC)), + (COPY_TO_REGCLASS + (XXSEL (COPY_TO_REGCLASS $vC, VSRC), + (COPY_TO_REGCLASS $vB, VSRC), + (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>; +def : Pat<(v8i16 (vselect v8i16:$vA, v8i16:$vB, v8i16:$vC)), + (COPY_TO_REGCLASS + (XXSEL (COPY_TO_REGCLASS $vC, VSRC), + (COPY_TO_REGCLASS $vB, VSRC), + (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>; +def : Pat<(vselect v4i32:$vA, v4i32:$vB, v4i32:$vC), + (XXSEL $vC, $vB, $vA)>; +def : Pat<(vselect v2i64:$vA, v2i64:$vB, v2i64:$vC), + (XXSEL $vC, $vB, $vA)>; +def : Pat<(vselect v4i32:$vA, v4f32:$vB, v4f32:$vC), + (XXSEL $vC, $vB, $vA)>; +def : Pat<(vselect v2i64:$vA, v2f64:$vB, v2f64:$vC), + (XXSEL $vC, $vB, $vA)>; + let Predicates = [IsLittleEndian] in { def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))), (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; @@ -1234,23 +1258,19 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. def LXSIWZX : XX1Form_memOp<31, 12, (outs vsfrc:$XT), (ins memrr:$src), "lxsiwzx $XT, $src", IIC_LdStLFD, []>; - // Please note let isPseudo = 1 is not part of class Pseudo<>. Missing it - // would cause these Pseudos are not expanded in expandPostRAPseudos() - let isPseudo = 1 in { - // Pseudo instruction XFLOADf32 will be expanded to LXSSPX or LFSX later - let CodeSize = 3 in - def XFLOADf32 : PseudoXFormMemOp<(outs vssrc:$XT), (ins memrr:$src), - "#XFLOADf32", - [(set f32:$XT, (load xoaddr:$src))]>; - // Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later - def LIWAX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src), - "#LIWAX", - [(set f64:$XT, (PPClfiwax xoaddr:$src))]>; - // Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later - def LIWZX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src), - "#LIWZX", - [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>; - } + // Pseudo instruction XFLOADf32 will be expanded to LXSSPX or LFSX later + let CodeSize = 3 in + def XFLOADf32 : PseudoXFormMemOp<(outs vssrc:$XT), (ins memrr:$src), + "#XFLOADf32", + [(set f32:$XT, (load xoaddr:$src))]>; + // Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later + def LIWAX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src), + "#LIWAX", + [(set f64:$XT, (PPClfiwax xoaddr:$src))]>; + // Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later + def LIWZX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src), + "#LIWZX", + [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>; } // mayLoad // VSX scalar stores introduced in ISA 2.07 @@ -1261,19 +1281,15 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. def STXSIWX : XX1Form_memOp<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst), "stxsiwx $XT, $dst", IIC_LdStSTFD, []>; - // Please note let isPseudo = 1 is not part of class Pseudo<>. Missing it - // would cause these Pseudos are not expanded in expandPostRAPseudos() - let isPseudo = 1 in { - // Pseudo instruction XFSTOREf32 will be expanded to STXSSPX or STFSX later - let CodeSize = 3 in - def XFSTOREf32 : PseudoXFormMemOp<(outs), (ins vssrc:$XT, memrr:$dst), - "#XFSTOREf32", - [(store f32:$XT, xoaddr:$dst)]>; - // Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later - def STIWX : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst), - "#STIWX", - [(PPCstfiwx f64:$XT, xoaddr:$dst)]>; - } + // Pseudo instruction XFSTOREf32 will be expanded to STXSSPX or STFSX later + let CodeSize = 3 in + def XFSTOREf32 : PseudoXFormMemOp<(outs), (ins vssrc:$XT, memrr:$dst), + "#XFSTOREf32", + [(store f32:$XT, xoaddr:$dst)]>; + // Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later + def STIWX : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst), + "#STIWX", + [(PPCstfiwx f64:$XT, xoaddr:$dst)]>; } // mayStore } // UseVSXReg = 1 @@ -3242,20 +3258,19 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { def : Pat<(f64 (PPCVexts f64:$A, 2)), (f64 (COPY_TO_REGCLASS (VEXTSH2Ds $A), VSFRC))>; - let isPseudo = 1 in { - def DFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrix:$src), - "#DFLOADf32", - [(set f32:$XT, (load ixaddr:$src))]>; - def DFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrix:$src), - "#DFLOADf64", - [(set f64:$XT, (load ixaddr:$src))]>; - def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst), - "#DFSTOREf32", - [(store f32:$XT, ixaddr:$dst)]>; - def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst), - "#DFSTOREf64", - [(store f64:$XT, ixaddr:$dst)]>; - } + def DFLOADf32 : PPCPostRAExpPseudo<(outs vssrc:$XT), (ins memrix:$src), + "#DFLOADf32", + [(set f32:$XT, (load ixaddr:$src))]>; + def DFLOADf64 : PPCPostRAExpPseudo<(outs vsfrc:$XT), (ins memrix:$src), + "#DFLOADf64", + [(set f64:$XT, (load ixaddr:$src))]>; + def DFSTOREf32 : PPCPostRAExpPseudo<(outs), (ins vssrc:$XT, memrix:$dst), + "#DFSTOREf32", + [(store f32:$XT, ixaddr:$dst)]>; + def DFSTOREf64 : PPCPostRAExpPseudo<(outs), (ins vsfrc:$XT, memrix:$dst), + "#DFSTOREf64", + [(store f64:$XT, ixaddr:$dst)]>; + def : Pat<(f64 (extloadf32 ixaddr:$src)), (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>; def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))), @@ -3537,22 +3552,20 @@ let AddedComplexity = 400 in { } let Predicates = [HasP9Vector] in { - let isPseudo = 1 in { - let mayStore = 1 in { - def SPILLTOVSR_STX : PseudoXFormMemOp<(outs), - (ins spilltovsrrc:$XT, memrr:$dst), - "#SPILLTOVSR_STX", []>; - def SPILLTOVSR_ST : Pseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst), - "#SPILLTOVSR_ST", []>; - } - let mayLoad = 1 in { - def SPILLTOVSR_LDX : PseudoXFormMemOp<(outs spilltovsrrc:$XT), - (ins memrr:$src), - "#SPILLTOVSR_LDX", []>; - def SPILLTOVSR_LD : Pseudo<(outs spilltovsrrc:$XT), (ins memrix:$src), - "#SPILLTOVSR_LD", []>; + let mayStore = 1 in { + def SPILLTOVSR_STX : PseudoXFormMemOp<(outs), + (ins spilltovsrrc:$XT, memrr:$dst), + "#SPILLTOVSR_STX", []>; + def SPILLTOVSR_ST : PPCPostRAExpPseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst), + "#SPILLTOVSR_ST", []>; + } + let mayLoad = 1 in { + def SPILLTOVSR_LDX : PseudoXFormMemOp<(outs spilltovsrrc:$XT), + (ins memrr:$src), + "#SPILLTOVSR_LDX", []>; + def SPILLTOVSR_LD : PPCPostRAExpPseudo<(outs spilltovsrrc:$XT), (ins memrix:$src), + "#SPILLTOVSR_LD", []>; - } } } // Integer extend helper dags 32 -> 64 @@ -4009,3 +4022,21 @@ let AddedComplexity = 400 in { } } +// Put this P9Altivec related definition here since it's possible to be +// selected to VSX instruction xvnegsp, avoid possible undef. +let Predicates = [HasP9Altivec] in { + + def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 0))), + (v4i32 (VABSDUW $A, $B))>; + + def : Pat<(v8i16 (PPCvabsd v8i16:$A, v8i16:$B, (i32 0))), + (v8i16 (VABSDUH $A, $B))>; + + def : Pat<(v16i8 (PPCvabsd v16i8:$A, v16i8:$B, (i32 0))), + (v16i8 (VABSDUB $A, $B))>; + + // As PPCVABSD description, the last operand indicates whether do the + // sign bit flip. + def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 1))), + (v4i32 (VABSDUW (XVNEGSP $A), (XVNEGSP $B)))>; +} diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/PowerPC/PPCPfmCounters.td similarity index 51% rename from lib/Target/AMDGPU/AMDGPUIntrinsics.td rename to lib/Target/PowerPC/PPCPfmCounters.td index 230a04628504784b9892308053065c613963a57f..d2a09f30c0f3e58a826ee8645e7617137d30e3e1 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/lib/Target/PowerPC/PPCPfmCounters.td @@ -1,4 +1,4 @@ -//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// +//===-- PPCPfmCounters.td - PPC Hardware Counters ----------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -7,10 +7,13 @@ // //===----------------------------------------------------------------------===// // -// This file defines intrinsics that are used by all hw codegen targets. +// This describes the available hardware counters for PPC. // //===----------------------------------------------------------------------===// -let TargetPrefix = "AMDGPU", isTarget = 1 in { - def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; +def CpuCyclesPfmCounter : PfmCounter<"CYCLES">; + +def DefaultPfmCounters : ProcPfmCounters { + let CycleCounter = CpuCyclesPfmCounter; } +def : PfmCountersDefaultBinding; diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td index 5ad0a517c1175db1480236b9b4f256fb1c91e983..c8fe7d7eea78382c61949789d28cedd11862742f 100644 --- a/lib/Target/PowerPC/PPCSchedule.td +++ b/lib/Target/PowerPC/PPCSchedule.td @@ -42,7 +42,6 @@ def IIC_LdStLoad : InstrItinClass; def IIC_LdStLoadUpd : InstrItinClass; def IIC_LdStLoadUpdX : InstrItinClass; def IIC_LdStStore : InstrItinClass; -def IIC_LdStStoreUpd : InstrItinClass; def IIC_LdStDSS : InstrItinClass; def IIC_LdStICBI : InstrItinClass; def IIC_LdStLD : InstrItinClass; @@ -63,8 +62,8 @@ def IIC_LdStSLBIA : InstrItinClass; def IIC_LdStSLBIE : InstrItinClass; def IIC_LdStSTD : InstrItinClass; def IIC_LdStSTDCX : InstrItinClass; -def IIC_LdStSTDU : InstrItinClass; -def IIC_LdStSTDUX : InstrItinClass; +def IIC_LdStSTU : InstrItinClass; +def IIC_LdStSTUX : InstrItinClass; def IIC_LdStSTFD : InstrItinClass; def IIC_LdStSTFDU : InstrItinClass; def IIC_LdStSTVEBX : InstrItinClass; diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td index 2455e5e52de5e385237c15040a8891ce1380e741..646822eedbe078560181d907f0516c5bb35a3fba 100644 --- a/lib/Target/PowerPC/PPCSchedule440.td +++ b/lib/Target/PowerPC/PPCSchedule440.td @@ -280,13 +280,6 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<2, [P440_LWB]>], [1, 1, 1], [NoBypass, P440_GPR_Bypass]>, - InstrItinData, - InstrStage<1, [P440_LRACC]>, - InstrStage<1, [P440_AGEN]>, - InstrStage<1, [P440_CRD]>, - InstrStage<2, [P440_LWB]>], - [2, 1, 1, 1], - [NoBypass, P440_GPR_Bypass]>, InstrItinData, InstrStage<1, [P440_LRACC]>, InstrStage<1, [P440_AGEN]>, @@ -373,14 +366,14 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<2, [P440_LWB]>], [4, 1, 1], [NoBypass, P440_GPR_Bypass]>, - InstrItinData, + InstrItinData, InstrStage<1, [P440_LRACC]>, InstrStage<1, [P440_AGEN]>, InstrStage<1, [P440_CRD]>, InstrStage<2, [P440_LWB]>], [2, 1, 1, 1], [NoBypass, P440_GPR_Bypass]>, - InstrItinData, + InstrItinData, InstrStage<1, [P440_LRACC]>, InstrStage<1, [P440_AGEN]>, InstrStage<1, [P440_CRD]>, diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td index 54cfae5d74b7bcf2a825d5a77386ee2ba4fa495d..f34c1accc0fd9d2446ab1861a795a359029a9e09 100644 --- a/lib/Target/PowerPC/PPCScheduleA2.td +++ b/lib/Target/PowerPC/PPCScheduleA2.td @@ -81,8 +81,6 @@ def PPCA2Itineraries : ProcessorItineraries< [6, 0, 0]>, InstrItinData], [0, 0, 0]>, - InstrItinData], - [2, 0, 0, 0]>, InstrItinData], [16, 0, 0]>, InstrItinData], @@ -105,9 +103,9 @@ def PPCA2Itineraries : ProcessorItineraries< [82, 0, 0]>, // L2 latency InstrItinData], [0, 0, 0]>, - InstrItinData], + InstrItinData], [2, 0, 0, 0]>, - InstrItinData], + InstrItinData], [2, 0, 0, 0]>, InstrItinData], [82, 0, 0]>, // L2 latency diff --git a/lib/Target/PowerPC/PPCScheduleE500.td b/lib/Target/PowerPC/PPCScheduleE500.td index d7c2bd15a2587eebca1cef6da2d5effe10ff4d7c..479a970b2537d2bfc3576ffa6d0295d97e1b769d 100644 --- a/lib/Target/PowerPC/PPCScheduleE500.td +++ b/lib/Target/PowerPC/PPCScheduleE500.td @@ -144,7 +144,13 @@ def PPCE500Itineraries : ProcessorItineraries< InstrStage<1, [E500_LSU_0]>], [6, 1], // Latency = 3 [NoBypass, E500_GPR_Bypass]>, - InstrItinData, + InstrItinData, + InstrStage<1, [E500_SU0, E500_SU1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData, InstrStage<1, [E500_SU0, E500_SU1], 0>, InstrStage<1, [E500_LSU_0]>], [6, 1], // Latency = 3 diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td index 5f95f2a79f669fe94edc3433c0f05ce5c036cb2e..d8bda073833f8840d81a9293074c74ea3d94845a 100644 --- a/lib/Target/PowerPC/PPCScheduleE500mc.td +++ b/lib/Target/PowerPC/PPCScheduleE500mc.td @@ -157,7 +157,13 @@ def PPCE500mcItineraries : ProcessorItineraries< InstrStage<1, [E500mc_LSU_0]>], [6, 1], // Latency = 3 [NoBypass, E500mc_GPR_Bypass]>, - InstrItinData, + InstrItinData, + InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>, + InstrStage<1, [E500mc_LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, E500mc_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData, InstrStage<1, [E500mc_SFX0, E500mc_SFX1], 0>, InstrStage<1, [E500mc_LSU_0]>], [6, 1], // Latency = 3 diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td index 32f8e652dd5681a60e3c5fc5b582224c8973ca4e..3e50803955c45b42be16ad2f78648d5cb07a60f8 100644 --- a/lib/Target/PowerPC/PPCScheduleE5500.td +++ b/lib/Target/PowerPC/PPCScheduleE5500.td @@ -206,12 +206,6 @@ def PPCE5500Itineraries : ProcessorItineraries< InstrStage<1, [E5500_LSU_0]>], [7, 2], // Latency = 3, Repeat rate = 1 [NoBypass, E5500_GPR_Bypass]>, - InstrItinData, - InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, - InstrStage<1, [E5500_LSU_0]>], - [7, 2], // Latency = 3, Repeat rate = 1 - [NoBypass, E5500_GPR_Bypass], - 2>, // 2 micro-ops InstrItinData, InstrStage<1, [E5500_LSU_0]>], [7, 2], // Latency = 3, Repeat rate = 1 @@ -281,13 +275,13 @@ def PPCE5500Itineraries : ProcessorItineraries< InstrStage<1, [E5500_LSU_0]>], [7, 2], // Latency = 3, Repeat rate = 1 [NoBypass, E5500_GPR_Bypass]>, - InstrItinData, + InstrItinData, InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, InstrStage<1, [E5500_LSU_0]>], [7, 2], // Latency = 3, Repeat rate = 1 [NoBypass, E5500_GPR_Bypass], 2>, // 2 micro-ops - InstrItinData, + InstrItinData, InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, InstrStage<1, [E5500_LSU_0]>], [7, 2], // Latency = 3, Repeat rate = 1 diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td index 21efd8f8f6c927bcfa9f2ac2a7ebad7c0c210135..0995b7200d93d08e0e6cd0ffa052724d0d22b7d7 100644 --- a/lib/Target/PowerPC/PPCScheduleG3.td +++ b/lib/Target/PowerPC/PPCScheduleG3.td @@ -43,7 +43,8 @@ def G3Itineraries : ProcessorItineraries< InstrItinData]>, InstrItinData]>, InstrItinData]>, - InstrItinData]>, + InstrItinData]>, + InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td index 340773ef787685982cc8c21d07a80e7be2da8db2..1b15c7b3c7ad487666e2b1999aeb20e03fb77d7b 100644 --- a/lib/Target/PowerPC/PPCScheduleG4.td +++ b/lib/Target/PowerPC/PPCScheduleG4.td @@ -48,7 +48,8 @@ def G4Itineraries : ProcessorItineraries< InstrItinData]>, InstrItinData]>, InstrItinData]>, - InstrItinData]>, + InstrItinData]>, + InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td index 1d9f13fcb8500748c6c968da44e7934e3295af9d..0044c3c6a4490800301a7abf21de3297f7a29b2e 100644 --- a/lib/Target/PowerPC/PPCScheduleG4Plus.td +++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td @@ -56,7 +56,6 @@ def G4PlusItineraries : ProcessorItineraries< InstrItinData]>, InstrItinData]>, InstrItinData]>, - InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, @@ -73,8 +72,8 @@ def G4PlusItineraries : ProcessorItineraries< InstrItinData]>, InstrItinData]>, InstrItinData]>, - InstrItinData]>, - InstrItinData]>, + InstrItinData]>, + InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td index b5a9f96d45aef9956c2b0e894a27462f8ec359e5..c802b80170fb770c352d44cae50a27b18be17629 100644 --- a/lib/Target/PowerPC/PPCScheduleG5.td +++ b/lib/Target/PowerPC/PPCScheduleG5.td @@ -54,7 +54,6 @@ def G5Itineraries : ProcessorItineraries< InstrItinData]>, InstrItinData]>, InstrItinData]>, - InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, @@ -76,8 +75,8 @@ def G5Itineraries : ProcessorItineraries< InstrItinData]>, // needs work InstrItinData]>, InstrItinData]>, - InstrItinData]>, - InstrItinData]>, + InstrItinData]>, + InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td index 541f290fadfc2f19ac945013848c671362161967..1d6e509819da5337ca9c8cfe172b1db9c9a7e225 100644 --- a/lib/Target/PowerPC/PPCScheduleP7.td +++ b/lib/Target/PowerPC/PPCScheduleP7.td @@ -261,13 +261,13 @@ def P7Itineraries : ProcessorItineraries< InstrStage<1, [P7_LS1, P7_LS2], 0>, InstrStage<1, [P7_FX1, P7_FX2]>], [1, 1, 1]>, - InstrItinData, + InstrItinData, InstrStage<1, [P7_DU2], 0>, InstrStage<1, [P7_LS1, P7_LS2], 0>, InstrStage<1, [P7_FX1, P7_FX2]>, InstrStage<1, [P7_FX1, P7_FX2]>], [2, 1, 1, 1]>, - InstrItinData, + InstrItinData, InstrStage<1, [P7_DU2], 0>, InstrStage<1, [P7_DU3], 0>, InstrStage<1, [P7_DU4], 0>, diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td index a3fa5ff2f3de866121b24b79439d1be71c064d2e..ff39dfda7016d3e8a6d977f4d88d111a919c8f47 100644 --- a/lib/Target/PowerPC/PPCScheduleP8.td +++ b/lib/Target/PowerPC/PPCScheduleP8.td @@ -267,14 +267,14 @@ def P8Itineraries : ProcessorItineraries< InstrStage<1, [P8_LU1, P8_LU2, P8_LSU1, P8_LSU2]>] [1, 1, 1]>, - InstrItinData, + InstrItinData, InstrStage<1, [P8_DU2], 0>, InstrStage<1, [P8_LU1, P8_LU2, P8_LSU1, P8_LSU2], 0>, InstrStage<1, [P8_FXU1, P8_FXU2]>], [2, 1, 1, 1]>, // First+last - InstrItinData, + InstrItinData, InstrStage<1, [P8_DU2], 0>, InstrStage<1, [P8_DU3], 0>, InstrStage<1, [P8_DU4], 0>, diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 34410393ef6f7125fde6d4e4809546c4dd14a90f..580d057602f5f711e2491a24ad32b4df7f07279d 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -214,19 +214,24 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, if (TT.isOSDarwin()) return Reloc::DynamicNoPIC; - // Non-darwin 64-bit platforms are PIC by default. - if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) + // Big Endian PPC is PIC by default. + if (TT.getArch() == Triple::ppc64) return Reloc::PIC_; - // 32-bit is static by default. + // Rest are static by default. return Reloc::Static; } -static CodeModel::Model getEffectiveCodeModel(const Triple &TT, - Optional CM, - bool JIT) { - if (CM) +static CodeModel::Model getEffectivePPCCodeModel(const Triple &TT, + Optional CM, + bool JIT) { + if (CM) { + if (*CM == CodeModel::Tiny) + report_fatal_error("Target does not support the tiny CodeModel"); + if (*CM == CodeModel::Kernel) + report_fatal_error("Target does not support the kernel CodeModel"); return *CM; + } if (!TT.isOSDarwin() && !JIT && (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le)) return CodeModel::Medium; @@ -246,7 +251,7 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU, computeFSAdditions(FS, OL, TT), Options, getEffectiveRelocModel(TT, RM), - getEffectiveCodeModel(TT, CM, JIT), OL), + getEffectivePPCCodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), TargetABI(computeTargetABI(TT, Options)) { initAsmInfo(); diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt index b4bf635dc2c75c4d6a77969b44f25dd51ecf7cec..7f5beca90455ac0508e11b551d29341356868cb8 100644 --- a/lib/Target/PowerPC/README.txt +++ b/lib/Target/PowerPC/README.txt @@ -663,3 +663,4 @@ and force instruction pairs to be scheduled together. More general handling of any_extend and zero_extend: See https://reviews.llvm.org/D24924#555306 + diff --git a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index b1f1eb404010ed713fb87853c0ef70eae4b610f4..4e70ea402d8c8db1e53c015b5920a881c2859f7c 100644 --- a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -7,12 +7,16 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/RISCVAsmBackend.h" #include "MCTargetDesc/RISCVMCExpr.h" #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "MCTargetDesc/RISCVTargetStreamer.h" #include "Utils/RISCVBaseInfo.h" +#include "Utils/RISCVMatInt.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -39,6 +43,8 @@ namespace { struct RISCVOperand; class RISCVAsmParser : public MCTargetAsmParser { + SmallVector FeatureBitStack; + SMLoc getLoc() const { return getParser().getTok().getLoc(); } bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); } @@ -115,6 +121,20 @@ class RISCVAsmParser : public MCTargetAsmParser { } } + void pushFeatureBits() { + FeatureBitStack.push_back(getSTI().getFeatureBits()); + } + + bool popFeatureBits() { + if (FeatureBitStack.empty()) + return true; + + FeatureBitset FeatureBits = FeatureBitStack.pop_back_val(); + copySTI().setFeatureBits(FeatureBits); + setAvailableFeatures(ComputeAvailableFeatures(FeatureBits)); + + return false; + } public: enum RISCVMatchResultTy { Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY, @@ -1164,6 +1184,21 @@ bool RISCVAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { + // Ensure that if the instruction occurs when relaxation is enabled, + // relocations are forced for the file. Ideally this would be done when there + // is enough information to reliably determine if the instruction itself may + // cause relaxations. Unfortunately instruction processing stage occurs in the + // same pass as relocation emission, so it's too late to set a 'sticky bit' + // for the entire file. + if (getSTI().getFeatureBits()[RISCV::FeatureRelax]) { + auto *Assembler = getTargetStreamer().getStreamer().getAssemblerPtr(); + if (Assembler != nullptr) { + RISCVAsmBackend &MAB = + static_cast(Assembler->getBackend()); + MAB.setForceRelocs(); + } + } + // First operand is token for instruction Operands.push_back(RISCVOperand::createToken(Name, NameLoc, isRV64())); @@ -1267,6 +1302,33 @@ bool RISCVAsmParser::parseDirectiveOption() { StringRef Option = Tok.getIdentifier(); + if (Option == "push") { + getTargetStreamer().emitDirectiveOptionPush(); + + Parser.Lex(); + if (Parser.getTok().isNot(AsmToken::EndOfStatement)) + return Error(Parser.getTok().getLoc(), + "unexpected token, expected end of statement"); + + pushFeatureBits(); + return false; + } + + if (Option == "pop") { + SMLoc StartLoc = Parser.getTok().getLoc(); + getTargetStreamer().emitDirectiveOptionPop(); + + Parser.Lex(); + if (Parser.getTok().isNot(AsmToken::EndOfStatement)) + return Error(Parser.getTok().getLoc(), + "unexpected token, expected end of statement"); + + if (popFeatureBits()) + return Error(StartLoc, ".option pop with no .option push"); + + return false; + } + if (Option == "rvc") { getTargetStreamer().emitDirectiveOptionRVC(); @@ -1291,9 +1353,34 @@ bool RISCVAsmParser::parseDirectiveOption() { return false; } + if (Option == "relax") { + getTargetStreamer().emitDirectiveOptionRelax(); + + Parser.Lex(); + if (Parser.getTok().isNot(AsmToken::EndOfStatement)) + return Error(Parser.getTok().getLoc(), + "unexpected token, expected end of statement"); + + setFeatureBits(RISCV::FeatureRelax, "relax"); + return false; + } + + if (Option == "norelax") { + getTargetStreamer().emitDirectiveOptionNoRelax(); + + Parser.Lex(); + if (Parser.getTok().isNot(AsmToken::EndOfStatement)) + return Error(Parser.getTok().getLoc(), + "unexpected token, expected end of statement"); + + clearFeatureBits(RISCV::FeatureRelax, "relax"); + return false; + } + // Unknown option. Warning(Parser.getTok().getLoc(), - "unknown option, expected 'rvc' or 'norvc'"); + "unknown option, expected 'push', 'pop', 'rvc', 'norvc', 'relax' or " + "'norelax'"); Parser.eatToEndOfStatement(); return false; } @@ -1307,80 +1394,23 @@ void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) { void RISCVAsmParser::emitLoadImm(unsigned DestReg, int64_t Value, MCStreamer &Out) { - if (isInt<32>(Value)) { - // Emits the MC instructions for loading a 32-bit constant into a register. - // - // Depending on the active bits in the immediate Value v, the following - // instruction sequences are emitted: - // - // v == 0 : ADDI(W) - // v[0,12) != 0 && v[12,32) == 0 : ADDI(W) - // v[0,12) == 0 && v[12,32) != 0 : LUI - // v[0,32) != 0 : LUI+ADDI(W) - // - int64_t Hi20 = ((Value + 0x800) >> 12) & 0xFFFFF; - int64_t Lo12 = SignExtend64<12>(Value); - unsigned SrcReg = RISCV::X0; - - if (Hi20) { - emitToStreamer(Out, - MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Hi20)); - SrcReg = DestReg; + RISCVMatInt::InstSeq Seq; + RISCVMatInt::generateInstSeq(Value, isRV64(), Seq); + + unsigned SrcReg = RISCV::X0; + for (RISCVMatInt::Inst &Inst : Seq) { + if (Inst.Opc == RISCV::LUI) { + emitToStreamer( + Out, MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Inst.Imm)); + } else { + emitToStreamer( + Out, MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addImm( + Inst.Imm)); } - if (Lo12 || Hi20 == 0) { - unsigned AddiOpcode = - STI->hasFeature(RISCV::Feature64Bit) ? RISCV::ADDIW : RISCV::ADDI; - emitToStreamer(Out, MCInstBuilder(AddiOpcode) - .addReg(DestReg) - .addReg(SrcReg) - .addImm(Lo12)); - } - return; - } - assert(STI->hasFeature(RISCV::Feature64Bit) && - "Target must be 64-bit to support a >32-bit constant"); - - // In the worst case, for a full 64-bit constant, a sequence of 8 instructions - // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emmitted. Note - // that the first two instructions (LUI+ADDIW) can contribute up to 32 bits - // while the following ADDI instructions contribute up to 12 bits each. - // - // On the first glance, implementing this seems to be possible by simply - // emitting the most significant 32 bits (LUI+ADDIW) followed by as many left - // shift (SLLI) and immediate additions (ADDI) as needed. However, due to the - // fact that ADDI performs a sign extended addition, doing it like that would - // only be possible when at most 11 bits of the ADDI instructions are used. - // Using all 12 bits of the ADDI instructions, like done by GAS, actually - // requires that the constant is processed starting with the least significant - // bit. - // - // In the following, constants are processed from LSB to MSB but instruction - // emission is performed from MSB to LSB by recursively calling - // emitLoadImm. In each recursion, first the lowest 12 bits are removed - // from the constant and the optimal shift amount, which can be greater than - // 12 bits if the constant is sparse, is determined. Then, the shifted - // remaining constant is processed recursively and gets emitted as soon as it - // fits into 32 bits. The emission of the shifts and additions is subsequently - // performed when the recursion returns. - // - int64_t Lo12 = SignExtend64<12>(Value); - int64_t Hi52 = (Value + 0x800) >> 12; - int ShiftAmount = 12 + findFirstSet((uint64_t)Hi52); - Hi52 = SignExtend64(Hi52 >> (ShiftAmount - 12), 64 - ShiftAmount); - - emitLoadImm(DestReg, Hi52, Out); - - emitToStreamer(Out, MCInstBuilder(RISCV::SLLI) - .addReg(DestReg) - .addReg(DestReg) - .addImm(ShiftAmount)); - - if (Lo12) - emitToStreamer(Out, MCInstBuilder(RISCV::ADDI) - .addReg(DestReg) - .addReg(DestReg) - .addImm(Lo12)); + // Only the first instruction has X0 as its source. + SrcReg = DestReg; + } } void RISCVAsmParser::emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc, diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 9ba7ebd0eb0f2a3938fc717653b8d4924209bc17..49239ac9099ee2d8c9c060b92e0fe1c8ae1055f4 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -7,115 +7,20 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/RISCVFixupKinds.h" -#include "MCTargetDesc/RISCVMCTargetDesc.h" +#include "RISCVAsmBackend.h" #include "llvm/ADT/APInt.h" -#include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; -namespace { -class RISCVAsmBackend : public MCAsmBackend { - const MCSubtargetInfo &STI; - uint8_t OSABI; - bool Is64Bit; - -public: - RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit) - : MCAsmBackend(support::little), STI(STI), OSABI(OSABI), - Is64Bit(Is64Bit) {} - ~RISCVAsmBackend() override {} - - // Generate diff expression relocations if the relax feature is enabled, - // otherwise it is safe for the assembler to calculate these internally. - bool requiresDiffExpressionRelocations() const override { - return STI.getFeatureBits()[RISCV::FeatureRelax]; - } - void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, MutableArrayRef Data, - uint64_t Value, bool IsResolved, - const MCSubtargetInfo *STI) const override; - - std::unique_ptr - createObjectTargetWriter() const override; - - // If linker relaxation is enabled, always emit relocations even if the fixup - // can be resolved. This is necessary for correctness as offsets may change - // during relaxation. - bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target) override { - return STI.getFeatureBits()[RISCV::FeatureRelax]; - } - - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const override { - llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced"); - } - - bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved, - uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout, - const bool WasForced) const override; - - unsigned getNumFixupKinds() const override { - return RISCV::NumTargetFixupKinds; - } - - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { - const static MCFixupKindInfo Infos[] = { - // This table *must* be in the order that the fixup_* kinds are defined in - // RISCVFixupKinds.h. - // - // name offset bits flags - { "fixup_riscv_hi20", 12, 20, 0 }, - { "fixup_riscv_lo12_i", 20, 12, 0 }, - { "fixup_riscv_lo12_s", 0, 32, 0 }, - { "fixup_riscv_pcrel_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_riscv_pcrel_lo12_i", 20, 12, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_riscv_pcrel_lo12_s", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_riscv_jal", 12, 20, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_riscv_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_riscv_rvc_jump", 2, 11, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_riscv_rvc_branch", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_riscv_call", 0, 64, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_riscv_relax", 0, 0, 0 } - }; - static_assert((array_lengthof(Infos)) == RISCV::NumTargetFixupKinds, - "Not all fixup kinds added to Infos array"); - - if (Kind < FirstTargetFixupKind) - return MCAsmBackend::getFixupKindInfo(Kind); - - assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && - "Invalid kind!"); - return Infos[Kind - FirstTargetFixupKind]; - } - - bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const override; - unsigned getRelaxedOpcode(unsigned Op) const; - - void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - MCInst &Res) const override; - - - bool writeNopData(raw_ostream &OS, uint64_t Count) const override; -}; - - bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved, uint64_t Value, @@ -348,8 +253,6 @@ RISCVAsmBackend::createObjectTargetWriter() const { return createRISCVELFObjectWriter(OSABI, Is64Bit); } -} // end anonymous namespace - MCAsmBackend *llvm::createRISCVAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h new file mode 100644 index 0000000000000000000000000000000000000000..5601f07f9267828c92f0d4e5a6651f840c4815ac --- /dev/null +++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h @@ -0,0 +1,118 @@ +//===-- RISCVAsmBackend.h - RISCV Assembler Backend -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVASMBACKEND_H +#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVASMBACKEND_H + +#include "MCTargetDesc/RISCVFixupKinds.h" +#include "MCTargetDesc/RISCVMCTargetDesc.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" + +namespace llvm { +class MCAssembler; +class MCObjectTargetWriter; +class raw_ostream; + +class RISCVAsmBackend : public MCAsmBackend { + const MCSubtargetInfo &STI; + uint8_t OSABI; + bool Is64Bit; + bool ForceRelocs = false; + +public: + RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit) + : MCAsmBackend(support::little), STI(STI), OSABI(OSABI), + Is64Bit(Is64Bit) {} + ~RISCVAsmBackend() override {} + + void setForceRelocs() { ForceRelocs = true; } + + // Generate diff expression relocations if the relax feature is enabled or had + // previously been enabled, otherwise it is safe for the assembler to + // calculate these internally. + bool requiresDiffExpressionRelocations() const override { + return STI.getFeatureBits()[RISCV::FeatureRelax] || ForceRelocs; + } + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef Data, + uint64_t Value, bool IsResolved, + const MCSubtargetInfo *STI) const override; + + std::unique_ptr + createObjectTargetWriter() const override; + + // If linker relaxation is enabled, or the relax option had previously been + // enabled, always emit relocations even if the fixup can be resolved. This is + // necessary for correctness as offsets may change during relaxation. + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override { + return STI.getFeatureBits()[RISCV::FeatureRelax] || ForceRelocs; + } + + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { + llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced"); + } + + bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout, + const bool WasForced) const override; + + unsigned getNumFixupKinds() const override { + return RISCV::NumTargetFixupKinds; + } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { + const static MCFixupKindInfo Infos[] = { + // This table *must* be in the order that the fixup_* kinds are defined in + // RISCVFixupKinds.h. + // + // name offset bits flags + { "fixup_riscv_hi20", 12, 20, 0 }, + { "fixup_riscv_lo12_i", 20, 12, 0 }, + { "fixup_riscv_lo12_s", 0, 32, 0 }, + { "fixup_riscv_pcrel_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_pcrel_lo12_i", 20, 12, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_pcrel_lo12_s", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_jal", 12, 20, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_rvc_jump", 2, 11, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_rvc_branch", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_call", 0, 64, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_riscv_relax", 0, 0, 0 } + }; + static_assert((array_lengthof(Infos)) == RISCV::NumTargetFixupKinds, + "Not all fixup kinds added to Infos array"); + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return Infos[Kind - FirstTargetFixupKind]; + } + + bool mayNeedRelaxation(const MCInst &Inst, + const MCSubtargetInfo &STI) const override; + unsigned getRelaxedOpcode(unsigned Op) const; + + void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + MCInst &Res) const override; + + + bool writeNopData(raw_ostream &OS, uint64_t Count) const override; +}; +} + +#endif diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp index 6428b11cfe9c355c6bad1e575855ee01f651e981..a6ba1e41e9644cb01919091c2cb2c35a46a4f0ee 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp @@ -38,5 +38,9 @@ MCELFStreamer &RISCVTargetELFStreamer::getStreamer() { return static_cast(Streamer); } +void RISCVTargetELFStreamer::emitDirectiveOptionPush() {} +void RISCVTargetELFStreamer::emitDirectiveOptionPop() {} void RISCVTargetELFStreamer::emitDirectiveOptionRVC() {} void RISCVTargetELFStreamer::emitDirectiveOptionNoRVC() {} +void RISCVTargetELFStreamer::emitDirectiveOptionRelax() {} +void RISCVTargetELFStreamer::emitDirectiveOptionNoRelax() {} diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h index daa7abfe1336f850ff9bdf6babe3ed3f34b69a19..1f36bbc4388242e06c3adb04666f20d79804c3b0 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h @@ -20,8 +20,12 @@ public: MCELFStreamer &getStreamer(); RISCVTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI); + virtual void emitDirectiveOptionPush(); + virtual void emitDirectiveOptionPop(); virtual void emitDirectiveOptionRVC(); virtual void emitDirectiveOptionNoRVC(); + virtual void emitDirectiveOptionRelax(); + virtual void emitDirectiveOptionNoRelax(); }; } #endif diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp index 2d5205aa7ef74327714577f8c2692b67b288d89c..8d5ef3dbd17f99e502b07ef2e8324495136faaf1 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp @@ -23,6 +23,14 @@ RISCVTargetAsmStreamer::RISCVTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) : RISCVTargetStreamer(S), OS(OS) {} +void RISCVTargetAsmStreamer::emitDirectiveOptionPush() { + OS << "\t.option\tpush\n"; +} + +void RISCVTargetAsmStreamer::emitDirectiveOptionPop() { + OS << "\t.option\tpop\n"; +} + void RISCVTargetAsmStreamer::emitDirectiveOptionRVC() { OS << "\t.option\trvc\n"; } @@ -30,3 +38,11 @@ void RISCVTargetAsmStreamer::emitDirectiveOptionRVC() { void RISCVTargetAsmStreamer::emitDirectiveOptionNoRVC() { OS << "\t.option\tnorvc\n"; } + +void RISCVTargetAsmStreamer::emitDirectiveOptionRelax() { + OS << "\t.option\trelax\n"; +} + +void RISCVTargetAsmStreamer::emitDirectiveOptionNoRelax() { + OS << "\t.option\tnorelax\n"; +} diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h index 525c20810f244c1880faa37e023ec94125284db8..74ec9e303933d1260fa6947a9e325e149d718f30 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h @@ -18,8 +18,12 @@ class RISCVTargetStreamer : public MCTargetStreamer { public: RISCVTargetStreamer(MCStreamer &S); + virtual void emitDirectiveOptionPush() = 0; + virtual void emitDirectiveOptionPop() = 0; virtual void emitDirectiveOptionRVC() = 0; virtual void emitDirectiveOptionNoRVC() = 0; + virtual void emitDirectiveOptionRelax() = 0; + virtual void emitDirectiveOptionNoRelax() = 0; }; // This part is for ascii assembly output @@ -29,8 +33,12 @@ class RISCVTargetAsmStreamer : public RISCVTargetStreamer { public: RISCVTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); + void emitDirectiveOptionPush() override; + void emitDirectiveOptionPop() override; void emitDirectiveOptionRVC() override; void emitDirectiveOptionNoRVC() override; + void emitDirectiveOptionRelax() override; + void emitDirectiveOptionNoRelax() override; }; } diff --git a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 1c23680abc1dceb4d9632590ac4f999682407e37..35c185aa5edd3bae2e7dcc3b889be06d0b48bdb0 100644 --- a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -52,6 +52,9 @@ private: MachineBasicBlock::iterator MBBI, AtomicRMWInst::BinOp, bool IsMasked, int Width, MachineBasicBlock::iterator &NextMBBI); + bool expandAtomicCmpXchg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, bool IsMasked, + int Width, MachineBasicBlock::iterator &NextMBBI); }; char RISCVExpandPseudo::ID = 0; @@ -106,6 +109,10 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoMaskedAtomicLoadUMin32: return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, true, 32, NextMBBI); + case RISCV::PseudoCmpXchg32: + return expandAtomicCmpXchg(MBB, MBBI, false, 32, NextMBBI); + case RISCV::PseudoMaskedCmpXchg32: + return expandAtomicCmpXchg(MBB, MBBI, true, 32, NextMBBI); } return false; @@ -441,6 +448,103 @@ bool RISCVExpandPseudo::expandAtomicMinMaxOp( return true; } +bool RISCVExpandPseudo::expandAtomicCmpXchg( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsMasked, + int Width, MachineBasicBlock::iterator &NextMBBI) { + assert(Width == 32 && "RV64 atomic expansion currently unsupported"); + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = MBB.getParent(); + auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + // Insert new MBBs. + MF->insert(++MBB.getIterator(), LoopHeadMBB); + MF->insert(++LoopHeadMBB->getIterator(), LoopTailMBB); + MF->insert(++LoopTailMBB->getIterator(), DoneMBB); + + // Set up successors and transfer remaining instructions to DoneMBB. + LoopHeadMBB->addSuccessor(LoopTailMBB); + LoopHeadMBB->addSuccessor(DoneMBB); + LoopTailMBB->addSuccessor(DoneMBB); + LoopTailMBB->addSuccessor(LoopHeadMBB); + DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end()); + DoneMBB->transferSuccessors(&MBB); + MBB.addSuccessor(LoopHeadMBB); + + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned ScratchReg = MI.getOperand(1).getReg(); + unsigned AddrReg = MI.getOperand(2).getReg(); + unsigned CmpValReg = MI.getOperand(3).getReg(); + unsigned NewValReg = MI.getOperand(4).getReg(); + AtomicOrdering Ordering = + static_cast(MI.getOperand(IsMasked ? 6 : 5).getImm()); + + if (!IsMasked) { + // .loophead: + // lr.w dest, (addr) + // bne dest, cmpval, done + BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg) + .addReg(AddrReg); + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE)) + .addReg(DestReg) + .addReg(CmpValReg) + .addMBB(DoneMBB); + // .looptail: + // sc.w scratch, newval, (addr) + // bnez scratch, loophead + BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg) + .addReg(AddrReg) + .addReg(NewValReg); + BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) + .addReg(ScratchReg) + .addReg(RISCV::X0) + .addMBB(LoopHeadMBB); + } else { + // .loophead: + // lr.w dest, (addr) + // and scratch, dest, mask + // bne scratch, cmpval, done + unsigned MaskReg = MI.getOperand(5).getReg(); + BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg) + .addReg(AddrReg); + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::AND), ScratchReg) + .addReg(DestReg) + .addReg(MaskReg); + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE)) + .addReg(ScratchReg) + .addReg(CmpValReg) + .addMBB(DoneMBB); + + // .looptail: + // xor scratch, dest, newval + // and scratch, scratch, mask + // xor scratch, dest, scratch + // sc.w scratch, scratch, (adrr) + // bnez scratch, loophead + insertMaskedMerge(TII, DL, LoopTailMBB, ScratchReg, DestReg, NewValReg, + MaskReg, ScratchReg); + BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg) + .addReg(AddrReg) + .addReg(ScratchReg); + BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) + .addReg(ScratchReg) + .addReg(RISCV::X0) + .addMBB(LoopHeadMBB); + } + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + + LivePhysRegs LiveRegs; + computeAndAddLiveIns(LiveRegs, *LoopHeadMBB); + computeAndAddLiveIns(LiveRegs, *LoopTailMBB); + computeAndAddLiveIns(LiveRegs, *DoneMBB); + + return true; +} + } // end of anonymous namespace INITIALIZE_PASS(RISCVExpandPseudo, "riscv-expand-pseudo", diff --git a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index e355b208a75dde556956079e051cfb6c207037d5..aa80365feb830f9e23a6f35d1cd784e1b6deab96 100644 --- a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -11,9 +11,10 @@ // //===----------------------------------------------------------------------===// -#include "RISCV.h" #include "MCTargetDesc/RISCVMCTargetDesc.h" +#include "RISCV.h" #include "RISCVTargetMachine.h" +#include "Utils/RISCVMatInt.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Support/Debug.h" @@ -63,6 +64,38 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() { doPeepholeLoadStoreADDI(); } +static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm, + MVT XLenVT) { + RISCVMatInt::InstSeq Seq; + RISCVMatInt::generateInstSeq(Imm, XLenVT == MVT::i64, Seq); + + SDNode *Result; + SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT); + for (RISCVMatInt::Inst &Inst : Seq) { + SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT); + if (Inst.Opc == RISCV::LUI) + Result = CurDAG->getMachineNode(RISCV::LUI, DL, XLenVT, SDImm); + else + Result = CurDAG->getMachineNode(Inst.Opc, DL, XLenVT, SrcReg, SDImm); + + // Only the first instruction has X0 as its source. + SrcReg = SDValue(Result, 0); + } + + return Result; +} + +// Returns true if the Node is an ISD::AND with a constant argument. If so, +// set Mask to that constant value. +static bool isConstantMask(SDNode *Node, uint64_t &Mask) { + if (Node->getOpcode() == ISD::AND && + Node->getOperand(1).getOpcode() == ISD::Constant) { + Mask = cast(Node->getOperand(1))->getZExtValue(); + return true; + } + return false; +} + void RISCVDAGToDAGISel::Select(SDNode *Node) { // If we have a custom node, we have already selected. if (Node->isMachineOpcode()) { @@ -87,6 +120,11 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, New.getNode()); return; } + int64_t Imm = ConstNode->getSExtValue(); + if (XLenVT == MVT::i64) { + ReplaceNode(Node, selectImm(CurDAG, SDLoc(Node), Imm, XLenVT)); + return; + } break; } case ISD::FrameIndex: { @@ -96,6 +134,29 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm)); return; } + case ISD::SRL: { + if (!Subtarget->is64Bit()) + break; + SDValue Op0 = Node->getOperand(0); + SDValue Op1 = Node->getOperand(1); + uint64_t Mask; + // Match (srl (and val, mask), imm) where the result would be a + // zero-extended 32-bit integer. i.e. the mask is 0xffffffff or the result + // is equivalent to this (SimplifyDemandedBits may have removed lower bits + // from the mask that aren't necessary due to the right-shifting). + if (Op1.getOpcode() == ISD::Constant && + isConstantMask(Op0.getNode(), Mask)) { + uint64_t ShAmt = cast(Op1.getNode())->getZExtValue(); + + if ((Mask | maskTrailingOnes(ShAmt)) == 0xffffffff) { + SDValue ShAmtVal = + CurDAG->getTargetConstant(ShAmt, SDLoc(Node), XLenVT); + CurDAG->SelectNodeTo(Node, RISCV::SRLIW, XLenVT, Op0.getOperand(0), + ShAmtVal); + return; + } + } + } } // Select the default instruction. diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp index 85758c0cdf8cf4f050ad31edb311131fde361a9e..e78085edac38e39073636f627adf8709056bb54d 100644 --- a/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/lib/Target/RISCV/RISCVISelLowering.cpp @@ -111,10 +111,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT, ISD::SETGE, ISD::SETNE}; - // TODO: add proper support for the various FMA variants - // (FMADD.S, FMSUB.S, FNMSUB.S, FNMADD.S). ISD::NodeType FPOpToExtend[] = { - ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FMA}; + ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM}; if (Subtarget.hasStdExtF()) { setOperationAction(ISD::FMINNUM, MVT::f32, Legal); @@ -186,6 +184,7 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::riscv_masked_atomicrmw_min_i32: case Intrinsic::riscv_masked_atomicrmw_umax_i32: case Intrinsic::riscv_masked_atomicrmw_umin_i32: + case Intrinsic::riscv_masked_cmpxchg_i32: PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); @@ -266,6 +265,10 @@ bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return TargetLowering::isZExtFree(Val, VT2); } +bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const { + return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64; +} + // Changes the condition code and swaps operands if necessary, so the SetCC // operation matches one of the comparisons supported directly in the RISC-V // ISA. @@ -1708,3 +1711,23 @@ Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic( return Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering}); } + +TargetLowering::AtomicExpansionKind +RISCVTargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *CI) const { + unsigned Size = CI->getCompareOperand()->getType()->getPrimitiveSizeInBits(); + if (Size == 8 || Size == 16) + return AtomicExpansionKind::MaskedIntrinsic; + return AtomicExpansionKind::None; +} + +Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( + IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr, + Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const { + Value *Ordering = Builder.getInt32(static_cast(Ord)); + Type *Tys[] = {AlignedAddr->getType()}; + Function *MaskedCmpXchg = Intrinsic::getDeclaration( + CI->getModule(), Intrinsic::riscv_masked_cmpxchg_i32, Tys); + return Builder.CreateCall(MaskedCmpXchg, + {AlignedAddr, CmpVal, NewVal, Mask, Ordering}); +} diff --git a/lib/Target/RISCV/RISCVISelLowering.h b/lib/Target/RISCV/RISCVISelLowering.h index fe988bb14deaf27dd809e186f0675a2ac9ed0c42..6970900bb0622af57b46eb655ada4c6c4051cc07 100644 --- a/lib/Target/RISCV/RISCVISelLowering.h +++ b/lib/Target/RISCV/RISCVISelLowering.h @@ -54,6 +54,7 @@ public: bool isTruncateFree(Type *SrcTy, Type *DstTy) const override; bool isTruncateFree(EVT SrcVT, EVT DstVT) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override; // Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -126,6 +127,13 @@ private: virtual Value *emitMaskedAtomicRMWIntrinsic( IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr, Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const override; + TargetLowering::AtomicExpansionKind + shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CI) const override; + virtual Value * + emitMaskedAtomicCmpXchgIntrinsic(IRBuilder<> &Builder, AtomicCmpXchgInst *CI, + Value *AlignedAddr, Value *CmpVal, + Value *NewVal, Value *Mask, + AtomicOrdering Ord) const override; }; } diff --git a/lib/Target/RISCV/RISCVInstrFormats.td b/lib/Target/RISCV/RISCVInstrFormats.td index 529e048045c6f10c1ebc1ce4e2acddd62496100a..ebd676a6056ef366917c7e8a01548d07d498f20f 100644 --- a/lib/Target/RISCV/RISCVInstrFormats.td +++ b/lib/Target/RISCV/RISCVInstrFormats.td @@ -45,11 +45,12 @@ def InstFormatCSS : InstFormat<10>; def InstFormatCIW : InstFormat<11>; def InstFormatCL : InstFormat<12>; def InstFormatCS : InstFormat<13>; -def InstFormatCB : InstFormat<14>; -def InstFormatCJ : InstFormat<15>; -def InstFormatOther : InstFormat<16>; +def InstFormatCA : InstFormat<14>; +def InstFormatCB : InstFormat<15>; +def InstFormatCJ : InstFormat<16>; +def InstFormatOther : InstFormat<17>; -// The following opcode names and match those given in Table 19.1 in the +// The following opcode names match those given in Table 19.1 in the // RISC-V User-level ISA specification ("RISC-V base opcode map"). class RISCVOpcode val> { bits<7> Value = val; diff --git a/lib/Target/RISCV/RISCVInstrFormatsC.td b/lib/Target/RISCV/RISCVInstrFormatsC.td index 6abcbd7cc8a123fa9f466a03125726c332ebd910..bda8bbb558eb6ce32c01605ea16e0a3504e18c3b 100644 --- a/lib/Target/RISCV/RISCVInstrFormatsC.td +++ b/lib/Target/RISCV/RISCVInstrFormatsC.td @@ -118,6 +118,19 @@ class RVInst16CS funct3, bits<2> opcode, dag outs, dag ins, let Inst{1-0} = opcode; } +class RVInst16CA funct6, bits<2> funct2, bits<2> opcode, dag outs, + dag ins, string opcodestr, string argstr> + : RVInst16 { + bits<3> rs2; + bits<3> rs1; + + let Inst{15-10} = funct6; + let Inst{9-7} = rs1; + let Inst{6-5} = funct2; + let Inst{4-2} = rs2; + let Inst{1-0} = opcode; +} + class RVInst16CB funct3, bits<2> opcode, dag outs, dag ins, string opcodestr, string argstr> : RVInst16 { diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td index 631a1f7deca04010c69bdb7f14cb2dc05d9a9dda..60c0f0b96a62b4140ff9e98c3ead532116ffe42e 100644 --- a/lib/Target/RISCV/RISCVInstrInfo.td +++ b/lib/Target/RISCV/RISCVInstrInfo.td @@ -400,6 +400,15 @@ def EBREAK : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "ebreak", ""> { let rd = 0; let imm12 = 1; } + +// This is a de facto standard (as set by GNU binutils) 32-bit unimplemented +// instruction (i.e., it should always trap, if your implementation has invalid +// instruction traps). +def UNIMP : RVInstI<0b001, OPC_SYSTEM, (outs), (ins), "unimp", ""> { + let rs1 = 0; + let rd = 0; + let imm12 = 0b110000000000; +} } // hasSideEffects = 1, mayLoad = 0, mayStore = 0 def CSRRW : CSR_ir<0b001, "csrrw">; @@ -568,6 +577,16 @@ def : InstAlias<"csrwi $csr, $imm", (CSRRWI X0, csr_sysreg:$csr, uimm5:$imm)>; def : InstAlias<"csrsi $csr, $imm", (CSRRSI X0, csr_sysreg:$csr, uimm5:$imm)>; def : InstAlias<"csrci $csr, $imm", (CSRRCI X0, csr_sysreg:$csr, uimm5:$imm)>; +let EmitPriority = 0 in { +def : InstAlias<"csrw $csr, $imm", (CSRRWI X0, csr_sysreg:$csr, uimm5:$imm)>; +def : InstAlias<"csrs $csr, $imm", (CSRRSI X0, csr_sysreg:$csr, uimm5:$imm)>; +def : InstAlias<"csrc $csr, $imm", (CSRRCI X0, csr_sysreg:$csr, uimm5:$imm)>; + +def : InstAlias<"csrrw $rd, $csr, $imm", (CSRRWI GPR:$rd, csr_sysreg:$csr, uimm5:$imm)>; +def : InstAlias<"csrrs $rd, $csr, $imm", (CSRRSI GPR:$rd, csr_sysreg:$csr, uimm5:$imm)>; +def : InstAlias<"csrrc $rd, $csr, $imm", (CSRRCI GPR:$rd, csr_sysreg:$csr, uimm5:$imm)>; +} + def : InstAlias<"sfence.vma", (SFENCE_VMA X0, X0)>; def : InstAlias<"sfence.vma $rs", (SFENCE_VMA GPR:$rs, X0)>; @@ -685,7 +704,9 @@ def : PatGprSimm12; // Define pattern expansions for setcc operations that aren't directly // handled by a RISC-V instruction. +def : Pat<(seteq GPR:$rs1, 0), (SLTIU GPR:$rs1, 1)>; def : Pat<(seteq GPR:$rs1, GPR:$rs2), (SLTIU (XOR GPR:$rs1, GPR:$rs2), 1)>; +def : Pat<(setne GPR:$rs1, 0), (SLTU X0, GPR:$rs1)>; def : Pat<(setne GPR:$rs1, GPR:$rs2), (SLTU X0, (XOR GPR:$rs1, GPR:$rs2))>; def : Pat<(setugt GPR:$rs1, GPR:$rs2), (SLTU GPR:$rs2, GPR:$rs1)>; def : Pat<(setuge GPR:$rs1, GPR:$rs2), (XORI (SLTU GPR:$rs1, GPR:$rs2), 1)>; @@ -813,7 +834,7 @@ defm : LdPat; defm : LdPat; defm : LdPat; defm : LdPat; -defm : LdPat; +defm : LdPat, Requires<[IsRV32]>; defm : LdPat; defm : LdPat; @@ -864,6 +885,45 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(CallSeqEnd timm:$amt1, timm:$amt2)]>; } // Defs = [X2], Uses = [X2] +/// RV64 patterns + +let Predicates = [IsRV64] in { + +/// sext and zext + +def : Pat<(sext_inreg GPR:$rs1, i32), (ADDIW GPR:$rs1, 0)>; +def : Pat<(and GPR:$rs1, 0xffffffff), (SRLI (SLLI GPR:$rs1, 32), 32)>; + +/// ALU operations + +def : Pat<(sext_inreg (add GPR:$rs1, GPR:$rs2), i32), + (ADDW GPR:$rs1, GPR:$rs2)>; +def : Pat<(sext_inreg (add GPR:$rs1, simm12:$imm12), i32), + (ADDIW GPR:$rs1, simm12:$imm12)>; +def : Pat<(sext_inreg (sub GPR:$rs1, GPR:$rs2), i32), + (SUBW GPR:$rs1, GPR:$rs2)>; +def : Pat<(sext_inreg (shl GPR:$rs1, uimm5:$shamt), i32), + (SLLIW GPR:$rs1, uimm5:$shamt)>; +// (srl (zexti32 ...), uimm5:$shamt) is matched with custom code due to the +// need to undo manipulation of the mask value performed by DAGCombine. +def : Pat<(sra (sext_inreg GPR:$rs1, i32), uimm5:$shamt), + (SRAIW GPR:$rs1, uimm5:$shamt)>; + +// TODO: patterns for SLLW/SRLW/SRAW. + +/// Loads + +defm : LdPat; +defm : LdPat; +defm : LdPat; +defm : LdPat; + +/// Stores + +defm : StPat; +defm : StPat; +} // Predicates = [IsRV64] + //===----------------------------------------------------------------------===// // Standard extensions //===----------------------------------------------------------------------===// diff --git a/lib/Target/RISCV/RISCVInstrInfoA.td b/lib/Target/RISCV/RISCVInstrInfoA.td index 180434e9ff1363ea7eef15d01f7e4dd4ea28d74b..9cb1d2f0b627d54987c34663aa73473e23549094 100644 --- a/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/lib/Target/RISCV/RISCVInstrInfoA.td @@ -153,7 +153,7 @@ class PseudoAMO : Pseudo<(outs GPR:$res, GPR:$scratch), } def PseudoAtomicLoadNand32 : PseudoAMO; -// Ordering constants must be kept in sync with the AtomicOrdering enum in +// Ordering constants must be kept in sync with the AtomicOrdering enum in // AtomicOrdering.h. def : Pat<(atomic_load_nand_32_monotonic GPR:$addr, GPR:$incr), (PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 2)>; @@ -230,4 +230,49 @@ def : PseudoMaskedAMOPat; + +/// Compare and exchange + +class PseudoCmpXchg + : Pseudo<(outs GPR:$res, GPR:$scratch), + (ins GPR:$addr, GPR:$cmpval, GPR:$newval, i32imm:$ordering), []> { + let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; + let mayLoad = 1; + let mayStore = 1; + let hasSideEffects = 0; +} + +// Ordering constants must be kept in sync with the AtomicOrdering enum in +// AtomicOrdering.h. +multiclass PseudoCmpXchgPat { + def : Pat<(!cast(Op#"_monotonic") GPR:$addr, GPR:$cmp, GPR:$new), + (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 2)>; + def : Pat<(!cast(Op#"_acquire") GPR:$addr, GPR:$cmp, GPR:$new), + (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 4)>; + def : Pat<(!cast(Op#"_release") GPR:$addr, GPR:$cmp, GPR:$new), + (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 5)>; + def : Pat<(!cast(Op#"_acq_rel") GPR:$addr, GPR:$cmp, GPR:$new), + (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 6)>; + def : Pat<(!cast(Op#"_seq_cst") GPR:$addr, GPR:$cmp, GPR:$new), + (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>; +} + +def PseudoCmpXchg32 : PseudoCmpXchg; +defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>; + +def PseudoMaskedCmpXchg32 + : Pseudo<(outs GPR:$res, GPR:$scratch), + (ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, + i32imm:$ordering), []> { + let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; + let mayLoad = 1; + let mayStore = 1; + let hasSideEffects = 0; +} + +def : Pat<(int_riscv_masked_cmpxchg_i32 + GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering), + (PseudoMaskedCmpXchg32 + GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>; + } // Predicates = [HasStdExtA] diff --git a/lib/Target/RISCV/RISCVInstrInfoC.td b/lib/Target/RISCV/RISCVInstrInfoC.td index 21e6cae5ef94f62c599323c0fd1de816acb29bc8..ad68b5a7dc976314e27aefdd445cb90917e6b2ed 100644 --- a/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/lib/Target/RISCV/RISCVInstrInfoC.td @@ -258,16 +258,13 @@ class Shift_right funct2, string OpcodeStr, RegisterClass cls, } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class CS_ALU funct2, string OpcodeStr, RegisterClass cls, - bit RV64only> - : RVInst16CS<0b100, 0b01, (outs cls:$rd_wb), (ins cls:$rd, cls:$rs2), +class CS_ALU funct6, bits<2> funct2, string OpcodeStr, + RegisterClass cls> + : RVInst16CA { bits<3> rd; let Constraints = "$rd = $rd_wb"; - let Inst{12} = RV64only; - let Inst{11-10} = 0b11; let Inst{9-7} = rd; - let Inst{6-5} = funct2; } //===----------------------------------------------------------------------===// @@ -411,14 +408,14 @@ def C_ANDI : RVInst16CB<0b100, 0b01, (outs GPRC:$rs1_wb), (ins GPRC:$rs1, simm6: let Inst{6-2} = imm{4-0}; } -def C_SUB : CS_ALU<0b00, "c.sub", GPRC, 0>; -def C_XOR : CS_ALU<0b01, "c.xor", GPRC, 0>; -def C_OR : CS_ALU<0b10, "c.or" , GPRC, 0>; -def C_AND : CS_ALU<0b11, "c.and", GPRC, 0>; +def C_SUB : CS_ALU<0b100011, 0b00, "c.sub", GPRC>; +def C_XOR : CS_ALU<0b100011, 0b01, "c.xor", GPRC>; +def C_OR : CS_ALU<0b100011, 0b10, "c.or" , GPRC>; +def C_AND : CS_ALU<0b100011, 0b11, "c.and", GPRC>; let Predicates = [HasStdExtC, IsRV64] in { -def C_SUBW : CS_ALU<0b00, "c.subw", GPRC, 1>; -def C_ADDW : CS_ALU<0b01, "c.addw", GPRC, 1>; +def C_SUBW : CS_ALU<0b100111, 0b00, "c.subw", GPRC>; +def C_ADDW : CS_ALU<0b100111, 0b01, "c.addw", GPRC>; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in @@ -478,7 +475,7 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def C_MV : RVInst16CR<0b1000, 0b10, (outs GPRNoX0:$rs1), (ins GPRNoX0:$rs2), "c.mv", "$rs1, $rs2">; -let rs1 = 0, rs2 = 0, hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let rs1 = 0, rs2 = 0, hasSideEffects = 1, mayLoad = 0, mayStore = 0 in def C_EBREAK : RVInst16CR<0b1001, 0b10, (outs), (ins), "c.ebreak", "">; let hasSideEffects = 0, mayLoad = 0, mayStore = 0, @@ -517,6 +514,13 @@ def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000> { let Inst{9-7} = imm{8-6}; } +// The all zeros pattern isn't a valid RISC-V instruction. It's used by GNU +// binutils as 16-bit instruction known to be unimplemented (i.e., trapping). +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther> { + let Inst{15-0} = 0; +} + } // Predicates = [HasStdExtC] //===----------------------------------------------------------------------===// @@ -680,6 +684,7 @@ def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs2, X0), def : CompressPat<(ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, 0), (C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>; def : CompressPat<(EBREAK), (C_EBREAK)>; +def : CompressPat<(UNIMP), (C_UNIMP)>; def : CompressPat<(JALR X1, GPRNoX0:$rs1, 0), (C_JALR GPRNoX0:$rs1)>; def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs1, GPRNoX0:$rs2), diff --git a/lib/Target/RISCV/RISCVInstrInfoD.td b/lib/Target/RISCV/RISCVInstrInfoD.td index 73ceb2b0660e1b7793461637626b07b28f1ab389..9f1cd50de595d4592b529d6750ed6a61889d586f 100644 --- a/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/lib/Target/RISCV/RISCVInstrInfoD.td @@ -230,6 +230,22 @@ def : Pat<(fabs FPR64:$rs1), (FSGNJX_D $rs1, $rs1)>; def : PatFpr64Fpr64; def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), (FSGNJN_D $rs1, $rs2)>; +// fmadd: rs1 * rs2 + rs3 +def : Pat<(fma FPR64:$rs1, FPR64:$rs2, FPR64:$rs3), + (FMADD_D $rs1, $rs2, $rs3, 0b111)>; + +// fmsub: rs1 * rs2 - rs3 +def : Pat<(fma FPR64:$rs1, FPR64:$rs2, (fneg FPR64:$rs3)), + (FMSUB_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>; + +// fnmsub: -rs1 * rs2 + rs3 +def : Pat<(fma (fneg FPR64:$rs1), FPR64:$rs2, FPR64:$rs3), + (FNMSUB_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>; + +// fnmadd: -rs1 * rs2 - rs3 +def : Pat<(fma (fneg FPR64:$rs1), FPR64:$rs2, (fneg FPR64:$rs3)), + (FNMADD_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>; + // The RISC-V 2.2 user-level ISA spec defines fmin and fmax as returning the // canonical NaN when giving a signaling NaN. This doesn't match the LLVM // behaviour (see https://bugs.llvm.org/show_bug.cgi?id=27363). However, the diff --git a/lib/Target/RISCV/RISCVInstrInfoF.td b/lib/Target/RISCV/RISCVInstrInfoF.td index a31b93b11ae8b1631862336dc35bfb9fa2ea9815..03bdac45873d0b90adf10d5fe084de97f6463f72 100644 --- a/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/lib/Target/RISCV/RISCVInstrInfoF.td @@ -270,6 +270,22 @@ def : Pat<(fabs FPR32:$rs1), (FSGNJX_S $rs1, $rs1)>; def : PatFpr32Fpr32; def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), (FSGNJN_S $rs1, $rs2)>; +// fmadd: rs1 * rs2 + rs3 +def : Pat<(fma FPR32:$rs1, FPR32:$rs2, FPR32:$rs3), + (FMADD_S $rs1, $rs2, $rs3, 0b111)>; + +// fmsub: rs1 * rs2 - rs3 +def : Pat<(fma FPR32:$rs1, FPR32:$rs2, (fneg FPR32:$rs3)), + (FMSUB_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; + +// fnmsub: -rs1 * rs2 + rs3 +def : Pat<(fma (fneg FPR32:$rs1), FPR32:$rs2, FPR32:$rs3), + (FNMSUB_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; + +// fnmadd: -rs1 * rs2 - rs3 +def : Pat<(fma (fneg FPR32:$rs1), FPR32:$rs2, (fneg FPR32:$rs3)), + (FNMADD_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; + // The RISC-V 2.2 user-level ISA spec defines fmin and fmax as returning the // canonical NaN when given a signaling NaN. This doesn't match the LLVM // behaviour (see https://bugs.llvm.org/show_bug.cgi?id=27363). However, the diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp index e75da768ee2e9ceb4c766e6f4d1bba8eafd2db92..8937ec200bd74cf8b59aed8ac1327eeddfede59d 100644 --- a/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -47,12 +47,6 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -61,7 +55,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL, bool JIT) : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), - getEffectiveCodeModel(CM), OL), + getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); diff --git a/lib/Target/RISCV/Utils/CMakeLists.txt b/lib/Target/RISCV/Utils/CMakeLists.txt index b7869f696307f61f86d6de8df799a40dd7f9a26a..727ab4a9fd771ac509df66bcec71b2f301330bc4 100644 --- a/lib/Target/RISCV/Utils/CMakeLists.txt +++ b/lib/Target/RISCV/Utils/CMakeLists.txt @@ -1,3 +1,4 @@ add_llvm_library(LLVMRISCVUtils RISCVBaseInfo.cpp + RISCVMatInt.cpp ) diff --git a/lib/Target/RISCV/Utils/RISCVBaseInfo.h b/lib/Target/RISCV/Utils/RISCVBaseInfo.h index 86cd2d6e58a1cd672d56e261c7a8b9f5f4a63bca..372e0e80bbaf8c40cf9364744ccc1f91c192c1bf 100644 --- a/lib/Target/RISCV/Utils/RISCVBaseInfo.h +++ b/lib/Target/RISCV/Utils/RISCVBaseInfo.h @@ -39,9 +39,10 @@ enum { InstFormatCIW = 11, InstFormatCL = 12, InstFormatCS = 13, - InstFormatCB = 14, - InstFormatCJ = 15, - InstFormatOther = 16, + InstFormatCA = 14, + InstFormatCB = 15, + InstFormatCJ = 16, + InstFormatOther = 17, InstFormatMask = 31 }; diff --git a/lib/Target/RISCV/Utils/RISCVMatInt.cpp b/lib/Target/RISCV/Utils/RISCVMatInt.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3dc298246bc562ef8df573d877cb3519ca2d7c0b --- /dev/null +++ b/lib/Target/RISCV/Utils/RISCVMatInt.cpp @@ -0,0 +1,79 @@ +//===- RISCVMatInt.cpp - Immediate materialisation -------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "RISCVMatInt.h" +#include "MCTargetDesc/RISCVMCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/MachineValueType.h" +#include "llvm/Support/MathExtras.h" +#include + +namespace llvm { + +namespace RISCVMatInt { +void generateInstSeq(int64_t Val, bool Is64Bit, InstSeq &Res) { + if (isInt<32>(Val)) { + // Depending on the active bits in the immediate Value v, the following + // instruction sequences are emitted: + // + // v == 0 : ADDI + // v[0,12) != 0 && v[12,32) == 0 : ADDI + // v[0,12) == 0 && v[12,32) != 0 : LUI + // v[0,32) != 0 : LUI+ADDI(W) + int64_t Hi20 = ((Val + 0x800) >> 12) & 0xFFFFF; + int64_t Lo12 = SignExtend64<12>(Val); + + if (Hi20) + Res.push_back(Inst(RISCV::LUI, Hi20)); + + if (Lo12 || Hi20 == 0) { + unsigned AddiOpc = (Is64Bit && Hi20) ? RISCV::ADDIW : RISCV::ADDI; + Res.push_back(Inst(AddiOpc, Lo12)); + } + return; + } + + assert(Is64Bit && "Can't emit >32-bit imm for non-RV64 target"); + + // In the worst case, for a full 64-bit constant, a sequence of 8 instructions + // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emmitted. Note + // that the first two instructions (LUI+ADDIW) can contribute up to 32 bits + // while the following ADDI instructions contribute up to 12 bits each. + // + // On the first glance, implementing this seems to be possible by simply + // emitting the most significant 32 bits (LUI+ADDIW) followed by as many left + // shift (SLLI) and immediate additions (ADDI) as needed. However, due to the + // fact that ADDI performs a sign extended addition, doing it like that would + // only be possible when at most 11 bits of the ADDI instructions are used. + // Using all 12 bits of the ADDI instructions, like done by GAS, actually + // requires that the constant is processed starting with the least significant + // bit. + // + // In the following, constants are processed from LSB to MSB but instruction + // emission is performed from MSB to LSB by recursively calling + // generateInstSeq. In each recursion, first the lowest 12 bits are removed + // from the constant and the optimal shift amount, which can be greater than + // 12 bits if the constant is sparse, is determined. Then, the shifted + // remaining constant is processed recursively and gets emitted as soon as it + // fits into 32 bits. The emission of the shifts and additions is subsequently + // performed when the recursion returns. + + int64_t Lo12 = SignExtend64<12>(Val); + int64_t Hi52 = (Val + 0x800) >> 12; + int ShiftAmount = 12 + findFirstSet((uint64_t)Hi52); + Hi52 = SignExtend64(Hi52 >> (ShiftAmount - 12), 64 - ShiftAmount); + + generateInstSeq(Hi52, Is64Bit, Res); + + Res.push_back(Inst(RISCV::SLLI, ShiftAmount)); + if (Lo12) + Res.push_back(Inst(RISCV::ADDI, Lo12)); +} +} // namespace RISCVMatInt +} // namespace llvm diff --git a/lib/Target/RISCV/Utils/RISCVMatInt.h b/lib/Target/RISCV/Utils/RISCVMatInt.h new file mode 100644 index 0000000000000000000000000000000000000000..49d1d89adc7a1af5532098b50ed3e45c3edda9e1 --- /dev/null +++ b/lib/Target/RISCV/Utils/RISCVMatInt.h @@ -0,0 +1,36 @@ +//===- RISCVMatInt.h - Immediate materialisation ---------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_RISCV_MATINT_H +#define LLVM_LIB_TARGET_RISCV_MATINT_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/MachineValueType.h" +#include + +namespace llvm { + +namespace RISCVMatInt { +struct Inst { + unsigned Opc; + int64_t Imm; + + Inst(unsigned Opc, int64_t Imm) : Opc(Opc), Imm(Imm) {} +}; +using InstSeq = SmallVector; + +// Helper to generate an instruction sequence that will materialise the given +// immediate value into a register. A sequence of instructions represented by +// a simple struct produced rather than directly emitting the instructions in +// order to allow this helper to be used from both the MC layer and during +// instruction selection. +void generateInstSeq(int64_t Val, bool IsRV64, InstSeq &Res); +} // namespace RISCVMatInt +} // namespace llvm +#endif diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 35f52f7d279bfc1199a95bad3a1eb02c8e349f5b..691421e533eae975603700cb3bef5c88c673312f 100644 --- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -78,6 +78,8 @@ class SparcAsmParser : public MCTargetAsmParser { // Custom parse functions for Sparc specific operands. OperandMatchResultTy parseMEMOperand(OperandVector &Operands); + OperandMatchResultTy parseMembarTag(OperandVector &Operands); + OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Name); OperandMatchResultTy @@ -256,6 +258,7 @@ public: bool isMem() const override { return isMEMrr() || isMEMri(); } bool isMEMrr() const { return Kind == k_MemoryReg; } bool isMEMri() const { return Kind == k_MemoryImm; } + bool isMembarTag() const { return Kind == k_Immediate; } bool isIntReg() const { return (Kind == k_Register && Reg.Kind == rk_IntReg); @@ -366,6 +369,12 @@ public: addExpr(Inst, Expr); } + void addMembarTagOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCExpr *Expr = getImm(); + addExpr(Inst, Expr); + } + static std::unique_ptr CreateToken(StringRef Str, SMLoc S) { auto Op = make_unique(k_Token); Op->Tok.Data = Str.data(); @@ -742,6 +751,52 @@ SparcAsmParser::parseMEMOperand(OperandVector &Operands) { return MatchOperand_Success; } +OperandMatchResultTy SparcAsmParser::parseMembarTag(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const MCExpr *EVal; + int64_t ImmVal = 0; + + std::unique_ptr Mask; + if (parseSparcAsmOperand(Mask) == MatchOperand_Success) { + if (!Mask->isImm() || !Mask->getImm()->evaluateAsAbsolute(ImmVal) || + ImmVal < 0 || ImmVal > 127) { + Error(S, "invalid membar mask number"); + return MatchOperand_ParseFail; + } + } + + while (getLexer().getKind() == AsmToken::Hash) { + SMLoc TagStart = getLexer().getLoc(); + Parser.Lex(); // Eat the '#'. + unsigned MaskVal = StringSwitch(Parser.getTok().getString()) + .Case("LoadLoad", 0x1) + .Case("StoreLoad", 0x2) + .Case("LoadStore", 0x4) + .Case("StoreStore", 0x8) + .Case("Lookaside", 0x10) + .Case("MemIssue", 0x20) + .Case("Sync", 0x40) + .Default(0); + + Parser.Lex(); // Eat the identifier token. + + if (!MaskVal) { + Error(TagStart, "unknown membar tag"); + return MatchOperand_ParseFail; + } + + ImmVal |= MaskVal; + + if (getLexer().getKind() == AsmToken::Pipe) + Parser.Lex(); // Eat the '|'. + } + + EVal = MCConstantExpr::create(ImmVal, getContext()); + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(SparcOperand::CreateImm(EVal, S, E)); + return MatchOperand_Success; +} + OperandMatchResultTy SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp index c1512cbdc44fd502345aa56867f958c4b25585a1..d152efae6d1f6f2cac5c90fbfcd77b9a760c464e 100644 --- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp +++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp @@ -195,3 +195,26 @@ bool SparcInstPrinter::printGetPCX(const MCInst *MI, unsigned opNum, llvm_unreachable("FIXME: Implement SparcInstPrinter::printGetPCX."); return true; } + +void SparcInstPrinter::printMembarTag(const MCInst *MI, int opNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + static const char *const TagNames[] = { + "#LoadLoad", "#StoreLoad", "#LoadStore", "#StoreStore", + "#Lookaside", "#MemIssue", "#Sync"}; + + unsigned Imm = MI->getOperand(opNum).getImm(); + + if (Imm > 127) { + O << Imm; + return; + } + + bool First = true; + for (unsigned i = 0; i < sizeof(TagNames) / sizeof(char *); i++) { + if (Imm & (1 << i)) { + O << (First ? "" : " | ") << TagNames[i]; + First = false; + } + } +} diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h index 6f06d1ddae32f9bd02cee798c018716fefb03e16..89015eb137c20167a3310d39fa964c16b9b99282 100644 --- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h +++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h @@ -49,6 +49,8 @@ public: raw_ostream &OS); bool printGetPCX(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &OS); + void printMembarTag(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, + raw_ostream &O); }; } // end namespace llvm diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 307b6e3c3821c2c7e12d6ebe67104b88f13ad323..7d908bb6b3207b24c467aa1e21e05a4d30ccef3d 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -3261,23 +3261,23 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, else return std::make_pair(0U, &SP::IntRegsRegClass); case 'f': - if (VT == MVT::f32) + if (VT == MVT::f32 || VT == MVT::i32) return std::make_pair(0U, &SP::FPRegsRegClass); - else if (VT == MVT::f64) + else if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &SP::LowDFPRegsRegClass); else if (VT == MVT::f128) return std::make_pair(0U, &SP::LowQFPRegsRegClass); - llvm_unreachable("Unknown ValueType for f-register-type!"); - break; + // This will generate an error message + return std::make_pair(0U, nullptr); case 'e': - if (VT == MVT::f32) + if (VT == MVT::f32 || VT == MVT::i32) return std::make_pair(0U, &SP::FPRegsRegClass); - else if (VT == MVT::f64) + else if (VT == MVT::f64 || VT == MVT::i64 ) return std::make_pair(0U, &SP::DFPRegsRegClass); else if (VT == MVT::f128) return std::make_pair(0U, &SP::QFPRegsRegClass); - llvm_unreachable("Unknown ValueType for e-register-type!"); - break; + // This will generate an error message + return std::make_pair(0U, nullptr); } } else if (!Constraint.empty() && Constraint.size() <= 5 && Constraint[0] == '{' && *(Constraint.end()-1) == '}') { diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index 62f3d96af847b00bc86c506344de62f0fcda3456..558b37aeebcb895267679184c29a8d474277d1bb 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -138,6 +138,16 @@ def MEMri : Operand { def TLSSym : Operand; +def SparcMembarTagAsmOperand : AsmOperandClass { + let Name = "MembarTag"; + let ParserMethod = "parseMembarTag"; +} + +def MembarTag : Operand { + let PrintMethod = "printMembarTag"; + let ParserMatchClass = SparcMembarTagAsmOperand; +} + // Branch targets have OtherVT type. def brtarget : Operand { let EncoderMethod = "getBranchTargetOpValue"; @@ -1503,7 +1513,7 @@ def : Pat<(ctpop i32:$src), (POPCrr (SRLri $src, 0))>; let Predicates = [HasV9], hasSideEffects = 1, rd = 0, rs1 = 0b01111 in - def MEMBARi : F3_2<2, 0b101000, (outs), (ins simm13Op:$simm13), + def MEMBARi : F3_2<2, 0b101000, (outs), (ins MembarTag:$simm13), "membar $simm13", []>; // The CAS instruction, unlike other instructions, only comes in a diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 07f9e7250bd9f7822911500b48a4410864bc9f08..5b467235f809d20743692fc1d81b577d9e3e70d5 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -70,11 +70,16 @@ static Reloc::Model getEffectiveRelocModel(Optional RM) { // pic32 PIC_ Medium GOT < 2^32 bytes // // All code models require that the text segment is smaller than 2GB. -static CodeModel::Model getEffectiveCodeModel(Optional CM, - Reloc::Model RM, bool Is64Bit, - bool JIT) { - if (CM) +static CodeModel::Model +getEffectiveSparcCodeModel(Optional CM, Reloc::Model RM, + bool Is64Bit, bool JIT) { + if (CM) { + if (*CM == CodeModel::Tiny) + report_fatal_error("Target does not support the tiny CodeModel"); + if (*CM == CodeModel::Kernel) + report_fatal_error("Target does not support the kernel CodeModel"); return *CM; + } if (Is64Bit) { if (JIT) return CodeModel::Large; @@ -88,11 +93,11 @@ SparcTargetMachine::SparcTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional RM, Optional CM, CodeGenOpt::Level OL, bool JIT, bool is64bit) - : LLVMTargetMachine( - T, computeDataLayout(TT, is64bit), TT, CPU, FS, Options, - getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM, getEffectiveRelocModel(RM), is64bit, JIT), - OL), + : LLVMTargetMachine(T, computeDataLayout(TT, is64bit), TT, CPU, FS, Options, + getEffectiveRelocModel(RM), + getEffectiveSparcCodeModel( + CM, getEffectiveRelocModel(RM), is64bit, JIT), + OL), TLOF(make_unique()), Subtarget(TT, CPU, FS, *this, is64bit), is64Bit(is64bit) { initAsmInfo(); diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp index bd99fabb48c91ee5340107530d94bf724ae9279f..e2de721be56869249ec7cca62c546554618da092 100644 --- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -647,7 +647,7 @@ bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void SystemZAsmPrinter::EmitEndOfAsmFile(Module &M) { - SM.serializeToStackMapSection(); + emitStackMaps(SM); } // Force static initialization. diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index d2c33546716c051f7ab450b011ac279c0034d714..d7951ca810eecc0a1173c05c2d1ffe02643c6998 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -523,6 +523,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); + setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::FP_ROUND); @@ -4479,6 +4480,7 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, // Constants with undefs to get a full vector constant and use that // as the starting point. SDValue Result; + SDValue ReplicatedVal; if (NumConstants > 0) { for (unsigned I = 0; I < NumElements; ++I) if (!Constants[I].getNode()) @@ -4489,17 +4491,21 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, // avoid a false dependency on any previous contents of the vector // register. - // Use a VLREP if at least one element is a load. - unsigned LoadElIdx = UINT_MAX; + // Use a VLREP if at least one element is a load. Make sure to replicate + // the load with the most elements having its value. + std::map UseCounts; + SDNode *LoadMaxUses = nullptr; for (unsigned I = 0; I < NumElements; ++I) if (Elems[I].getOpcode() == ISD::LOAD && cast(Elems[I])->isUnindexed()) { - LoadElIdx = I; - break; + SDNode *Ld = Elems[I].getNode(); + UseCounts[Ld]++; + if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld]) + LoadMaxUses = Ld; } - if (LoadElIdx != UINT_MAX) { - Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, Elems[LoadElIdx]); - Done[LoadElIdx] = true; + if (LoadMaxUses != nullptr) { + ReplicatedVal = SDValue(LoadMaxUses, 0); + Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, ReplicatedVal); } else { // Try to use VLVGP. unsigned I1 = NumElements / 2 - 1; @@ -4520,7 +4526,7 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, // Use VLVGx to insert the other elements. for (unsigned I = 0; I < NumElements; ++I) - if (!Done[I] && !Elems[I].isUndef()) + if (!Done[I] && !Elems[I].isUndef() && Elems[I] != ReplicatedVal) Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I], DAG.getConstant(I, DL, MVT::i32)); return Result; @@ -5363,6 +5369,46 @@ SDValue SystemZTargetLowering::combineMERGE( return SDValue(); } +SDValue SystemZTargetLowering::combineLOAD( + SDNode *N, DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT LdVT = N->getValueType(0); + if (LdVT.isVector() || LdVT.isInteger()) + return SDValue(); + // Transform a scalar load that is REPLICATEd as well as having other + // use(s) to the form where the other use(s) use the first element of the + // REPLICATE instead of the load. Otherwise instruction selection will not + // produce a VLREP. Avoid extracting to a GPR, so only do this for floating + // point loads. + + SDValue Replicate; + SmallVector OtherUses; + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() == SystemZISD::REPLICATE) { + if (Replicate) + return SDValue(); // Should never happen + Replicate = SDValue(*UI, 0); + } + else if (UI.getUse().getResNo() == 0) + OtherUses.push_back(*UI); + } + if (!Replicate || OtherUses.empty()) + return SDValue(); + + SDLoc DL(N); + SDValue Extract0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, LdVT, + Replicate, DAG.getConstant(0, DL, MVT::i32)); + // Update uses of the loaded Value while preserving old chains. + for (SDNode *U : OtherUses) { + SmallVector Ops; + for (SDValue Op : U->ops()) + Ops.push_back((Op.getNode() == N && Op.getResNo() == 0) ? Extract0 : Op); + DAG.UpdateNodeOperands(U, Ops); + } + return SDValue(N, 0); +} + SDValue SystemZTargetLowering::combineSTORE( SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -5694,6 +5740,7 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND_INREG: return combineSIGN_EXTEND_INREG(N, DCI); case SystemZISD::MERGE_HIGH: case SystemZISD::MERGE_LOW: return combineMERGE(N, DCI); + case ISD::LOAD: return combineLOAD(N, DCI); case ISD::STORE: return combineSTORE(N, DCI); case ISD::EXTRACT_VECTOR_ELT: return combineEXTRACT_VECTOR_ELT(N, DCI); case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 9bf9440794713ea81a3e21c4edfe58a9fc6721e1..172dbeec26d0fab594205743f82d22144940f1c5 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -587,6 +587,7 @@ private: SDValue combineSIGN_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineSIGN_EXTEND_INREG(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineMERGE(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineLOAD(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineSTORE(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineEXTRACT_VECTOR_ELT(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index 76ed6f80ba55575d383d29ace0a9fe2104ec97e1..e9f9188048da8a288ab7362ede6814838b8373e2 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -63,6 +63,10 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg, const LiveRegMatrix *Matrix) const { const MachineRegisterInfo *MRI = &MF.getRegInfo(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + + bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints( + VirtReg, Order, Hints, MF, VRM, Matrix); + if (MRI->getRegClass(VirtReg) == &SystemZ::GRX32BitRegClass) { SmallVector Worklist; SmallSet DoneRegs; @@ -84,8 +88,18 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg, TRI->getCommonSubClass(getRC32(FalseMO, VRM, MRI), getRC32(TrueMO, VRM, MRI)); if (RC && RC != &SystemZ::GRX32BitRegClass) { + // Pass the registers of RC as hints while making sure that if + // any of these registers are copy hints, hint them first. + SmallSet CopyHints; + CopyHints.insert(Hints.begin(), Hints.end()); + Hints.clear(); + for (MCPhysReg Reg : Order) + if (CopyHints.count(Reg) && + RC->contains(Reg) && !MRI->isReserved(Reg)) + Hints.push_back(Reg); for (MCPhysReg Reg : Order) - if (RC->contains(Reg) && !MRI->isReserved(Reg)) + if (!CopyHints.count(Reg) && + RC->contains(Reg) && !MRI->isReserved(Reg)) Hints.push_back(Reg); // Return true to make these hints the only regs available to // RA. This may mean extra spilling but since the alternative is @@ -102,8 +116,7 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg, } } - return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, - VRM, Matrix); + return BaseImplRetVal; } const MCPhysReg * @@ -270,25 +283,30 @@ bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI, // Check that the two virtual registers are local to MBB. MachineBasicBlock *MBB = MI->getParent(); - if (LIS.isLiveInToMBB(IntGR128, MBB) || LIS.isLiveOutOfMBB(IntGR128, MBB) || - LIS.isLiveInToMBB(IntGRNar, MBB) || LIS.isLiveOutOfMBB(IntGRNar, MBB)) + MachineInstr *FirstMI_GR128 = + LIS.getInstructionFromIndex(IntGR128.beginIndex()); + MachineInstr *FirstMI_GRNar = + LIS.getInstructionFromIndex(IntGRNar.beginIndex()); + MachineInstr *LastMI_GR128 = LIS.getInstructionFromIndex(IntGR128.endIndex()); + MachineInstr *LastMI_GRNar = LIS.getInstructionFromIndex(IntGRNar.endIndex()); + if ((!FirstMI_GR128 || FirstMI_GR128->getParent() != MBB) || + (!FirstMI_GRNar || FirstMI_GRNar->getParent() != MBB) || + (!LastMI_GR128 || LastMI_GR128->getParent() != MBB) || + (!LastMI_GRNar || LastMI_GRNar->getParent() != MBB)) return false; - // Find the first and last MIs of the registers. - MachineInstr *FirstMI = nullptr, *LastMI = nullptr; + MachineBasicBlock::iterator MII = nullptr, MEE = nullptr; if (WideOpNo == 1) { - FirstMI = LIS.getInstructionFromIndex(IntGR128.beginIndex()); - LastMI = LIS.getInstructionFromIndex(IntGRNar.endIndex()); + MII = FirstMI_GR128; + MEE = LastMI_GRNar; } else { - FirstMI = LIS.getInstructionFromIndex(IntGRNar.beginIndex()); - LastMI = LIS.getInstructionFromIndex(IntGR128.endIndex()); + MII = FirstMI_GRNar; + MEE = LastMI_GR128; } - assert (FirstMI && LastMI && "No instruction from index?"); // Check if coalescing seems safe by finding the set of clobbered physreg // pairs in the region. BitVector PhysClobbered(getNumRegs()); - MachineBasicBlock::iterator MII = FirstMI, MEE = LastMI; MEE++; for (; MII != MEE; ++MII) { for (const MachineOperand &MO : MII->operands()) diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td index 1a87db6e6605bc52d78410f7331bbe75bad3eb71..74e1dad87908bc82fdf5e13b023745f7d652bc9a 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -609,7 +609,7 @@ def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone], // Compare double and swap def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone2], (instregex "CDS(Y)?$")>; -def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, GroupAlone], +def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, GroupAlone3], (instregex "CDSG$")>; // Compare and swap and store @@ -664,10 +664,10 @@ def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>; def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>; -def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone], +def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone2], (instregex "(A|S|ZA)P$")>; -def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone], (instregex "SRP$")>; +def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone2], (instregex "(M|D)P$")>; +def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone3], (instregex "SRP$")>; def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>; def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>; def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>; @@ -684,7 +684,7 @@ def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>; // Load/store access multiple (not modeled precisely) def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>; -def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STAM(Y)?$")>; +def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STAM(Y)?$")>; //===----------------------------------------------------------------------===// // Program mask and addressing mode @@ -909,7 +909,7 @@ def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>; // Test Data Class def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>; -def : InstRW<[WLat10, LSU2, VecDF4, GroupAlone], (instregex "TCXB$")>; +def : InstRW<[WLat10, LSU, VecDF4, GroupAlone], (instregex "TCXB$")>; //===----------------------------------------------------------------------===// // FP: Floating-point control register instructions @@ -1210,7 +1210,7 @@ def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], (instregex "VLM$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; -def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "VSTM$")>; +def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM$")>; def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>; //===----------------------------------------------------------------------===// @@ -1424,7 +1424,7 @@ def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>; -def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STCT(L|G)$")>; +def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STCT(L|G)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>; def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>; def : InstRW<[WLat30, MCD], (instregex "ESEA$")>; @@ -1468,8 +1468,8 @@ def : InstRW<[WLat30, MCD], (instregex "TPROT$")>; // System: Memory-move Instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone], (instregex "MVC(K|P|S)$")>; -def : InstRW<[WLat1, FXa, LSU5, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone2], (instregex "MVC(K|P|S)$")>; +def : InstRW<[WLat1, FXa, LSU5, GroupAlone2], (instregex "MVC(S|D)K$")>; def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>; def : InstRW<[WLat30, MCD], (instregex "MVPG$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZ14.td b/lib/Target/SystemZ/SystemZScheduleZ14.td index 0f26ffce0e091aa6f19508a5f6f32486285975c4..1962fdf3a1d1fabe3cfea69ef6f16439e3b6a8a7 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ14.td +++ b/lib/Target/SystemZ/SystemZScheduleZ14.td @@ -620,7 +620,7 @@ def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone], def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone2], (instregex "CDS(Y)?$")>; def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, - GroupAlone], (instregex "CDSG$")>; + GroupAlone3], (instregex "CDSG$")>; // Compare and swap and store def : InstRW<[WLat30, MCD], (instregex "CSST$")>; @@ -684,10 +684,10 @@ def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>; def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>; -def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone], +def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone2], (instregex "(A|S|ZA)P$")>; -def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone], (instregex "SRP$")>; +def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone2], (instregex "(M|D)P$")>; +def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone3], (instregex "SRP$")>; def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>; def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>; def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>; @@ -704,7 +704,7 @@ def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>; // Load/store access multiple (not modeled precisely) def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>; -def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STAM(Y)?$")>; +def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STAM(Y)?$")>; //===----------------------------------------------------------------------===// // Program mask and addressing mode @@ -929,7 +929,7 @@ def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>; // Test Data Class def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>; -def : InstRW<[WLat10, LSU2, VecDF4, GroupAlone], (instregex "TCXB$")>; +def : InstRW<[WLat10, LSU, VecDF4, GroupAlone], (instregex "TCXB$")>; //===----------------------------------------------------------------------===// // FP: Floating-point control register instructions @@ -1229,7 +1229,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; -def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "VSTM$")>; +def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM$")>; def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>; @@ -1500,7 +1500,7 @@ def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "V(T|C)P$")>; //===----------------------------------------------------------------------===// def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>; -def : InstRW<[WLat20, GroupAlone], (instregex "LPSW(E)?$")>; +def : InstRW<[WLat20, GroupAlone3], (instregex "LPSW(E)?$")>; def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>; def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>; def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>; @@ -1513,7 +1513,7 @@ def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>; -def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "STCT(L|G)$")>; +def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STCT(L|G)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>; def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>; def : InstRW<[WLat30, MCD], (instregex "ESEA$")>; @@ -1558,8 +1558,8 @@ def : InstRW<[WLat30, MCD], (instregex "TPROT$")>; // System: Memory-move Instructions //===----------------------------------------------------------------------===// -def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone], (instregex "MVC(K|P|S)$")>; -def : InstRW<[WLat1, FXa, LSU5, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone2], (instregex "MVC(K|P|S)$")>; +def : InstRW<[WLat1, FXa, LSU5, GroupAlone2], (instregex "MVC(S|D)K$")>; def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>; def : InstRW<[WLat30, MCD], (instregex "MVPG$")>; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index f3620dcf3b92e8af489a48332b4a154016017723..9596a2b6388d62a756ee918ccc1657ff539ee430 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -128,10 +128,16 @@ static Reloc::Model getEffectiveRelocModel(Optional RM) { // in range of LARL. However, the JIT environment has no equivalent // of copy relocs, so locally-binding data symbols might not be in // the range of LARL. We need the Medium model in that case. -static CodeModel::Model getEffectiveCodeModel(Optional CM, - Reloc::Model RM, bool JIT) { - if (CM) +static CodeModel::Model +getEffectiveSystemZCodeModel(Optional CM, Reloc::Model RM, + bool JIT) { + if (CM) { + if (*CM == CodeModel::Tiny) + report_fatal_error("Target does not support the tiny CodeModel"); + if (*CM == CodeModel::Kernel) + report_fatal_error("Target does not support the kernel CodeModel"); return *CM; + } if (JIT) return RM == Reloc::PIC_ ? CodeModel::Small : CodeModel::Medium; return CodeModel::Small; @@ -146,7 +152,8 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine( T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options, getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM, getEffectiveRelocModel(RM), JIT), OL), + getEffectiveSystemZCodeModel(CM, getEffectiveRelocModel(RM), JIT), + OL), TLOF(llvm::make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index f296d80dbf523061cb4a731bbc1885841f5d4699..129610fe095b03a45f41f1c5f8d675241d8ab59c 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -467,9 +467,6 @@ int SystemZTTIImpl::getArithmeticInstrCost( if (Opcode == Instruction::FRem) return LIBCALL_COST; - if (Opcode == Instruction::LShr || Opcode == Instruction::AShr) - return (ScalarBits >= 32 ? 1 : 2 /*ext*/); - // Or requires one instruction, although it has custom handling for i64. if (Opcode == Instruction::Or) return 1; @@ -484,12 +481,8 @@ int SystemZTTIImpl::getArithmeticInstrCost( return (SignedDivRem ? SDivPow2Cost : 1); if (DivRemConst) return DivMulSeqCost; - if (SignedDivRem) - // sext of op(s) for narrow types - return DivInstrCost + (ScalarBits < 32 ? 3 : (ScalarBits == 32 ? 1 : 0)); - if (UnsignedDivRem) - // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r - return DivInstrCost + (ScalarBits < 32 ? 3 : 1); + if (SignedDivRem || UnsignedDivRem) + return DivInstrCost; } // Fallback to the default implementation. @@ -718,7 +711,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI)) NeedsExtracts = false; - TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts); + TotCost += getScalarizationOverhead(Src, false, NeedsExtracts); + TotCost += getScalarizationOverhead(Dst, NeedsInserts, false); // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4. if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32) @@ -778,6 +772,18 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return BaseT::getCastInstrCost(Opcode, Dst, Src, I); } +// Scalar i8 / i16 operations will typically be made after first extending +// the operands to i32. +static unsigned getOperandsExtensionCost(const Instruction *I) { + unsigned ExtCost = 0; + for (Value *Op : I->operands()) + // A load of i8 or i16 sign/zero extends to i32. + if (!isa(Op) && !isa(Op)) + ExtCost++; + + return ExtCost; +} + int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I) { if (ValTy->isVectorTy()) { @@ -833,9 +839,19 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, else { // Scalar switch (Opcode) { case Instruction::ICmp: { + // A loaded value compared with 0 with multiple users becomes Load and + // Test. The load is then not foldable, so return 0 cost for the ICmp. + unsigned ScalarBits = ValTy->getScalarSizeInBits(); + if (I != nullptr && ScalarBits >= 32) + if (LoadInst *Ld = dyn_cast(I->getOperand(0))) + if (const ConstantInt *C = dyn_cast(I->getOperand(1))) + if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() && + C->getZExtValue() == 0) + return 0; + unsigned Cost = 1; if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16) - Cost += 2; // extend both operands + Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2); return Cost; } case Instruction::Select: @@ -898,17 +914,26 @@ isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) { UserI->getOpcode() == Instruction::UDiv) && UserI->getOperand(1) != FoldedValue) return false; // Not commutative, only RHS foldable. + // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an + // extension was made of the load. + unsigned LoadOrTruncBits = + ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits)); switch (UserI->getOpcode()) { case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64 case Instruction::Sub: + case Instruction::ICmp: if (LoadedBits == 32 && ZExtBits == 64) return true; LLVM_FALLTHROUGH; case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64 - if (LoadedBits == 16 && - (SExtBits == 32 || - (SExtBits == 64 && ST->hasMiscellaneousExtensions2()))) - return true; + if (UserI->getOpcode() != Instruction::ICmp) { + if (LoadedBits == 16 && + (SExtBits == 32 || + (SExtBits == 64 && ST->hasMiscellaneousExtensions2()))) + return true; + if (LoadOrTruncBits == 16) + return true; + } LLVM_FALLTHROUGH; case Instruction::SDiv:// SE: 32->64 if (LoadedBits == 32 && SExtBits == 64) @@ -918,7 +943,6 @@ isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) { case Instruction::And: case Instruction::Or: case Instruction::Xor: - case Instruction::ICmp: // This also makes sense for float operations, but disabled for now due // to regressions. // case Instruction::FCmp: @@ -928,16 +952,27 @@ isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) { // case Instruction::FDiv: // All possible extensions of memory checked above. - if (SExtBits || ZExtBits) - return false; - unsigned LoadOrTruncBits = (TruncBits ? TruncBits : LoadedBits); + // Comparison between memory and immediate. + if (UserI->getOpcode() == Instruction::ICmp) + if (ConstantInt *CI = dyn_cast(UserI->getOperand(1))) + if (isUInt<16>(CI->getZExtValue())) + return true; return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64); break; } return false; } +static bool isBswapIntrinsicCall(const Value *V) { + if (const Instruction *I = dyn_cast(V)) + if (auto *CI = dyn_cast(I)) + if (auto *F = CI->getCalledFunction()) + if (F->getIntrinsicID() == Intrinsic::bswap) + return true; + return false; +} + int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I) { @@ -974,6 +1009,22 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned NumOps = (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src)); + // Store/Load reversed saves one instruction. + if (!Src->isVectorTy() && NumOps == 1 && I != nullptr) { + if (Opcode == Instruction::Load && I->hasOneUse()) { + const Instruction *LdUser = cast(*I->user_begin()); + // In case of load -> bswap -> store, return normal cost for the load. + if (isBswapIntrinsicCall(LdUser) && + (!LdUser->hasOneUse() || !isa(*LdUser->user_begin()))) + return 0; + } + else if (const StoreInst *SI = dyn_cast(I)) { + const Value *StoredVal = SI->getValueOperand(); + if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal)) + return 0; + } + } + if (Src->getScalarSizeInBits() == 128) // 128 bit scalars are held in a pair of two 64 bit registers. NumOps *= 2; @@ -1046,3 +1097,29 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, // Cost of load/store operations and the permutations needed. return NumVectorMemOps + NumPermutes; } + +static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) { + if (RetTy->isVectorTy() && ID == Intrinsic::bswap) + return getNumVectorRegs(RetTy); // VPERM + return -1; +} + +int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args, + FastMathFlags FMF, unsigned VF) { + int Cost = getVectorIntrinsicInstrCost(ID, RetTy); + if (Cost != -1) + return Cost; + return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); +} + +int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Tys, + FastMathFlags FMF, + unsigned ScalarizationCostPassed) { + int Cost = getVectorIntrinsicInstrCost(ID, RetTy); + if (Cost != -1) + return Cost; + return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, + FMF, ScalarizationCostPassed); +} diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index dd85c4ea541717a4d4e0dc832b45abb1bab76587..e79bee1ea3a806dff55092e1a48650d5573278cf 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -98,6 +98,13 @@ public: unsigned AddressSpace, bool UseMaskForCond = false, bool UseMaskForGaps = false); + + int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args, FastMathFlags FMF, + unsigned VF = 1); + int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed = UINT_MAX); /// @} }; diff --git a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index efa6793cff27603a5900c69909cb2500e6a3af3a..f463aeaf8275084008533ea9618a2b91c388ad14 100644 --- a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -38,7 +38,7 @@ namespace { /// WebAssemblyOperand - Instances of this class represent the operands in a /// parsed WASM machine instruction. struct WebAssemblyOperand : public MCParsedAsmOperand { - enum KindTy { Token, Integer, Float, Symbol } Kind; + enum KindTy { Token, Integer, Float, Symbol, BrList } Kind; SMLoc StartLoc, EndLoc; @@ -58,11 +58,16 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { const MCExpr *Exp; }; + struct BrLOp { + std::vector List; + }; + union { struct TokOp Tok; struct IntOp Int; struct FltOp Flt; struct SymOp Sym; + struct BrLOp BrL; }; WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, TokOp T) @@ -73,6 +78,13 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { : Kind(K), StartLoc(Start), EndLoc(End), Flt(F) {} WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End, SymOp S) : Kind(K), StartLoc(Start), EndLoc(End), Sym(S) {} + WebAssemblyOperand(KindTy K, SMLoc Start, SMLoc End) + : Kind(K), StartLoc(Start), EndLoc(End), BrL() {} + + ~WebAssemblyOperand() { + if (isBrList()) + BrL.~BrLOp(); + } bool isToken() const override { return Kind == Token; } bool isImm() const override { @@ -80,6 +92,7 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { } bool isMem() const override { return false; } bool isReg() const override { return false; } + bool isBrList() const { return Kind == BrList; } unsigned getReg() const override { llvm_unreachable("Assembly inspects a register operand"); @@ -111,6 +124,12 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { llvm_unreachable("Should be immediate or symbol!"); } + void addBrListOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && isBrList() && "Invalid BrList!"); + for (auto Br : BrL.List) + Inst.addOperand(MCOperand::createImm(Br)); + } + void print(raw_ostream &OS) const override { switch (Kind) { case Token: @@ -125,6 +144,9 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { case Symbol: OS << "Sym:" << Sym.Exp; break; + case BrList: + OS << "BrList:" << BrL.List.size(); + break; } } }; @@ -132,16 +154,40 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { class WebAssemblyAsmParser final : public MCTargetAsmParser { MCAsmParser &Parser; MCAsmLexer &Lexer; - MCSymbolWasm *LastSymbol; + + // Much like WebAssemblyAsmPrinter in the backend, we have to own these. + std::vector> Signatures; + + // Order of labels, directives and instructions in a .s file have no + // syntactical enforcement. This class is a callback from the actual parser, + // and yet we have to be feeding data to the streamer in a very particular + // order to ensure a correct binary encoding that matches the regular backend + // (the streamer does not enforce this). This "state machine" enum helps + // guarantee that correct order. + enum ParserState { + FileStart, + Label, + FunctionStart, + FunctionLocals, + Instructions, + } CurrentState = FileStart; + + // We track this to see if a .functype following a label is the same, + // as this is how we recognize the start of a function. + MCSymbol *LastLabel = nullptr; public: WebAssemblyAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) : MCTargetAsmParser(Options, STI, MII), Parser(Parser), - Lexer(Parser.getLexer()), LastSymbol(nullptr) { + Lexer(Parser.getLexer()) { setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); } + void addSignature(std::unique_ptr &&Sig) { + Signatures.push_back(std::move(Sig)); + } + #define GET_ASSEMBLER_HEADER #include "WebAssemblyGenAsmMatcher.inc" @@ -151,47 +197,66 @@ public: llvm_unreachable("ParseRegister is not implemented."); } - bool Error(const StringRef &msg, const AsmToken &tok) { - return Parser.Error(tok.getLoc(), msg + tok.getString()); + bool error(const StringRef &Msg, const AsmToken &Tok) { + return Parser.Error(Tok.getLoc(), Msg + Tok.getString()); } - bool IsNext(AsmToken::TokenKind Kind) { - auto ok = Lexer.is(Kind); - if (ok) + bool isNext(AsmToken::TokenKind Kind) { + auto Ok = Lexer.is(Kind); + if (Ok) Parser.Lex(); - return ok; + return Ok; } - bool Expect(AsmToken::TokenKind Kind, const char *KindName) { - if (!IsNext(Kind)) - return Error(std::string("Expected ") + KindName + ", instead got: ", + bool expect(AsmToken::TokenKind Kind, const char *KindName) { + if (!isNext(Kind)) + return error(std::string("Expected ") + KindName + ", instead got: ", Lexer.getTok()); return false; } + StringRef expectIdent() { + if (!Lexer.is(AsmToken::Identifier)) { + error("Expected identifier, got: ", Lexer.getTok()); + return StringRef(); + } + auto Name = Lexer.getTok().getString(); + Parser.Lex(); + return Name; + } - std::pair - ParseRegType(const StringRef &RegType) { - // Derive type from .param .local decls, or the instruction itself. - return StringSwitch>(RegType) - .Case("i32", {MVT::i32, wasm::WASM_TYPE_I32}) - .Case("i64", {MVT::i64, wasm::WASM_TYPE_I64}) - .Case("f32", {MVT::f32, wasm::WASM_TYPE_F32}) - .Case("f64", {MVT::f64, wasm::WASM_TYPE_F64}) - .Case("i8x16", {MVT::v16i8, wasm::WASM_TYPE_V128}) - .Case("i16x8", {MVT::v8i16, wasm::WASM_TYPE_V128}) - .Case("i32x4", {MVT::v4i32, wasm::WASM_TYPE_V128}) - .Case("i64x2", {MVT::v2i64, wasm::WASM_TYPE_V128}) - .Case("f32x4", {MVT::v4f32, wasm::WASM_TYPE_V128}) - .Case("f64x2", {MVT::v2f64, wasm::WASM_TYPE_V128}) - // arbitrarily chosen vector type to associate with "v128" - // FIXME: should these be EVTs to avoid this arbitrary hack? Do we want - // to accept more specific SIMD register types? - .Case("v128", {MVT::v16i8, wasm::WASM_TYPE_V128}) - .Default({MVT::INVALID_SIMPLE_VALUE_TYPE, wasm::WASM_TYPE_NORESULT}); + Optional parseType(const StringRef &Type) { + // FIXME: can't use StringSwitch because wasm::ValType doesn't have a + // "invalid" value. + if (Type == "i32") + return wasm::ValType::I32; + if (Type == "i64") + return wasm::ValType::I64; + if (Type == "f32") + return wasm::ValType::F32; + if (Type == "f64") + return wasm::ValType::F64; + if (Type == "v128" || Type == "i8x16" || Type == "i16x8" || + Type == "i32x4" || Type == "i64x2" || Type == "f32x4" || + Type == "f64x2") + return wasm::ValType::V128; + return Optional(); } - void ParseSingleInteger(bool IsNegative, OperandVector &Operands) { + bool parseRegTypeList(SmallVectorImpl &Types) { + while (Lexer.is(AsmToken::Identifier)) { + auto Type = parseType(Lexer.getTok().getString()); + if (!Type) + return true; + Types.push_back(Type.getValue()); + Parser.Lex(); + if (!isNext(AsmToken::Comma)) + break; + } + return false; + } + + void parseSingleInteger(bool IsNegative, OperandVector &Operands) { auto &Int = Lexer.getTok(); int64_t Val = Int.getIntVal(); if (IsNegative) @@ -202,9 +267,9 @@ public: Parser.Lex(); } - bool ParseOperandStartingWithInteger(bool IsNegative, OperandVector &Operands, + bool parseOperandStartingWithInteger(bool IsNegative, OperandVector &Operands, StringRef InstName) { - ParseSingleInteger(IsNegative, Operands); + parseSingleInteger(IsNegative, Operands); // FIXME: there is probably a cleaner way to do this. auto IsLoadStore = InstName.startswith("load") || InstName.startswith("store") || @@ -214,7 +279,7 @@ public: // Parse load/store operands of the form: offset align auto &Offset = Lexer.getTok(); if (Offset.is(AsmToken::Integer)) { - ParseSingleInteger(false, Operands); + parseSingleInteger(false, Operands); } else { // Alignment not specified. // FIXME: correctly derive a default from the instruction. @@ -233,13 +298,15 @@ public: // Note: Name does NOT point into the sourcecode, but to a local, so // use NameLoc instead. Name = StringRef(NameLoc.getPointer(), Name.size()); + // WebAssembly has instructions with / in them, which AsmLexer parses // as seperate tokens, so if we find such tokens immediately adjacent (no // whitespace), expand the name to include them: for (;;) { auto &Sep = Lexer.getTok(); if (Sep.getLoc().getPointer() != Name.end() || - Sep.getKind() != AsmToken::Slash) break; + Sep.getKind() != AsmToken::Slash) + break; // Extend name with / Name = StringRef(Name.begin(), Name.size() + Sep.getString().size()); Parser.Lex(); @@ -247,10 +314,11 @@ public: auto &Id = Lexer.getTok(); if (Id.getKind() != AsmToken::Identifier || Id.getLoc().getPointer() != Name.end()) - return Error("Incomplete instruction name: ", Id); + return error("Incomplete instruction name: ", Id); Name = StringRef(Name.begin(), Name.size() + Id.getString().size()); Parser.Lex(); } + // Now construct the name as first operand. Operands.push_back(make_unique( WebAssemblyOperand::Token, NameLoc, SMLoc::getFromPointer(Name.end()), @@ -258,6 +326,7 @@ public: auto NamePair = Name.split('.'); // If no '.', there is no type prefix. auto BaseName = NamePair.second.empty() ? NamePair.first : NamePair.second; + while (Lexer.isNot(AsmToken::EndOfStatement)) { auto &Tok = Lexer.getTok(); switch (Tok.getKind()) { @@ -266,7 +335,7 @@ public: const MCExpr *Val; SMLoc End; if (Parser.parsePrimaryExpr(Val, End)) - return Error("Cannot parse symbol: ", Lexer.getTok()); + return error("Cannot parse symbol: ", Lexer.getTok()); Operands.push_back(make_unique( WebAssemblyOperand::Symbol, Id.getLoc(), Id.getEndLoc(), WebAssemblyOperand::SymOp{Val})); @@ -275,33 +344,49 @@ public: case AsmToken::Minus: Parser.Lex(); if (Lexer.isNot(AsmToken::Integer)) - return Error("Expected integer instead got: ", Lexer.getTok()); - if (ParseOperandStartingWithInteger(true, Operands, BaseName)) + return error("Expected integer instead got: ", Lexer.getTok()); + if (parseOperandStartingWithInteger(true, Operands, BaseName)) return true; break; case AsmToken::Integer: - if (ParseOperandStartingWithInteger(false, Operands, BaseName)) + if (parseOperandStartingWithInteger(false, Operands, BaseName)) return true; break; case AsmToken::Real: { double Val; if (Tok.getString().getAsDouble(Val, false)) - return Error("Cannot parse real: ", Tok); + return error("Cannot parse real: ", Tok); Operands.push_back(make_unique( WebAssemblyOperand::Float, Tok.getLoc(), Tok.getEndLoc(), WebAssemblyOperand::FltOp{Val})); Parser.Lex(); break; } + case AsmToken::LCurly: { + Parser.Lex(); + auto Op = make_unique( + WebAssemblyOperand::BrList, Tok.getLoc(), Tok.getEndLoc()); + if (!Lexer.is(AsmToken::RCurly)) + for (;;) { + Op->BrL.List.push_back(Lexer.getTok().getIntVal()); + expect(AsmToken::Integer, "integer"); + if (!isNext(AsmToken::Comma)) + break; + } + expect(AsmToken::RCurly, "}"); + Operands.push_back(std::move(Op)); + break; + } default: - return Error("Unexpected token in operand: ", Tok); + return error("Unexpected token in operand: ", Tok); } if (Lexer.isNot(AsmToken::EndOfStatement)) { - if (Expect(AsmToken::Comma, ",")) + if (expect(AsmToken::Comma, ",")) return true; } } Parser.Lex(); + // Block instructions require a signature index, but these are missing in // assembly, so we add a dummy one explicitly (since we have no control // over signature tables here, we assume these will be regenerated when @@ -315,9 +400,31 @@ public: } void onLabelParsed(MCSymbol *Symbol) override { - LastSymbol = cast(Symbol); + LastLabel = Symbol; + CurrentState = Label; + } + + bool parseSignature(wasm::WasmSignature *Signature) { + if (expect(AsmToken::LParen, "(")) + return true; + if (parseRegTypeList(Signature->Params)) + return true; + if (expect(AsmToken::RParen, ")")) + return true; + if (expect(AsmToken::MinusGreater, "->")) + return true; + if (expect(AsmToken::LParen, "(")) + return true; + if (parseRegTypeList(Signature->Returns)) + return true; + if (expect(AsmToken::RParen, ")")) + return true; + return false; } + // This function processes wasm-specific directives streamed to + // WebAssemblyTargetStreamer, all others go to the generic parser + // (see WasmAsmParser). bool ParseDirective(AsmToken DirectiveID) override { // This function has a really weird return value behavior that is different // from all the other parsing functions: @@ -329,103 +436,89 @@ public: auto &Out = getStreamer(); auto &TOut = reinterpret_cast(*Out.getTargetStreamer()); + // TODO: any time we return an error, at least one token must have been // consumed, otherwise this will not signal an error to the caller. - if (DirectiveID.getString() == ".type") { - // This could be the start of a function, check if followed by - // "label,@function" - if (!Lexer.is(AsmToken::Identifier)) - return Error("Expected label after .type directive, got: ", - Lexer.getTok()); - auto WasmSym = cast( - TOut.getStreamer().getContext().getOrCreateSymbol( - Lexer.getTok().getString())); - Parser.Lex(); - if (!(IsNext(AsmToken::Comma) && IsNext(AsmToken::At) && - Lexer.is(AsmToken::Identifier))) - return Error("Expected label,@type declaration, got: ", Lexer.getTok()); - auto TypeName = Lexer.getTok().getString(); - if (TypeName == "function") - WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); - else if (TypeName == "global") - WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); - else - return Error("Unknown WASM symbol type: ", Lexer.getTok()); - Parser.Lex(); - return Expect(AsmToken::EndOfStatement, "EOL"); - } else if (DirectiveID.getString() == ".size") { - if (!Lexer.is(AsmToken::Identifier)) - return Error("Expected label after .size directive, got: ", - Lexer.getTok()); - auto WasmSym = cast( - TOut.getStreamer().getContext().getOrCreateSymbol( - Lexer.getTok().getString())); - Parser.Lex(); - if (!IsNext(AsmToken::Comma)) - return Error("Expected `,`, got: ", Lexer.getTok()); - const MCExpr *Exp; - if (Parser.parseExpression(Exp)) - return Error("Cannot parse .size expression: ", Lexer.getTok()); - WasmSym->setSize(Exp); - return Expect(AsmToken::EndOfStatement, "EOL"); - } else if (DirectiveID.getString() == ".globaltype") { - if (!Lexer.is(AsmToken::Identifier)) - return Error("Expected symbol name after .globaltype directive, got: ", - Lexer.getTok()); - auto Name = Lexer.getTok().getString(); - Parser.Lex(); - if (!IsNext(AsmToken::Comma)) - return Error("Expected `,`, got: ", Lexer.getTok()); - if (!Lexer.is(AsmToken::Identifier)) - return Error("Expected type in .globaltype directive, got: ", - Lexer.getTok()); - auto Type = ParseRegType(Lexer.getTok().getString()).second; - if (Type == wasm::WASM_TYPE_NORESULT) - return Error("Unknown type in .globaltype directive: ", - Lexer.getTok()); - Parser.Lex(); + if (DirectiveID.getString() == ".globaltype") { + auto SymName = expectIdent(); + if (SymName.empty()) + return true; + if (expect(AsmToken::Comma, ",")) + return true; + auto TypeTok = Lexer.getTok(); + auto TypeName = expectIdent(); + if (TypeName.empty()) + return true; + auto Type = parseType(TypeName); + if (!Type) + return error("Unknown type in .globaltype directive: ", TypeTok); // Now set this symbol with the correct type. auto WasmSym = cast( - TOut.getStreamer().getContext().getOrCreateSymbol(Name)); + TOut.getStreamer().getContext().getOrCreateSymbol(SymName)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); - WasmSym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), true}); + WasmSym->setGlobalType( + wasm::WasmGlobalType{uint8_t(Type.getValue()), true}); // And emit the directive again. TOut.emitGlobalType(WasmSym); - return Expect(AsmToken::EndOfStatement, "EOL"); - } else if (DirectiveID.getString() == ".param" || - DirectiveID.getString() == ".local") { - // Track the number of locals, needed for correct virtual register - // assignment elsewhere. - // Also output a directive to the streamer. - std::vector Params; - std::vector Locals; - while (Lexer.is(AsmToken::Identifier)) { - auto RegType = ParseRegType(Lexer.getTok().getString()).first; - if (RegType == MVT::INVALID_SIMPLE_VALUE_TYPE) - return true; - if (DirectiveID.getString() == ".param") { - Params.push_back(RegType); - } else { - Locals.push_back(RegType); - } - Parser.Lex(); - if (!IsNext(AsmToken::Comma)) - break; + return expect(AsmToken::EndOfStatement, "EOL"); + } + + if (DirectiveID.getString() == ".functype") { + // This code has to send things to the streamer similar to + // WebAssemblyAsmPrinter::EmitFunctionBodyStart. + // TODO: would be good to factor this into a common function, but the + // assembler and backend really don't share any common code, and this code + // parses the locals seperately. + auto SymName = expectIdent(); + if (SymName.empty()) + return true; + auto WasmSym = cast( + TOut.getStreamer().getContext().getOrCreateSymbol(SymName)); + if (CurrentState == Label && WasmSym == LastLabel) { + // This .functype indicates a start of a function. + CurrentState = FunctionStart; } - assert(LastSymbol); - // TODO: LastSymbol isn't even used by emitParam, so could be removed. - TOut.emitParam(LastSymbol, Params); + auto Signature = make_unique(); + if (parseSignature(Signature.get())) + return true; + WasmSym->setSignature(Signature.get()); + addSignature(std::move(Signature)); + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + TOut.emitFunctionType(WasmSym); + // TODO: backend also calls TOut.emitIndIdx, but that is not implemented. + return expect(AsmToken::EndOfStatement, "EOL"); + } + + if (DirectiveID.getString() == ".eventtype") { + auto SymName = expectIdent(); + if (SymName.empty()) + return true; + auto WasmSym = cast( + TOut.getStreamer().getContext().getOrCreateSymbol(SymName)); + auto Signature = make_unique(); + if (parseRegTypeList(Signature->Params)) + return true; + WasmSym->setSignature(Signature.get()); + addSignature(std::move(Signature)); + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_EVENT); + TOut.emitEventType(WasmSym); + // TODO: backend also calls TOut.emitIndIdx, but that is not implemented. + return expect(AsmToken::EndOfStatement, "EOL"); + } + + if (DirectiveID.getString() == ".local") { + if (CurrentState != FunctionStart) + return error(".local directive should follow the start of a function", + Lexer.getTok()); + SmallVector Locals; + if (parseRegTypeList(Locals)) + return true; TOut.emitLocal(Locals); - return Expect(AsmToken::EndOfStatement, "EOL"); - } else { - // TODO: remove. - while (Lexer.isNot(AsmToken::EndOfStatement)) - Parser.Lex(); - return Expect(AsmToken::EndOfStatement, "EOL"); + CurrentState = FunctionLocals; + return expect(AsmToken::EndOfStatement, "EOL"); } - // TODO: current ELF directive parsing is broken, fix this is a followup. - //return true; // We didn't process this directive. - return false; + + return true; // We didn't process this directive. } bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned & /*Opcode*/, @@ -437,6 +530,16 @@ public: MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm); switch (MatchResult) { case Match_Success: { + if (CurrentState == FunctionStart) { + // This is the first instruction in a function, but we haven't seen + // a .local directive yet. The streamer requires locals to be encoded + // as a prelude to the instructions, so emit an empty list of locals + // here. + auto &TOut = reinterpret_cast( + *Out.getTargetStreamer()); + TOut.emitLocal(SmallVector()); + } + CurrentState = Instructions; Out.EmitInstruction(Inst, getSTI()); return false; } diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index 54403a562741ce22d2cf2861bbfc34600ef40a10..c068d6b43d492fbaffa043dcf447862ad0441b05 100644 --- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -36,6 +36,8 @@ using DecodeStatus = MCDisassembler::DecodeStatus; #include "WebAssemblyGenDisassemblerTables.inc" namespace { +static constexpr int WebAssemblyInstructionTableSize = 256; + class WebAssemblyDisassembler final : public MCDisassembler { std::unique_ptr MCII; @@ -74,18 +76,26 @@ static int nextByte(ArrayRef Bytes, uint64_t &Size) { return V; } -static bool parseLEBImmediate(MCInst &MI, uint64_t &Size, - ArrayRef Bytes, bool Signed) { +static bool nextLEB(int64_t &Val, ArrayRef Bytes, uint64_t &Size, + bool Signed = false) { unsigned N = 0; const char *Error = nullptr; - auto Val = Signed ? decodeSLEB128(Bytes.data() + Size, &N, - Bytes.data() + Bytes.size(), &Error) - : static_cast( - decodeULEB128(Bytes.data() + Size, &N, - Bytes.data() + Bytes.size(), &Error)); + Val = Signed ? decodeSLEB128(Bytes.data() + Size, &N, + Bytes.data() + Bytes.size(), &Error) + : static_cast(decodeULEB128(Bytes.data() + Size, &N, + Bytes.data() + Bytes.size(), + &Error)); if (Error) return false; Size += N; + return true; +} + +static bool parseLEBImmediate(MCInst &MI, uint64_t &Size, + ArrayRef Bytes, bool Signed) { + int64_t Val; + if (!nextLEB(Val, Bytes, Size, Signed)) + return false; MI.addOperand(MCOperand::createImm(Val)); return true; } @@ -111,7 +121,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( raw_ostream & /*OS*/, raw_ostream &CS) const { CommentStream = &CS; Size = 0; - auto Opc = nextByte(Bytes, Size); + int Opc = nextByte(Bytes, Size); if (Opc < 0) return MCDisassembler::Fail; const auto *WasmInst = &InstructionTable0[Opc]; @@ -127,10 +137,12 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( } if (!WasmInst) return MCDisassembler::Fail; - Opc = nextByte(Bytes, Size); - if (Opc < 0) + int64_t PrefixedOpc; + if (!nextLEB(PrefixedOpc, Bytes, Size)) + return MCDisassembler::Fail; + if (PrefixedOpc < 0 || PrefixedOpc >= WebAssemblyInstructionTableSize) return MCDisassembler::Fail; - WasmInst += Opc; + WasmInst += PrefixedOpc; } if (WasmInst->ET == ET_Unused) return MCDisassembler::Fail; diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp index e94faa1a21409240048f05b90647801980a48957..eaf9750792bba97c01f93247ab324882d3afd0c8 100644 --- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp +++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp @@ -134,11 +134,19 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, unsigned NumFixedOperands = Desc.NumOperands; SmallSet Printed; for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { - if (!(i < NumFixedOperands - ? (Desc.OpInfo[i].OperandType == - WebAssembly::OPERAND_BASIC_BLOCK) - : (Desc.TSFlags & WebAssemblyII::VariableOpImmediateIsLabel))) - continue; + // See if this operand denotes a basic block target. + if (i < NumFixedOperands) { + // A non-variable_ops operand, check its type. + if (Desc.OpInfo[i].OperandType != WebAssembly::OPERAND_BASIC_BLOCK) + continue; + } else { + // A variable_ops operand, which currently can be immediates (used in + // br_table) which are basic block targets, or for call instructions + // when using -wasm-keep-registers (in which case they are registers, + // and should not be processed). + if (!MI->getOperand(i).isImm()) + continue; + } uint64_t Depth = MI->getOperand(i).getImm(); if (!Printed.insert(Depth).second) continue; @@ -194,9 +202,6 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || - MII.get(MI->getOpcode()).TSFlags == 0) && - "WebAssembly variable_ops register ops don't use TSFlags"); unsigned WAReg = Op.getReg(); if (int(WAReg) >= 0) printRegName(O, WAReg); @@ -210,23 +215,9 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (OpNo < MII.get(MI->getOpcode()).getNumDefs()) O << '='; } else if (Op.isImm()) { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - assert((OpNo < Desc.getNumOperands() || - (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate)) && - "WebAssemblyII::VariableOpIsImmediate should be set for " - "variable_ops immediate ops"); - (void)Desc; - // TODO: (MII.get(MI->getOpcode()).TSFlags & - // WebAssemblyII::VariableOpImmediateIsLabel) - // can tell us whether this is an immediate referencing a label in the - // control flow stack, and it may be nice to pretty-print. O << Op.getImm(); } else if (Op.isFPImm()) { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - assert(OpNo < Desc.getNumOperands() && - "Unexpected floating-point immediate as a non-fixed operand"); - assert(Desc.TSFlags == 0 && - "WebAssembly variable_ops floating point ops don't use TSFlags"); const MCOperandInfo &Info = Desc.OpInfo[OpNo]; if (Info.OperandType == WebAssembly::OPERAND_F32IMM) { // TODO: MC converts all floating point immediate operands to double. @@ -237,16 +228,22 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, O << ::toString(APFloat(Op.getFPImm())); } } else { - assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || - (MII.get(MI->getOpcode()).TSFlags & - WebAssemblyII::VariableOpIsImmediate)) && - "WebAssemblyII::VariableOpIsImmediate should be set for " - "variable_ops expr ops"); assert(Op.isExpr() && "unknown operand kind in printOperand"); Op.getExpr()->print(O, &MAI); } } +void WebAssemblyInstPrinter::printBrList(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << "{"; + for (unsigned I = OpNo, E = MI->getNumOperands(); I != E; ++I) { + if (I != OpNo) + O << ", "; + O << MI->getOperand(I).getImm(); + } + O << "}"; +} + void WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h index ded64f9a6e9b83a046f330ec112c1a732963a2c1..fd968bc9ea304be46d670023d6365143f5273a64 100644 --- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h +++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h @@ -43,6 +43,7 @@ public: // Used by tblegen code. void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printBrList(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printWebAssemblyP2AlignOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo, diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index fa399371f2cba73a8918fc366aae6a26f55c2ab5..065a4dc94ca6a0bf09d0a7458cc19c63625444f2 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -67,7 +67,8 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( OS << uint8_t(Binary); } else { assert(Binary <= UINT16_MAX && "Several-byte opcodes not supported yet"); - OS << uint8_t(Binary >> 8) << uint8_t(Binary); + OS << uint8_t(Binary >> 8); + encodeULEB128(uint8_t(Binary), OS); } // For br_table instructions, encode the size of the table. In the MCInst, @@ -85,10 +86,9 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( const MCOperand &MO = MI.getOperand(i); if (MO.isReg()) { /* nothing to encode */ + } else if (MO.isImm()) { if (i < Desc.getNumOperands()) { - assert(Desc.TSFlags == 0 && - "WebAssembly non-variable_ops don't use TSFlags"); const MCOperandInfo &Info = Desc.OpInfo[i]; LLVM_DEBUG(dbgs() << "Encoding immediate: type=" << int(Info.OperandType) << "\n"); @@ -123,15 +123,10 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( encodeULEB128(uint64_t(MO.getImm()), OS); } } else { - assert(Desc.TSFlags == (WebAssemblyII::VariableOpIsImmediate | - WebAssemblyII::VariableOpImmediateIsLabel)); encodeULEB128(uint64_t(MO.getImm()), OS); } + } else if (MO.isFPImm()) { - assert(i < Desc.getNumOperands() && - "Unexpected floating-point immediate as a non-fixed operand"); - assert(Desc.TSFlags == 0 && - "WebAssembly variable_ops floating point ops don't use TSFlags"); const MCOperandInfo &Info = Desc.OpInfo[i]; if (Info.OperandType == WebAssembly::OPERAND_F32IMM) { // TODO: MC converts all floating point immediate operands to double. @@ -143,22 +138,27 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( double d = MO.getFPImm(); support::endian::write(OS, d, support::little); } + } else if (MO.isExpr()) { const MCOperandInfo &Info = Desc.OpInfo[i]; llvm::MCFixupKind FixupKind; size_t PaddedSize = 5; - if (Info.OperandType == WebAssembly::OPERAND_I32IMM) { + switch (Info.OperandType) { + case WebAssembly::OPERAND_I32IMM: FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i32); - } else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) { + break; + case WebAssembly::OPERAND_I64IMM: FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i64); PaddedSize = 10; - } else if (Info.OperandType == WebAssembly::OPERAND_FUNCTION32 || - Info.OperandType == WebAssembly::OPERAND_OFFSET32 || - Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) { + break; + case WebAssembly::OPERAND_FUNCTION32: + case WebAssembly::OPERAND_OFFSET32: + case WebAssembly::OPERAND_TYPEINDEX: + case WebAssembly::OPERAND_GLOBAL: + case WebAssembly::OPERAND_EVENT: FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32); - } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) { - FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32); - } else { + break; + default: llvm_unreachable("unexpected symbolic operand kind"); } Fixups.push_back(MCFixup::create(OS.tell() - Start, MO.getExpr(), diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 24dd5238f71334146c357004d6aa54e6c093ae06..390f367c29785231867e031f95c1173397026694 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -90,6 +90,10 @@ static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, return new WebAssemblyTargetAsmStreamer(S, OS); } +static MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) { + return new WebAssemblyTargetNullStreamer(S); +} + // Force static initialization. extern "C" void LLVMInitializeWebAssemblyTargetMC() { for (Target *T : @@ -120,6 +124,8 @@ extern "C" void LLVMInitializeWebAssemblyTargetMC() { createObjectTargetStreamer); // Register the asm target streamer. TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer); + // Register the null target streamer. + TargetRegistry::RegisterNullTargetStreamer(*T, createNullTargetStreamer); } } diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 730bb8d6c79c25d5bf3cdf3f53ec5db722da1554..4ca20f5936831b3133da63444b713052b0f14981 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -77,18 +77,12 @@ enum OperandType { OPERAND_SIGNATURE, /// type signature immediate for call_indirect. OPERAND_TYPEINDEX, + /// Event index. + OPERAND_EVENT, }; } // end namespace WebAssembly namespace WebAssemblyII { -enum { - // For variadic instructions, this flag indicates whether an operand - // in the variable_ops range is an immediate value. - VariableOpIsImmediate = (1 << 0), - // For immediate values in the variable_ops range, this flag indicates - // whether the value represents a control-flow label. - VariableOpImmediateIsLabel = (1 << 1) -}; /// Target Operand Flag enum. enum TOF { @@ -97,7 +91,8 @@ enum TOF { // Flags to indicate the type of the symbol being referenced MO_SYMBOL_FUNCTION = 0x1, MO_SYMBOL_GLOBAL = 0x2, - MO_SYMBOL_MASK = 0x3, + MO_SYMBOL_EVENT = 0x4, + MO_SYMBOL_MASK = 0x7, }; } // end namespace WebAssemblyII diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp index 4c4ca4e599c67c53b41731f058ac92ed31d933f5..70ac50272545b9cde60550bc5d3f6168e81e69d0 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp @@ -39,74 +39,80 @@ WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer( WebAssemblyTargetWasmStreamer::WebAssemblyTargetWasmStreamer(MCStreamer &S) : WebAssemblyTargetStreamer(S) {} -static void PrintTypes(formatted_raw_ostream &OS, ArrayRef Types) { +static void printTypes(formatted_raw_ostream &OS, + ArrayRef Types) { bool First = true; - for (MVT Type : Types) { + for (auto Type : Types) { if (First) First = false; else OS << ", "; - OS << WebAssembly::TypeToString(WebAssembly::toValType(Type)); + OS << WebAssembly::TypeToString(Type); } OS << '\n'; } -void WebAssemblyTargetAsmStreamer::emitParam(MCSymbol *Symbol, - ArrayRef Types) { +void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef Types) { if (!Types.empty()) { - OS << "\t.param \t"; - - // FIXME: Currently this applies to the "current" function; it may - // be cleaner to specify an explicit symbol as part of the directive. - - PrintTypes(OS, Types); + OS << "\t.local \t"; + printTypes(OS, Types); } } -void WebAssemblyTargetAsmStreamer::emitResult(MCSymbol *Symbol, - ArrayRef Types) { - if (!Types.empty()) { - OS << "\t.result \t"; +void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; } - // FIXME: Currently this applies to the "current" function; it may - // be cleaner to specify an explicit symbol as part of the directive. +void WebAssemblyTargetAsmStreamer::emitSignature( + const wasm::WasmSignature *Sig) { + OS << "("; + emitParamList(Sig); + OS << ") -> ("; + emitReturnList(Sig); + OS << ")"; +} - PrintTypes(OS, Types); +void WebAssemblyTargetAsmStreamer::emitParamList( + const wasm::WasmSignature *Sig) { + auto &Params = Sig->Params; + for (auto &Ty : Params) { + if (&Ty != &Params[0]) + OS << ", "; + OS << WebAssembly::TypeToString(Ty); } } -void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef Types) { - if (!Types.empty()) { - OS << "\t.local \t"; - PrintTypes(OS, Types); +void WebAssemblyTargetAsmStreamer::emitReturnList( + const wasm::WasmSignature *Sig) { + auto &Returns = Sig->Returns; + for (auto &Ty : Returns) { + if (&Ty != &Returns[0]) + OS << ", "; + OS << WebAssembly::TypeToString(Ty); } } -void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; } - -void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType( - MCSymbolWasm *Symbol) { - OS << "\t.functype\t" << Symbol->getName(); - if (Symbol->getSignature()->Returns.empty()) - OS << ", void"; - else { - assert(Symbol->getSignature()->Returns.size() == 1); - OS << ", " - << WebAssembly::TypeToString(Symbol->getSignature()->Returns.front()); - } - for (auto Ty : Symbol->getSignature()->Params) - OS << ", " << WebAssembly::TypeToString(Ty); - OS << '\n'; +void WebAssemblyTargetAsmStreamer::emitFunctionType(const MCSymbolWasm *Sym) { + assert(Sym->isFunction()); + OS << "\t.functype\t" << Sym->getName() << " "; + emitSignature(Sym->getSignature()); + OS << "\n"; } -void WebAssemblyTargetAsmStreamer::emitGlobalType(MCSymbolWasm *Sym) { +void WebAssemblyTargetAsmStreamer::emitGlobalType(const MCSymbolWasm *Sym) { + assert(Sym->isGlobal()); OS << "\t.globaltype\t" << Sym->getName() << ", " << WebAssembly::TypeToString( static_cast(Sym->getGlobalType().Type)) << '\n'; } -void WebAssemblyTargetAsmStreamer::emitImportModule(MCSymbolWasm *Sym, +void WebAssemblyTargetAsmStreamer::emitEventType(const MCSymbolWasm *Sym) { + assert(Sym->isEvent()); + OS << "\t.eventtype\t" << Sym->getName() << " "; + emitParamList(Sym->getSignature()); + OS << "\n"; +} + +void WebAssemblyTargetAsmStreamer::emitImportModule(const MCSymbolWasm *Sym, StringRef ModuleName) { OS << "\t.import_module\t" << Sym->getName() << ", " << ModuleName << '\n'; } @@ -115,19 +121,9 @@ void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) { OS << "\t.indidx \t" << *Value << '\n'; } -void WebAssemblyTargetWasmStreamer::emitParam(MCSymbol *Symbol, - ArrayRef Types) { - // The Symbol already has its signature -} - -void WebAssemblyTargetWasmStreamer::emitResult(MCSymbol *Symbol, - ArrayRef Types) { - // The Symbol already has its signature -} - -void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef Types) { - SmallVector, 4> Grouped; - for (MVT Type : Types) { +void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef Types) { + SmallVector, 4> Grouped; + for (auto Type : Types) { if (Grouped.empty() || Grouped.back().first != Type) Grouped.push_back(std::make_pair(Type, 1)); else @@ -137,7 +133,7 @@ void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef Types) { Streamer.EmitULEB128IntValue(Grouped.size()); for (auto Pair : Grouped) { Streamer.EmitULEB128IntValue(Pair.second); - emitValueType(WebAssembly::toValType(Pair.first)); + emitValueType(Pair.first); } } @@ -148,18 +144,3 @@ void WebAssemblyTargetWasmStreamer::emitEndFunc() { void WebAssemblyTargetWasmStreamer::emitIndIdx(const MCExpr *Value) { llvm_unreachable(".indidx encoding not yet implemented"); } - -void WebAssemblyTargetWasmStreamer::emitIndirectFunctionType( - MCSymbolWasm *Symbol) { - // Symbol already has its arguments and result set. - Symbol->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); -} - -void WebAssemblyTargetWasmStreamer::emitGlobalType(MCSymbolWasm *Sym) { - // Not needed. -} - -void WebAssemblyTargetWasmStreamer::emitImportModule(MCSymbolWasm *Sym, - StringRef ModuleName) { - Sym->setModuleName(ModuleName); -} diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h index e60158b5defb90369d2113a4a29d2a58a908563b..3073938118b458be1e3e0c9e9ecffd6b91ed47ce 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h @@ -31,22 +31,21 @@ class WebAssemblyTargetStreamer : public MCTargetStreamer { public: explicit WebAssemblyTargetStreamer(MCStreamer &S); - /// .param - virtual void emitParam(MCSymbol *Symbol, ArrayRef Types) = 0; - /// .result - virtual void emitResult(MCSymbol *Symbol, ArrayRef Types) = 0; /// .local - virtual void emitLocal(ArrayRef Types) = 0; + virtual void emitLocal(ArrayRef Types) = 0; /// .endfunc virtual void emitEndFunc() = 0; /// .functype - virtual void emitIndirectFunctionType(MCSymbolWasm *Symbol) = 0; + virtual void emitFunctionType(const MCSymbolWasm *Sym) = 0; /// .indidx virtual void emitIndIdx(const MCExpr *Value) = 0; /// .globaltype - virtual void emitGlobalType(MCSymbolWasm *Sym) = 0; + virtual void emitGlobalType(const MCSymbolWasm *Sym) = 0; + /// .eventtype + virtual void emitEventType(const MCSymbolWasm *Sym) = 0; /// .import_module - virtual void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) = 0; + virtual void emitImportModule(const MCSymbolWasm *Sym, + StringRef ModuleName) = 0; protected: void emitValueType(wasm::ValType Type); @@ -55,18 +54,20 @@ protected: /// This part is for ascii assembly output class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer { formatted_raw_ostream &OS; + void emitSignature(const wasm::WasmSignature *Sig); + void emitParamList(const wasm::WasmSignature *Sig); + void emitReturnList(const wasm::WasmSignature *Sig); public: WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); - void emitParam(MCSymbol *Symbol, ArrayRef Types) override; - void emitResult(MCSymbol *Symbol, ArrayRef Types) override; - void emitLocal(ArrayRef Types) override; + void emitLocal(ArrayRef Types) override; void emitEndFunc() override; - void emitIndirectFunctionType(MCSymbolWasm *Symbol) override; + void emitFunctionType(const MCSymbolWasm *Sym) override; void emitIndIdx(const MCExpr *Value) override; - void emitGlobalType(MCSymbolWasm *Sym) override; - void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override; + void emitGlobalType(const MCSymbolWasm *Sym) override; + void emitEventType(const MCSymbolWasm *Sym) override; + void emitImportModule(const MCSymbolWasm *Sym, StringRef ModuleName) override; }; /// This part is for Wasm object output @@ -74,14 +75,29 @@ class WebAssemblyTargetWasmStreamer final : public WebAssemblyTargetStreamer { public: explicit WebAssemblyTargetWasmStreamer(MCStreamer &S); - void emitParam(MCSymbol *Symbol, ArrayRef Types) override; - void emitResult(MCSymbol *Symbol, ArrayRef Types) override; - void emitLocal(ArrayRef Types) override; + void emitLocal(ArrayRef Types) override; void emitEndFunc() override; - void emitIndirectFunctionType(MCSymbolWasm *Symbol) override; + void emitFunctionType(const MCSymbolWasm *Sym) override {} void emitIndIdx(const MCExpr *Value) override; - void emitGlobalType(MCSymbolWasm *Sym) override; - void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override; + void emitGlobalType(const MCSymbolWasm *Sym) override {} + void emitEventType(const MCSymbolWasm *Sym) override {} + void emitImportModule(const MCSymbolWasm *Sym, + StringRef ModuleName) override {} +}; + +/// This part is for null output +class WebAssemblyTargetNullStreamer final : public WebAssemblyTargetStreamer { +public: + explicit WebAssemblyTargetNullStreamer(MCStreamer &S) + : WebAssemblyTargetStreamer(S) {} + + void emitLocal(ArrayRef) override {} + void emitEndFunc() override {} + void emitFunctionType(const MCSymbolWasm *) override {} + void emitIndIdx(const MCExpr *) override {} + void emitGlobalType(const MCSymbolWasm *) override {} + void emitEventType(const MCSymbolWasm *) override {} + void emitImportModule(const MCSymbolWasm *, StringRef) override {} }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index d9ed146ede7d423504aec4148d7baf2c8a830959..763e30be8e02002f241450e1ccb8e967a37d9cb1 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -86,6 +86,11 @@ static bool IsGlobalType(const MCValue &Target) { return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_GLOBAL; } +static bool IsEventType(const MCValue &Target) { + const MCSymbolRefExpr *RefA = Target.getSymA(); + return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_EVENT; +} + unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target, const MCFixup &Fixup) const { // WebAssembly functions are not allocated in the data address space. To @@ -106,6 +111,8 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target, return wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB; if (IsFunction) return wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB; + if (IsEventType(Target)) + return wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB; return wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB; case FK_Data_4: if (IsFunction) diff --git a/lib/Target/WebAssembly/WebAssembly.td b/lib/Target/WebAssembly/WebAssembly.td index 2f301da8e4223964f353cdcf08529c01ea0f22dd..ec9dbffde7f2f6f4ce0b08c900c791a15ec6156f 100644 --- a/lib/Target/WebAssembly/WebAssembly.td +++ b/lib/Target/WebAssembly/WebAssembly.td @@ -71,7 +71,8 @@ def : ProcessorModel<"generic", NoSchedModel, []>; // Latest and greatest experimental version of WebAssembly. Bugs included! def : ProcessorModel<"bleeding-edge", NoSchedModel, - [FeatureSIMD128, FeatureAtomics]>; + [FeatureSIMD128, FeatureAtomics, + FeatureNontrappingFPToInt, FeatureSignExt]>; //===----------------------------------------------------------------------===// // Target Declaration diff --git a/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp b/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp index a4d276c7936525cfa258130bf0f08857b11d53e1..3f17160964b6e537f96c664ede240be6802940d4 100644 --- a/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp @@ -90,16 +90,23 @@ bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) { Function *NewF = nullptr; for (Use &U : F.uses()) { LLVM_DEBUG(dbgs() << "prototype-less use: " << F.getName() << "\n"); - if (BitCastOperator *BC = dyn_cast(U.getUser())) { - FunctionType *DestType = - cast(BC->getDestTy()->getPointerElementType()); - - // Create a new function with the correct type - NewType = DestType; - NewF = Function::Create(NewType, F.getLinkage(), F.getName()); - NewF->setAttributes(F.getAttributes()); - NewF->removeFnAttr("no-prototype"); - break; + if (auto *BC = dyn_cast(U.getUser())) { + if (auto *DestType = dyn_cast( + BC->getDestTy()->getPointerElementType())) { + if (!NewType) { + // Create a new function with the correct type + NewType = DestType; + NewF = Function::Create(NewType, F.getLinkage(), F.getName()); + NewF->setAttributes(F.getAttributes()); + NewF->removeFnAttr("no-prototype"); + } else { + if (NewType != DestType) { + report_fatal_error("Prototypeless function used with " + "conflicting signatures: " + + F.getName()); + } + } + } } } @@ -110,28 +117,38 @@ bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) { continue; } - for (Use &U : F.uses()) { - if (BitCastOperator *BC = dyn_cast(U.getUser())) { - FunctionType *DestType = - cast(BC->getDestTy()->getPointerElementType()); - if (NewType != DestType) { - report_fatal_error( - "Prototypeless function used with conflicting signatures: " + - F.getName()); - } - BC->replaceAllUsesWith(NewF); - Replacements.emplace_back(&F, NewF); - } else { - dbgs() << *U.getUser()->getType() << "\n"; + SmallVector DeadInsts; + + for (Use &US : F.uses()) { + User *U = US.getUser(); + if (auto *BC = dyn_cast(U)) { + if (auto *Inst = dyn_cast(U)) { + // Replace with a new bitcast + IRBuilder<> Builder(Inst); + Value *NewCast = Builder.CreatePointerCast(NewF, BC->getDestTy()); + Inst->replaceAllUsesWith(NewCast); + DeadInsts.push_back(Inst); + } else if (auto *Const = dyn_cast(U)) { + Constant *NewConst = + ConstantExpr::getPointerCast(NewF, BC->getDestTy()); + Const->replaceAllUsesWith(NewConst); + } else { + dbgs() << *U->getType() << "\n"; #ifndef NDEBUG - U.getUser()->dump(); + U->dump(); #endif - report_fatal_error( - "unexpected use of prototypeless function: " + F.getName() + "\n"); + report_fatal_error("unexpected use of prototypeless function: " + + F.getName() + "\n"); + } } } + + for (auto I : DeadInsts) + I->eraseFromParent(); + Replacements.emplace_back(&F, NewF); } + // Finally replace the old function declarations with the new ones for (auto &Pair : Replacements) { Function *Old = Pair.first; diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 1e21ab92b62997ab3488284e5eedc07fa00f2968..c4f03dfa7f9e451ed81859023d2c64cde7bb15f7 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -79,11 +79,12 @@ WebAssemblyTargetStreamer *WebAssemblyAsmPrinter::getTargetStreamer() { void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) { for (auto &It : OutContext.getSymbols()) { - // Emit a .globaltype declaration. + // Emit a .globaltype and .eventtype declaration. auto Sym = cast(It.getValue()); - if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_GLOBAL) { + if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_GLOBAL) getTargetStreamer()->emitGlobalType(Sym); - } + else if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_EVENT) + getTargetStreamer()->emitEventType(Sym); } for (const auto &F : M) { @@ -93,6 +94,7 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) { SmallVector Params; ComputeSignatureVTs(F.getFunctionType(), F, TM, Params, Results); auto *Sym = cast(getSymbol(&F)); + Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); if (!Sym->getSignature()) { auto Signature = SignatureFromMVTs(Results, Params); Sym->setSignature(Signature.get()); @@ -103,12 +105,13 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) { // infer the type from a call). With object files it applies to all // imports. so fix the names and the tests, or rethink how import // delcarations work in asm files. - getTargetStreamer()->emitIndirectFunctionType(Sym); + getTargetStreamer()->emitFunctionType(Sym); if (TM.getTargetTriple().isOSBinFormatWasm() && F.hasFnAttribute("wasm-import-module")) { StringRef Name = F.getFnAttribute("wasm-import-module").getValueAsString(); + Sym->setModuleName(Name); getTargetStreamer()->emitImportModule(Sym, Name); } } @@ -163,9 +166,10 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() { auto *WasmSym = cast(CurrentFnSym); WasmSym->setSignature(Signature.get()); addSignature(std::move(Signature)); + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); // FIXME: clean up how params and results are emitted (use signatures) - getTargetStreamer()->emitParam(CurrentFnSym, ParamVTs); + getTargetStreamer()->emitFunctionType(WasmSym); // Emit the function index. if (MDNode *Idx = F.getMetadata("wasm.index")) { @@ -175,8 +179,9 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() { cast(Idx->getOperand(0))->getValue())); } - getTargetStreamer()->emitResult(CurrentFnSym, ResultVTs); - getTargetStreamer()->emitLocal(MFI->getLocals()); + SmallVector Locals; + ValTypesFromMVTs(MFI->getLocals(), Locals); + getTargetStreamer()->emitLocal(Locals); AsmPrinter::EmitFunctionBodyStart(); } diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index b1955017c687d5c56900be37994016813b767d39..f8f5f4040c86a3e9427c9ff052e672049fadae07 100644 --- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -78,12 +78,11 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { // map DenseMap BeginToBottom; - // Helper functions to register / unregister scope information created by - // marker instructions. + // Helper functions to register scope information created by marker + // instructions. void registerScope(MachineInstr *Begin, MachineInstr *End); void registerTryScope(MachineInstr *Begin, MachineInstr *End, MachineBasicBlock *EHPad); - void unregisterScope(MachineInstr *Begin); MachineBasicBlock *getBottom(const MachineInstr *Begin); @@ -182,23 +181,6 @@ void WebAssemblyCFGStackify::registerTryScope(MachineInstr *Begin, EHPadToTry[EHPad] = Begin; } -void WebAssemblyCFGStackify::unregisterScope(MachineInstr *Begin) { - assert(BeginToEnd.count(Begin)); - MachineInstr *End = BeginToEnd[Begin]; - assert(EndToBegin.count(End)); - BeginToEnd.erase(Begin); - EndToBegin.erase(End); - MachineBasicBlock *EHPad = TryToEHPad.lookup(Begin); - if (EHPad) { - assert(EHPadToTry.count(EHPad)); - TryToEHPad.erase(Begin); - EHPadToTry.erase(EHPad); - } - MachineBasicBlock *Bottom = BeginToBottom.lookup(Begin); - if (Bottom) - BeginToBottom.erase(Begin); -} - // Given a LOOP/TRY marker, returns its bottom BB. Use cached information if any // to prevent recomputation. MachineBasicBlock * diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def index 444a087605ef6fa92c65d30fea03fa19450e995b..e987d7f7f43a12946c8c4a3b4400df45de9a03e4 100644 --- a/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/lib/Target/WebAssembly/WebAssemblyISD.def @@ -25,5 +25,6 @@ HANDLE_NODETYPE(SHUFFLE) HANDLE_NODETYPE(VEC_SHL) HANDLE_NODETYPE(VEC_SHR_S) HANDLE_NODETYPE(VEC_SHR_U) +HANDLE_NODETYPE(THROW) // add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here... diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 578d23570f85c03073cba1bec336c1930fc2a4d9..a0382157f76918dfe6ed4cf59d997b9b8a889e88 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/WasmEHFuncInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" @@ -122,14 +123,22 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( for (auto Op : {ISD::SADDSAT, ISD::UADDSAT}) setOperationAction(Op, T, Legal); - for (auto T : {MVT::i32, MVT::i64}) { - // Expand unavailable integer operations. - for (auto Op : - {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU, - ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS, ISD::SRA_PARTS, - ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC, ISD::SUBE}) { + // Expand unavailable integer operations. + for (auto Op : + {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU, + ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS, ISD::SRA_PARTS, + ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC, ISD::SUBE}) { + for (auto T : {MVT::i32, MVT::i64}) { setOperationAction(Op, T, Expand); } + if (Subtarget->hasSIMD128()) { + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) { + setOperationAction(Op, T, Expand); + } + if (EnableUnimplementedWasmSIMDInstrs) { + setOperationAction(Op, MVT::v2i64, Expand); + } + } } // There is no i64x2.mul instruction @@ -156,14 +165,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(Op, MVT::v2i64, Custom); } - // There is no select instruction for vectors - if (Subtarget->hasSIMD128()) { - for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) - setOperationAction(ISD::VSELECT, T, Expand); - if (EnableUnimplementedWasmSIMDInstrs) - for (auto T : {MVT::v2i64, MVT::v2f64}) - setOperationAction(ISD::VSELECT, T, Expand); - } + // There are no select instructions for vectors + if (Subtarget->hasSIMD128()) + for (auto Op : {ISD::VSELECT, ISD::SELECT_CC, ISD::SELECT}) { + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) + setOperationAction(Op, T, Expand); + if (EnableUnimplementedWasmSIMDInstrs) + for (auto T : {MVT::v2i64, MVT::v2f64}) + setOperationAction(Op, T, Expand); + } // As a special case, these operators use the type to mean the type to // sign-extend from. @@ -233,6 +243,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // Exception handling intrinsics setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setMaxAtomicSizeInBitsSupported(64); } @@ -881,6 +892,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op, case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: return LowerAccessVectorElement(Op, DAG); + case ISD::INTRINSIC_VOID: + return LowerINTRINSIC_VOID(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::SHL: @@ -1044,6 +1057,44 @@ WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } +SDValue +WebAssemblyTargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); + SDLoc DL(Op); + + switch (IntNo) { + default: + return {}; // Don't custom lower most intrinsics. + + case Intrinsic::wasm_throw: { + int Tag = cast(Op.getOperand(2).getNode())->getZExtValue(); + switch (Tag) { + case CPP_EXCEPTION: { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + const char *SymName = MF.createExternalSymbolName("__cpp_exception"); + SDValue SymNode = + DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT, + DAG.getTargetExternalSymbol( + SymName, PtrVT, WebAssemblyII::MO_SYMBOL_EVENT)); + return DAG.getNode(WebAssemblyISD::THROW, DL, + MVT::Other, // outchain type + { + Op.getOperand(0), // inchain + SymNode, // exception symbol + Op.getOperand(3) // thrown value + }); + } + default: + llvm_unreachable("Invalid tag!"); + } + break; + } + } +} + SDValue WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 5182a58efc784839ed0216a613b7e44b8d947e29..80076818de7335293226909c1a7047b6a2cb2805 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -98,6 +98,7 @@ private: SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 0af94ef875518d0484e6e2671b09d9dfb53a269b..a7fc7564009007474825f53e500007dc8bf5deb7 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -33,11 +33,17 @@ def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst), def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst), (BR_UNLESS bb_op:$dst, I32:$cond)>; +// A list of branch targets enclosed in {} and separated by comma. +// Used by br_table only. +def BrListAsmOperand : AsmOperandClass { let Name = "BrList"; } +def brlist : Operand { + let ParserMatchClass = BrListAsmOperand; + let PrintMethod = "printBrList"; +} + // TODO: SelectionDAG's lowering insists on using a pointer as the index for // jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode // currently. -// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates. -// Set TSFlags{1} to 1 to indicate that the immediates represent labels. // FIXME: this can't inherit from I<> since there is no way to inherit from a // multiclass and still have the let statements. let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { @@ -45,29 +51,19 @@ let isCodeGenOnly = 1 in def BR_TABLE_I32 : NI<(outs), (ins I32:$index, variable_ops), [(WebAssemblybr_table I32:$index)], "false", "br_table \t$index", 0x0e> { - let TSFlags{0} = 1; - let TSFlags{1} = 1; } let BaseName = "BR_TABLE_I32" in -def BR_TABLE_I32_S : NI<(outs), (ins variable_ops), - [], "true", - "br_table \t", 0x0e> { - let TSFlags{0} = 1; - let TSFlags{1} = 1; +def BR_TABLE_I32_S : NI<(outs), (ins brlist:$brl), [], "true", + "br_table \t$brl", 0x0e> { } let isCodeGenOnly = 1 in def BR_TABLE_I64 : NI<(outs), (ins I64:$index, variable_ops), [(WebAssemblybr_table I64:$index)], "false", "br_table \t$index"> { - let TSFlags{0} = 1; - let TSFlags{1} = 1; } let BaseName = "BR_TABLE_I64" in -def BR_TABLE_I64_S : NI<(outs), (ins variable_ops), - [], "true", - "br_table \t"> { - let TSFlags{0} = 1; - let TSFlags{1} = 1; +def BR_TABLE_I64_S : NI<(outs), (ins brlist:$brl), [], "true", + "br_table \t$brl"> { } } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 @@ -146,14 +142,16 @@ let Predicates = [HasExceptionHandling] in { // Throwing an exception: throw / rethrow let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { -defm THROW_I32 : I<(outs), (ins i32imm:$tag, I32:$val), - (outs), (ins i32imm:$tag), - [(int_wasm_throw imm:$tag, I32:$val)], +defm THROW_I32 : I<(outs), (ins event_op:$tag, I32:$val), + (outs), (ins event_op:$tag), + [(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag), + I32:$val)], "throw \t$tag, $val", "throw \t$tag", 0x08>; -defm THROW_I64 : I<(outs), (ins i32imm:$tag, I64:$val), - (outs), (ins i32imm:$tag), - [(int_wasm_throw imm:$tag, I64:$val)], +defm THROW_I64 : I<(outs), (ins event_op:$tag, I64:$val), + (outs), (ins event_op:$tag), + [(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag), + I64:$val)], "throw \t$tag, $val", "throw \t$tag", 0x08>; defm RETHROW : NRI<(outs), (ins bb_op:$dst), [], "rethrow \t$dst", 0x09>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 8fff924265ff3133c99054732d305c45ee137460..085c5a77e68af79de223bdd232c0341b3608b0e6 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -64,6 +64,7 @@ def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>; def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>; def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; +def SDT_WebAssemblyThrow : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; //===----------------------------------------------------------------------===// // WebAssembly-specific DAG Nodes. @@ -90,6 +91,8 @@ def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN", SDT_WebAssemblyReturn, [SDNPHasChain]>; def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", SDT_WebAssemblyWrapper>; +def WebAssemblythrow : SDNode<"WebAssemblyISD::THROW", SDT_WebAssemblyThrow, + [SDNPHasChain]>; //===----------------------------------------------------------------------===// // WebAssembly-specific Operands. @@ -140,6 +143,10 @@ let OperandType = "OPERAND_P2ALIGN" in { def P2Align : Operand { let PrintMethod = "printWebAssemblyP2AlignOperand"; } + +let OperandType = "OPERAND_EVENT" in +def event_op : Operand; + } // OperandType = "OPERAND_P2ALIGN" let OperandType = "OPERAND_SIGNATURE" in { diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index caad638e9e387107485f5e7baa249da8c58127ee..7ac2d15c295188fbbbe28265d4a0273442a7023f 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -36,6 +36,58 @@ def ImmI#SIZE : ImmLeaf; +//===----------------------------------------------------------------------===// +// Load and store +//===----------------------------------------------------------------------===// + +// Load: v128.load +multiclass SIMDLoad { + let mayLoad = 1 in + defm LOAD_#vec_t : + SIMD_I<(outs V128:$dst), (ins P2Align:$align, offset32_op:$off, I32:$addr), + (outs), (ins P2Align:$align, offset32_op:$off), [], + "v128.load\t$dst, ${off}(${addr})$align", + "v128.load\t$off$align", 0>; +} + +foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { +defm "" : SIMDLoad; + +// Def load and store patterns from WebAssemblyInstrMemory.td for vector types +def : LoadPatNoOffset("LOAD_"#vec_t)>; +def : LoadPatImmOff("LOAD_"#vec_t)>; +def : LoadPatImmOff("LOAD_"#vec_t)>; +def : LoadPatGlobalAddr("LOAD_"#vec_t)>; +def : LoadPatExternalSym("LOAD_"#vec_t)>; +def : LoadPatOffsetOnly("LOAD_"#vec_t)>; +def : LoadPatGlobalAddrOffOnly("LOAD_"#vec_t)>; +def : LoadPatExternSymOffOnly("LOAD_"#vec_t)>; +} + +// Store: v128.store +multiclass SIMDStore { + let mayStore = 1 in + defm STORE_#vec_t : + SIMD_I<(outs), (ins P2Align:$align, offset32_op:$off, I32:$addr, V128:$vec), + (outs), (ins P2Align:$align, offset32_op:$off), [], + "v128.store\t${off}(${addr})$align, $vec", + "v128.store\t$off$align", 1>; +} + +foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { +defm "" : SIMDStore; + +// Def load and store patterns from WebAssemblyInstrMemory.td for vector types +def : StorePatNoOffset("STORE_"#vec_t)>; +def : StorePatImmOff("STORE_"#vec_t)>; +def : StorePatImmOff("STORE_"#vec_t)>; +def : StorePatGlobalAddr("STORE_"#vec_t)>; +def : StorePatExternalSym("STORE_"#vec_t)>; +def : StorePatOffsetOnly("STORE_"#vec_t)>; +def : StorePatGlobalAddrOffOnly("STORE_"#vec_t)>; +def : StorePatExternSymOffOnly("STORE_"#vec_t)>; +} + //===----------------------------------------------------------------------===// // Constructing SIMD values //===----------------------------------------------------------------------===// @@ -46,7 +98,7 @@ multiclass ConstVec { defm CONST_V128_#vec_t : SIMD_I<(outs V128:$dst), ops, (outs), ops, [(set V128:$dst, (vec_t pat))], "v128.const\t$dst, "#args, - "v128.const\t"#args, 0>; + "v128.const\t"#args, 2>; } defm "" : ConstVec; +// Shuffle lanes: shuffle +defm SHUFFLE : + SIMD_I<(outs V128:$dst), + (ins V128:$x, V128:$y, + vec_i8imm_op:$m0, vec_i8imm_op:$m1, + vec_i8imm_op:$m2, vec_i8imm_op:$m3, + vec_i8imm_op:$m4, vec_i8imm_op:$m5, + vec_i8imm_op:$m6, vec_i8imm_op:$m7, + vec_i8imm_op:$m8, vec_i8imm_op:$m9, + vec_i8imm_op:$mA, vec_i8imm_op:$mB, + vec_i8imm_op:$mC, vec_i8imm_op:$mD, + vec_i8imm_op:$mE, vec_i8imm_op:$mF), + (outs), + (ins + vec_i8imm_op:$m0, vec_i8imm_op:$m1, + vec_i8imm_op:$m2, vec_i8imm_op:$m3, + vec_i8imm_op:$m4, vec_i8imm_op:$m5, + vec_i8imm_op:$m6, vec_i8imm_op:$m7, + vec_i8imm_op:$m8, vec_i8imm_op:$m9, + vec_i8imm_op:$mA, vec_i8imm_op:$mB, + vec_i8imm_op:$mC, vec_i8imm_op:$mD, + vec_i8imm_op:$mE, vec_i8imm_op:$mF), + [], + "v8x16.shuffle\t$dst, $x, $y, "# + "$m0, $m1, $m2, $m3, $m4, $m5, $m6, $m7, "# + "$m8, $m9, $mA, $mB, $mC, $mD, $mE, $mF", + "v8x16.shuffle\t"# + "$m0, $m1, $m2, $m3, $m4, $m5, $m6, $m7, "# + "$m8, $m9, $mA, $mB, $mC, $mD, $mE, $mF", + 3>; + +// Shuffles after custom lowering +def wasm_shuffle_t : SDTypeProfile<1, 18, []>; +def wasm_shuffle : SDNode<"WebAssemblyISD::SHUFFLE", wasm_shuffle_t>; +foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { +def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y), + (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1), + (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3), + (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5), + (i32 LaneIdx32:$m6), (i32 LaneIdx32:$m7), + (i32 LaneIdx32:$m8), (i32 LaneIdx32:$m9), + (i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB), + (i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD), + (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF))), + (vec_t (SHUFFLE (vec_t V128:$x), (vec_t V128:$y), + (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1), + (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3), + (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5), + (i32 LaneIdx32:$m6), (i32 LaneIdx32:$m7), + (i32 LaneIdx32:$m8), (i32 LaneIdx32:$m9), + (i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB), + (i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD), + (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF)))>; +} + // Create vector with identical lanes: splat def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>; def splat4 : PatFrag<(ops node:$x), (build_vector @@ -116,12 +223,12 @@ multiclass Splat; } -defm "" : Splat; -defm "" : Splat; -defm "" : Splat; -defm "" : Splat; -defm "" : Splat; -defm "" : Splat; +defm "" : Splat; +defm "" : Splat; +defm "" : Splat; +defm "" : Splat; +defm "" : Splat; +defm "" : Splat; //===----------------------------------------------------------------------===// // Accessing lanes @@ -164,16 +271,16 @@ defm extract_i16x8 : ExtractPat; multiclass ExtractLaneExtended baseInst> { defm "" : ExtractLane("extract_i8x16"#sign)>; - defm "" : ExtractLane("extract_i16x8"#sign)>; } -defm "" : ExtractLaneExtended<"_s", 9>; -defm "" : ExtractLaneExtended<"_u", 10>; +defm "" : ExtractLaneExtended<"_s", 5>; +defm "" : ExtractLaneExtended<"_u", 6>; defm "" : ExtractLane; -defm "" : ExtractLane; -defm "" : ExtractLane; -defm "" : ExtractLane; +defm "" : ExtractLane; +defm "" : ExtractLane; +defm "" : ExtractLane; // Follow convention of making implicit expansions unsigned def : Pat<(i32 (vector_extract (v16i8 V128:$vec), (i32 LaneIdx16:$idx))), @@ -216,12 +323,12 @@ multiclass ReplaceLane; } -defm "" : ReplaceLane; -defm "" : ReplaceLane; -defm "" : ReplaceLane; -defm "" : ReplaceLane; -defm "" : ReplaceLane; -defm "" : ReplaceLane; +defm "" : ReplaceLane; +defm "" : ReplaceLane; +defm "" : ReplaceLane; +defm "" : ReplaceLane; +defm "" : ReplaceLane; +defm "" : ReplaceLane; // Lower undef lane indices to zero def : Pat<(vector_insert (v16i8 V128:$vec), I32:$x, undef), @@ -349,63 +456,80 @@ def : Pat<(v2f64 (build_vector (f64 F64:$x0), (f64 F64:$x1))), (v2f64 (REPLACE_LANE_v2f64 (v2f64 (SPLAT_v2f64 (f64 F64:$x0))), 1, F64:$x1))>; -// Shuffle lanes: shuffle -defm SHUFFLE : - SIMD_I<(outs V128:$dst), - (ins V128:$x, V128:$y, - vec_i8imm_op:$m0, vec_i8imm_op:$m1, - vec_i8imm_op:$m2, vec_i8imm_op:$m3, - vec_i8imm_op:$m4, vec_i8imm_op:$m5, - vec_i8imm_op:$m6, vec_i8imm_op:$m7, - vec_i8imm_op:$m8, vec_i8imm_op:$m9, - vec_i8imm_op:$mA, vec_i8imm_op:$mB, - vec_i8imm_op:$mC, vec_i8imm_op:$mD, - vec_i8imm_op:$mE, vec_i8imm_op:$mF), - (outs), - (ins - vec_i8imm_op:$m0, vec_i8imm_op:$m1, - vec_i8imm_op:$m2, vec_i8imm_op:$m3, - vec_i8imm_op:$m4, vec_i8imm_op:$m5, - vec_i8imm_op:$m6, vec_i8imm_op:$m7, - vec_i8imm_op:$m8, vec_i8imm_op:$m9, - vec_i8imm_op:$mA, vec_i8imm_op:$mB, - vec_i8imm_op:$mC, vec_i8imm_op:$mD, - vec_i8imm_op:$mE, vec_i8imm_op:$mF), - [], - "v8x16.shuffle\t$dst, $x, $y, "# - "$m0, $m1, $m2, $m3, $m4, $m5, $m6, $m7, "# - "$m8, $m9, $mA, $mB, $mC, $mD, $mE, $mF", - "v8x16.shuffle\t"# - "$m0, $m1, $m2, $m3, $m4, $m5, $m6, $m7, "# - "$m8, $m9, $mA, $mB, $mC, $mD, $mE, $mF", - 23>; +//===----------------------------------------------------------------------===// +// Comparisons +//===----------------------------------------------------------------------===// -// Shuffles after custom lowering -def wasm_shuffle_t : SDTypeProfile<1, 18, []>; -def wasm_shuffle : SDNode<"WebAssemblyISD::SHUFFLE", wasm_shuffle_t>; -foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { -def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y), - (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1), - (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3), - (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5), - (i32 LaneIdx32:$m6), (i32 LaneIdx32:$m7), - (i32 LaneIdx32:$m8), (i32 LaneIdx32:$m9), - (i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB), - (i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD), - (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF))), - (vec_t (SHUFFLE (vec_t V128:$x), (vec_t V128:$y), - (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1), - (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3), - (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5), - (i32 LaneIdx32:$m6), (i32 LaneIdx32:$m7), - (i32 LaneIdx32:$m8), (i32 LaneIdx32:$m9), - (i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB), - (i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD), - (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF)))>; +multiclass SIMDCondition simdop> { + defm _#vec_t : + SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins), + [(set (out_t V128:$dst), + (setcc (vec_t V128:$lhs), (vec_t V128:$rhs), cond) + )], + vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name, simdop>; +} + +multiclass SIMDConditionInt baseInst> { + defm "" : SIMDCondition; + defm "" : SIMDCondition; + defm "" : SIMDCondition; +} + +multiclass SIMDConditionFP baseInst> { + defm "" : SIMDCondition; + defm "" : SIMDCondition; } +// Equality: eq +let isCommutable = 1 in { +defm EQ : SIMDConditionInt<"eq", SETEQ, 24>; +defm EQ : SIMDConditionFP<"eq", SETOEQ, 64>; +} // isCommutable = 1 + +// Non-equality: ne +let isCommutable = 1 in { +defm NE : SIMDConditionInt<"ne", SETNE, 25>; +defm NE : SIMDConditionFP<"ne", SETUNE, 65>; +} // isCommutable = 1 + +// Less than: lt_s / lt_u / lt +defm LT_S : SIMDConditionInt<"lt_s", SETLT, 26>; +defm LT_U : SIMDConditionInt<"lt_u", SETULT, 27>; +defm LT : SIMDConditionFP<"lt", SETOLT, 66>; + +// Greater than: gt_s / gt_u / gt +defm GT_S : SIMDConditionInt<"gt_s", SETGT, 28>; +defm GT_U : SIMDConditionInt<"gt_u", SETUGT, 29>; +defm GT : SIMDConditionFP<"gt", SETOGT, 67>; + +// Less than or equal: le_s / le_u / le +defm LE_S : SIMDConditionInt<"le_s", SETLE, 30>; +defm LE_U : SIMDConditionInt<"le_u", SETULE, 31>; +defm LE : SIMDConditionFP<"le", SETOLE, 68>; + +// Greater than or equal: ge_s / ge_u / ge +defm GE_S : SIMDConditionInt<"ge_s", SETGE, 32>; +defm GE_U : SIMDConditionInt<"ge_u", SETUGE, 33>; +defm GE : SIMDConditionFP<"ge", SETOGE, 69>; + +// Lower float comparisons that don't care about NaN to standard WebAssembly +// float comparisons. These instructions are generated in the target-independent +// expansion of unordered comparisons and ordered ne. +def : Pat<(v4i32 (seteq (v4f32 V128:$lhs), (v4f32 V128:$rhs))), + (v4i32 (EQ_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>; +def : Pat<(v4i32 (setne (v4f32 V128:$lhs), (v4f32 V128:$rhs))), + (v4i32 (NE_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>; +def : Pat<(v2i64 (seteq (v2f64 V128:$lhs), (v2f64 V128:$rhs))), + (v2i64 (EQ_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>; +def : Pat<(v2i64 (setne (v2f64 V128:$lhs), (v2f64 V128:$rhs))), + (v2i64 (NE_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>; + //===----------------------------------------------------------------------===// -// Integer arithmetic +// Bitwise operations //===----------------------------------------------------------------------===// multiclass SIMDBinary; } -multiclass SIMDBinaryIntNoI64x2 baseInst> { - defm "" : SIMDBinary; - defm "" : SIMDBinary; - defm "" : SIMDBinary; +multiclass SIMDBitwise simdop> { + defm "" : SIMDBinary; + defm "" : SIMDBinary; + defm "" : SIMDBinary; + defm "" : SIMDBinary; } -multiclass SIMDBinaryInt baseInst> { - defm "" : SIMDBinaryIntNoI64x2; - defm "" : SIMDBinary; +multiclass SIMDUnary simdop> { + defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins), + [(set (vec_t V128:$dst), + (vec_t (node (vec_t V128:$vec))) + )], + vec#"."#name#"\t$dst, $vec", vec#"."#name, simdop>; } -// Integer vector negation -def ivneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; - -// Integer addition: add -let isCommutable = 1 in -defm ADD : SIMDBinaryInt; - -// Integer subtraction: sub -defm SUB : SIMDBinaryInt; +// Bitwise logic: v128.not +foreach vec_t = [v16i8, v8i16, v4i32, v2i64] in +defm NOT: SIMDUnary; -// Integer multiplication: mul -defm MUL : SIMDBinaryIntNoI64x2; +// Bitwise logic: v128.and / v128.or / v128.xor +let isCommutable = 1 in { +defm AND : SIMDBitwise; +defm OR : SIMDBitwise; +defm XOR : SIMDBitwise; +} // isCommutable = 1 -// Integer negation: neg -multiclass SIMDNeg simdop> { - defm NEG_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins), - [(set (vec_t V128:$dst), - (vec_t (neg (vec_t V128:$vec))) - )], - vec#".neg\t$dst, $vec", vec#".neg", simdop>; -} +// Bitwise select: v128.bitselect +foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in + defm BITSELECT_#vec_t : + SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins), + [(set (vec_t V128:$dst), + (vec_t (int_wasm_bitselect + (vec_t V128:$c), (vec_t V128:$v1), (vec_t V128:$v2) + )) + )], + "v128.bitselect\t$dst, $v1, $v2, $c", "v128.bitselect", 80>; -defm "" : SIMDNeg; -defm "" : SIMDNeg; -defm "" : SIMDNeg; -defm "" : SIMDNeg; +// Bitselect is equivalent to (c & v1) | (~c & v2) +foreach vec_t = [v16i8, v8i16, v4i32, v2i64] in + def : Pat<(vec_t (or (and (vec_t V128:$c), (vec_t V128:$v1)), + (and (vnot V128:$c), (vec_t V128:$v2)))), + (!cast("BITSELECT_"#vec_t) + V128:$v1, V128:$v2, V128:$c)>; //===----------------------------------------------------------------------===// -// Saturating integer arithmetic +// Integer unary arithmetic //===----------------------------------------------------------------------===// -multiclass SIMDBinarySat baseInst> { - defm "" : SIMDBinary; - defm "" : SIMDBinary; +multiclass SIMDUnaryInt baseInst> { + defm "" : SIMDUnary; + defm "" : SIMDUnary; + defm "" : SIMDUnary; + defm "" : SIMDUnary; } -// Saturating integer addition: add_saturate_s / add_saturate_u -let isCommutable = 1 in { -defm ADD_SAT_S : - SIMDBinarySat; -defm ADD_SAT_U : - SIMDBinarySat; -} // isCommutable = 1 +multiclass SIMDReduceVec simdop> { + defm _#vec_t : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins), + [(set I32:$dst, (i32 (op (vec_t V128:$vec))))], + vec#"."#name#"\t$dst, $vec", vec#"."#name, simdop>; +} -// Saturating integer subtraction: sub_saturate_s / sub_saturate_u -defm SUB_SAT_S : - SIMDBinarySat; -defm SUB_SAT_U : - SIMDBinarySat; +multiclass SIMDReduce baseInst> { + defm "" : SIMDReduceVec; + defm "" : SIMDReduceVec; + defm "" : SIMDReduceVec; + defm "" : SIMDReduceVec; +} + +// Integer vector negation +def ivneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; + +// Integer negation: neg +defm NEG : SIMDUnaryInt; + +// Any lane true: any_true +defm ANYTRUE : SIMDReduce; + +// All lanes true: all_true +defm ALLTRUE : SIMDReduce; //===----------------------------------------------------------------------===// // Bit shifts @@ -493,22 +638,22 @@ multiclass SIMDShift; } -multiclass SIMDShiftInt baseInst, int skip> { +multiclass SIMDShiftInt baseInst> { defm "" : SIMDShift; defm "" : SIMDShift; + !add(baseInst, 17)>; defm "" : SIMDShift; + !add(baseInst, 34)>; defm "" : SIMDShift; + name, !add(baseInst, 51)>; } // Left shift by scalar: shl -defm SHL : SIMDShiftInt; +defm SHL : SIMDShiftInt; // Right shift by scalar: shr_s / shr_u -defm SHR_S : SIMDShiftInt; -defm SHR_U : SIMDShiftInt; +defm SHR_S : SIMDShiftInt; +defm SHR_U : SIMDShiftInt; // Truncate i64 shift operands to i32s foreach shifts = [[shl, SHL_v2i64], [sra, SHR_S_v2i64], [srl, SHR_U_v2i64]] in @@ -529,267 +674,87 @@ def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), I32:$x)), (v2i64 (shifts[1] (v2i64 V128:$vec), I32:$x))>; //===----------------------------------------------------------------------===// -// Bitwise operations -//===----------------------------------------------------------------------===// - -multiclass SIMDBitwise simdop> { - defm "" : SIMDBinary; - defm "" : SIMDBinary; - defm "" : SIMDBinary; - defm "" : SIMDBinary; -} - -// Bitwise logic: v128.and / v128.or / v128.xor -let isCommutable = 1 in { -defm AND : SIMDBitwise; -defm OR : SIMDBitwise; -defm XOR : SIMDBitwise; -} // isCommutable = 1 - -// Bitwise logic: v128.not -multiclass SIMDNot { - defm NOT_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins), - [(set (vec_t V128:$dst), (vec_t (vnot V128:$vec)))], - "v128.not\t$dst, $vec", "v128.not", 63>; -} - -defm "" : SIMDNot; -defm "" : SIMDNot; -defm "" : SIMDNot; -defm "" : SIMDNot; - -// Bitwise select: v128.bitselect -multiclass Bitselect { - defm BITSELECT_#vec_t : - SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins), - [(set (vec_t V128:$dst), - (vec_t (int_wasm_bitselect - (vec_t V128:$c), (vec_t V128:$v1), (vec_t V128:$v2) - )) - )], - "v128.bitselect\t$dst, $v1, $v2, $c", "v128.bitselect", 64>; -} - -foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in -defm "" : Bitselect; - -// Bitselect is equivalent to (c & v1) | (~c & v2) -foreach vec_t = [v16i8, v8i16, v4i32, v2i64] in - def : Pat<(vec_t (or (and (vec_t V128:$c), (vec_t V128:$v1)), - (and (vnot V128:$c), (vec_t V128:$v2)))), - (!cast("BITSELECT_"#vec_t) - V128:$v1, V128:$v2, V128:$c)>; - -//===----------------------------------------------------------------------===// -// Boolean horizontal reductions +// Integer binary arithmetic //===----------------------------------------------------------------------===// -multiclass SIMDReduceVec simdop> { - defm _#vec_t : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins), - [(set I32:$dst, (i32 (op (vec_t V128:$vec))))], - vec#"."#name#"\t$dst, $vec", vec#"."#name, simdop>; -} - -multiclass SIMDReduce baseInst> { - defm "" : SIMDReduceVec; - defm "" : SIMDReduceVec; - defm "" : SIMDReduceVec; - defm "" : SIMDReduceVec; -} - -// Any lane true: any_true -defm ANYTRUE : SIMDReduce<"any_true", int_wasm_anytrue, 65>; - -// All lanes true: all_true -defm ALLTRUE : SIMDReduce<"all_true", int_wasm_alltrue, 69>; - -//===----------------------------------------------------------------------===// -// Comparisons -//===----------------------------------------------------------------------===// - -multiclass SIMDCondition simdop> { - defm _#vec_t : - SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins), - [(set (out_t V128:$dst), - (setcc (vec_t V128:$lhs), (vec_t V128:$rhs), cond) - )], - vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name, simdop>; +multiclass SIMDBinaryIntSmall baseInst> { + defm "" : SIMDBinary; + defm "" : SIMDBinary; } -multiclass SIMDConditionInt baseInst, - int step = 1> { - defm "" : SIMDCondition; - defm "" : SIMDCondition; - defm "" : SIMDCondition; +multiclass SIMDBinaryIntNoI64x2 baseInst> { + defm "" : SIMDBinaryIntSmall; + defm "" : SIMDBinary; } -multiclass SIMDConditionFP baseInst> { - defm "" : SIMDCondition; - defm "" : SIMDCondition; +multiclass SIMDBinaryInt baseInst> { + defm "" : SIMDBinaryIntNoI64x2; + defm "" : SIMDBinary; } -// Equality: eq +// Integer addition: add / add_saturate_s / add_saturate_u let isCommutable = 1 in { -defm EQ : SIMDConditionInt<"eq", SETEQ, 73>; -defm EQ : SIMDConditionFP<"eq", SETOEQ, 77>; +defm ADD : SIMDBinaryInt; +defm ADD_SAT_S : SIMDBinaryIntSmall; +defm ADD_SAT_U : SIMDBinaryIntSmall; } // isCommutable = 1 -// Non-equality: ne -let isCommutable = 1 in { -defm NE : SIMDConditionInt<"ne", SETNE, 79>; -defm NE : SIMDConditionFP<"ne", SETUNE, 83>; -} // isCommutable = 1 - -// Less than: lt_s / lt_u / lt -defm LT_S : SIMDConditionInt<"lt_s", SETLT, 85, 2>; -defm LT_U : SIMDConditionInt<"lt_u", SETULT, 86, 2>; -defm LT : SIMDConditionFP<"lt", SETOLT, 93>; - -// Less than or equal: le_s / le_u / le -defm LE_S : SIMDConditionInt<"le_s", SETLE, 95, 2>; -defm LE_U : SIMDConditionInt<"le_u", SETULE, 96, 2>; -defm LE : SIMDConditionFP<"le", SETOLE, 103>; - -// Greater than: gt_s / gt_u / gt -defm GT_S : SIMDConditionInt<"gt_s", SETGT, 105, 2>; -defm GT_U : SIMDConditionInt<"gt_u", SETUGT, 106, 2>; -defm GT : SIMDConditionFP<"gt", SETOGT, 113>; - -// Greater than or equal: ge_s / ge_u / ge -defm GE_S : SIMDConditionInt<"ge_s", SETGE, 115, 2>; -defm GE_U : SIMDConditionInt<"ge_u", SETUGE, 116, 2>; -defm GE : SIMDConditionFP<"ge", SETOGE, 123>; +// Integer subtraction: sub / sub_saturate_s / sub_saturate_u +defm SUB : SIMDBinaryInt; +defm SUB_SAT_S : + SIMDBinaryIntSmall; +defm SUB_SAT_U : + SIMDBinaryIntSmall; -// Lower float comparisons that don't care about NaN to standard WebAssembly -// float comparisons. These instructions are generated in the target-independent -// expansion of unordered comparisons and ordered ne. -def : Pat<(v4i32 (seteq (v4f32 V128:$lhs), (v4f32 V128:$rhs))), - (v4i32 (EQ_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>; -def : Pat<(v4i32 (setne (v4f32 V128:$lhs), (v4f32 V128:$rhs))), - (v4i32 (NE_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>; -def : Pat<(v2i64 (seteq (v2f64 V128:$lhs), (v2f64 V128:$rhs))), - (v2i64 (EQ_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>; -def : Pat<(v2i64 (setne (v2f64 V128:$lhs), (v2f64 V128:$rhs))), - (v2i64 (NE_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>; +// Integer multiplication: mul +defm MUL : SIMDBinaryIntNoI64x2; //===----------------------------------------------------------------------===// -// Load and store +// Floating-point unary arithmetic //===----------------------------------------------------------------------===// -// Load: v128.load -multiclass SIMDLoad { - let mayLoad = 1 in - defm LOAD_#vec_t : - SIMD_I<(outs V128:$dst), (ins P2Align:$align, offset32_op:$off, I32:$addr), - (outs), (ins P2Align:$align, offset32_op:$off), [], - "v128.load\t$dst, ${off}(${addr})$align", - "v128.load\t$off$align", 1>; +multiclass SIMDUnaryFP baseInst> { + defm "" : SIMDUnary; + defm "" : SIMDUnary; } -foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { -defm "" : SIMDLoad; - -// Def load and store patterns from WebAssemblyInstrMemory.td for vector types -def : LoadPatNoOffset("LOAD_"#vec_t)>; -def : LoadPatImmOff("LOAD_"#vec_t)>; -def : LoadPatImmOff("LOAD_"#vec_t)>; -def : LoadPatGlobalAddr("LOAD_"#vec_t)>; -def : LoadPatExternalSym("LOAD_"#vec_t)>; -def : LoadPatOffsetOnly("LOAD_"#vec_t)>; -def : LoadPatGlobalAddrOffOnly("LOAD_"#vec_t)>; -def : LoadPatExternSymOffOnly("LOAD_"#vec_t)>; -} - -// Store: v128.store -multiclass SIMDStore { - let mayStore = 1 in - defm STORE_#vec_t : - SIMD_I<(outs), (ins P2Align:$align, offset32_op:$off, I32:$addr, V128:$vec), - (outs), (ins P2Align:$align, offset32_op:$off), [], - "v128.store\t${off}(${addr})$align, $vec", - "v128.store\t$off$align", 2>; -} - -foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { -defm "" : SIMDStore; - -// Def load and store patterns from WebAssemblyInstrMemory.td for vector types -def : StorePatNoOffset("STORE_"#vec_t)>; -def : StorePatImmOff("STORE_"#vec_t)>; -def : StorePatImmOff("STORE_"#vec_t)>; -def : StorePatGlobalAddr("STORE_"#vec_t)>; -def : StorePatExternalSym("STORE_"#vec_t)>; -def : StorePatOffsetOnly("STORE_"#vec_t)>; -def : StorePatGlobalAddrOffOnly("STORE_"#vec_t)>; -def : StorePatExternSymOffOnly("STORE_"#vec_t)>; -} - -//===----------------------------------------------------------------------===// -// Floating-point sign bit operations -//===----------------------------------------------------------------------===// +// Absolute value: abs +defm ABS : SIMDUnaryFP; // Negation: neg -defm "" : SIMDNeg; -defm "" : SIMDNeg; +defm NEG : SIMDUnaryFP; -// Absolute value: abs -multiclass SIMDAbs simdop> { - defm ABS_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins), - [(set (vec_t V128:$dst), (vec_t (fabs V128:$vec)))], - vec#".abs\t$dst, $vec", vec#".abs", simdop>; -} - -defm "" : SIMDAbs; -defm "" : SIMDAbs; +// Square root: sqrt +defm SQRT : SIMDUnaryFP; //===----------------------------------------------------------------------===// -// Floating-point min and max +// Floating-point binary arithmetic //===----------------------------------------------------------------------===// multiclass SIMDBinaryFP baseInst> { defm "" : SIMDBinary; - defm "" : SIMDBinary; + defm "" : SIMDBinary; } -// NaN-propagating minimum: min -defm MIN : SIMDBinaryFP; - -// NaN-propagating maximum: max -defm MAX : SIMDBinaryFP; - -//===----------------------------------------------------------------------===// -// Floating-point arithmetic -//===----------------------------------------------------------------------===// - // Addition: add let isCommutable = 1 in -defm ADD : SIMDBinaryFP; +defm ADD : SIMDBinaryFP; // Subtraction: sub -defm SUB : SIMDBinaryFP; - -// Division: div -defm DIV : SIMDBinaryFP; +defm SUB : SIMDBinaryFP; // Multiplication: mul let isCommutable = 1 in -defm MUL : SIMDBinaryFP; +defm MUL : SIMDBinaryFP; -// Square root: sqrt -multiclass SIMDSqrt simdop> { - defm SQRT_#vec_t : - SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins), - [(set (vec_t V128:$dst), (vec_t (fsqrt V128:$vec)))], - vec#".sqrt\t$dst, $vec", vec#".sqrt", simdop>; -} +// Division: div +defm DIV : SIMDBinaryFP; -defm "" : SIMDSqrt; -defm "" : SIMDSqrt; +// NaN-propagating minimum: min +defm MIN : SIMDBinaryFP; + +// NaN-propagating maximum: max +defm MAX : SIMDBinaryFP; //===----------------------------------------------------------------------===// // Conversions @@ -804,16 +769,16 @@ multiclass SIMDConvert; -defm "" : SIMDConvert; -defm "" : SIMDConvert; -defm "" : SIMDConvert; +defm "" : SIMDConvert; +defm "" : SIMDConvert; +defm "" : SIMDConvert; +defm "" : SIMDConvert; // Floating point to integer with saturation: trunc_sat_s / trunc_sat_u -defm "" : SIMDConvert; -defm "" : SIMDConvert; -defm "" : SIMDConvert; -defm "" : SIMDConvert; +defm "" : SIMDConvert; +defm "" : SIMDConvert; +defm "" : SIMDConvert; +defm "" : SIMDConvert; // Lower llvm.wasm.trunc.saturate.* to saturating instructions def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))), diff --git a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp index 98953f0948244eec5b3f2caa040677901fef4f08..871e92082a488286ab0ef55390253830ac16f4d4 100644 --- a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp @@ -31,6 +31,7 @@ class WebAssemblyLateEHPrepare final : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; + bool removeUnnecessaryUnreachables(MachineFunction &MF); bool replaceFuncletReturns(MachineFunction &MF); bool hoistCatches(MachineFunction &MF); bool addCatchAlls(MachineFunction &MF); @@ -59,7 +60,7 @@ FunctionPass *llvm::createWebAssemblyLateEHPrepare() { // possible search paths should be the same. // Returns nullptr in case it does not find any EH pad in the search, or finds // multiple different EH pads. -static MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) { +static MachineBasicBlock *getMatchingEHPad(MachineInstr *MI) { MachineFunction *MF = MI->getParent()->getParent(); SmallVector WL; SmallPtrSet Visited; @@ -83,18 +84,15 @@ static MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) { return EHPad; } -// Erases the given BBs and all their children from the function. If other BBs -// have the BB as a successor, the successor relationships will be deleted as -// well. +// Erase the specified BBs if the BB does not have any remaining predecessors, +// and also all its dead children. template -static void EraseBBsAndChildren(const Container &MBBs) { +static void eraseDeadBBsAndChildren(const Container &MBBs) { SmallVector WL(MBBs.begin(), MBBs.end()); while (!WL.empty()) { MachineBasicBlock *MBB = WL.pop_back_val(); - SmallVector Preds(MBB->pred_begin(), - MBB->pred_end()); - for (auto *Pred : Preds) - Pred->removeSuccessor(MBB); + if (!MBB->pred_empty()) + continue; SmallVector Succs(MBB->succ_begin(), MBB->succ_end()); WL.append(MBB->succ_begin(), MBB->succ_end()); @@ -110,6 +108,7 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) { return false; bool Changed = false; + Changed |= removeUnnecessaryUnreachables(MF); Changed |= addRethrows(MF); if (!MF.getFunction().hasPersonalityFn()) return Changed; @@ -122,6 +121,31 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) { return Changed; } +bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables( + MachineFunction &MF) { + bool Changed = false; + for (auto &MBB : MF) { + for (auto &MI : MBB) { + if (!WebAssembly::isThrow(MI)) + continue; + Changed = true; + + // The instruction after the throw should be an unreachable or a branch to + // another BB that should eventually lead to an unreachable. Delete it + // because throw itself is a terminator, and also delete successors if + // any. + MBB.erase(std::next(MachineBasicBlock::iterator(MI)), MBB.end()); + SmallVector Succs(MBB.succ_begin(), + MBB.succ_end()); + for (auto *Succ : Succs) + MBB.removeSuccessor(Succ); + eraseDeadBBsAndChildren(Succs); + } + } + + return Changed; +} + bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) { bool Changed = false; const auto &TII = *MF.getSubtarget().getInstrInfo(); @@ -183,7 +207,7 @@ bool WebAssemblyLateEHPrepare::hoistCatches(MachineFunction &MF) { Catches.push_back(&MI); for (auto *Catch : Catches) { - MachineBasicBlock *EHPad = GetMatchingEHPad(Catch); + MachineBasicBlock *EHPad = getMatchingEHPad(Catch); assert(EHPad && "No matching EH pad for catch"); if (EHPad->begin() == Catch) continue; @@ -242,7 +266,7 @@ bool WebAssemblyLateEHPrepare::addRethrows(MachineFunction &MF) { Rethrow = BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII.get(WebAssembly::RETHROW_TO_CALLER)); - // Becasue __cxa_rethrow does not return, the instruction after the + // Because __cxa_rethrow does not return, the instruction after the // rethrow should be an unreachable or a branch to another BB that should // eventually lead to an unreachable. Delete it because rethrow itself is // a terminator, and also delete non-EH pad successors if any. @@ -251,7 +275,9 @@ bool WebAssemblyLateEHPrepare::addRethrows(MachineFunction &MF) { for (auto *Succ : MBB.successors()) if (!Succ->isEHPad()) NonPadSuccessors.push_back(Succ); - EraseBBsAndChildren(NonPadSuccessors); + for (auto *Succ : NonPadSuccessors) + MBB.removeSuccessor(Succ); + eraseDeadBBsAndChildren(NonPadSuccessors); } return Changed; } @@ -283,7 +309,7 @@ bool WebAssemblyLateEHPrepare::ensureSingleBBTermPads(MachineFunction &MF) { bool Changed = false; for (auto *Call : ClangCallTerminateCalls) { - MachineBasicBlock *EHPad = GetMatchingEHPad(Call); + MachineBasicBlock *EHPad = getMatchingEHPad(Call); assert(EHPad && "No matching EH pad for catch"); // If it is already the form we want, skip it @@ -308,7 +334,11 @@ bool WebAssemblyLateEHPrepare::ensureSingleBBTermPads(MachineFunction &MF) { BuildMI(*EHPad, InsertPos, Call->getDebugLoc(), TII.get(WebAssembly::UNREACHABLE)); EHPad->erase(InsertPos, EHPad->end()); - EraseBBsAndChildren(EHPad->successors()); + SmallVector Succs(EHPad->succ_begin(), + EHPad->succ_end()); + for (auto *Succ : Succs) + EHPad->removeSuccessor(Succ); + eraseDeadBBsAndChildren(Succs); } return Changed; } diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index f0d24075801f31e3364e4a8aabc47e6a86f95ba5..8755198f45b52481ae73854e18d8d7248e93a380 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -50,24 +50,19 @@ /// /// In detail, this pass does following things: /// -/// 1) Assumes the existence of global variables: __THREW__, __threwValue, and -/// __tempRet0. -/// __tempRet0 will be set within __cxa_find_matching_catch() function in -/// JS library, and __THREW__ and __threwValue will be set in invoke wrappers +/// 1) Assumes the existence of global variables: __THREW__, __threwValue +/// __THREW__ and __threwValue will be set in invoke wrappers /// in JS glue code. For what invoke wrappers are, refer to 3). These /// variables are used for both exceptions and setjmp/longjmps. /// __THREW__ indicates whether an exception or a longjmp occurred or not. 0 /// means nothing occurred, 1 means an exception occurred, and other numbers /// mean a longjmp occurred. In the case of longjmp, __threwValue variable /// indicates the corresponding setjmp buffer the longjmp corresponds to. -/// In exception handling, __tempRet0 indicates the type of an exception -/// caught, and in setjmp/longjmp, it means the second argument to longjmp -/// function. /// /// * Exception handling /// -/// 2) We assume the existence of setThrew and setTempRet0 functions at link -/// time. +/// 2) We assume the existence of setThrew and setTempRet0/getTempRet0 functions +/// at link time. /// The global variables in 1) will exist in wasm address space, /// but their values should be set in JS code, so these functions /// as interfaces to JS glue code. These functions are equivalent to the @@ -80,10 +75,12 @@ /// __threwValue = value; /// } /// } +// +/// setTempRet0 is called from __cxa_find_matching_catch() in JS glue code. /// -/// function setTempRet0(value) { -/// __tempRet0 = value; -/// } +/// In exception handling, getTempRet0 indicates the type of an exception +/// caught, and in setjmp/longjmp, it means the second argument to longjmp +/// function. /// /// 3) Lower /// invoke @func(arg1, arg2) to label %invoke.cont unwind label %lpad @@ -120,11 +117,10 @@ /// ... use %val ... /// into /// %fmc = call @__cxa_find_matching_catch_N(c1, c2, c3, ...) -/// %val = {%fmc, __tempRet0} +/// %val = {%fmc, getTempRet0()} /// ... use %val ... /// Here N is a number calculated based on the number of clauses. -/// Global variable __tempRet0 is set within __cxa_find_matching_catch() in -/// JS glue code. +/// setTempRet0 is called from __cxa_find_matching_catch() in JS glue code. /// /// 5) Lower /// resume {%a, %b} @@ -140,7 +136,17 @@ /// /// * Setjmp / Longjmp handling /// -/// 7) In the function entry that calls setjmp, initialize setjmpTable and +/// In case calls to longjmp() exists +/// +/// 1) Lower +/// longjmp(buf, value) +/// into +/// emscripten_longjmp_jmpbuf(buf, value) +/// emscripten_longjmp_jmpbuf will be lowered to emscripten_longjmp later. +/// +/// In case calls to setjmp() exists +/// +/// 2) In the function entry that calls setjmp, initialize setjmpTable and /// sejmpTableSize as follows: /// setjmpTableSize = 4; /// setjmpTable = (int *) malloc(40); @@ -148,27 +154,22 @@ /// setjmpTable and setjmpTableSize are used in saveSetjmp() function in JS /// code. /// -/// 8) Lower +/// 3) Lower /// setjmp(buf) /// into /// setjmpTable = saveSetjmp(buf, label, setjmpTable, setjmpTableSize); -/// setjmpTableSize = __tempRet0; +/// setjmpTableSize = getTempRet0(); /// For each dynamic setjmp call, setjmpTable stores its ID (a number which /// is incrementally assigned from 0) and its label (a unique number that /// represents each callsite of setjmp). When we need more entries in /// setjmpTable, it is reallocated in saveSetjmp() in JS code and it will /// return the new table address, and assign the new table size in -/// __tempRet0. saveSetjmp also stores the setjmp's ID into the buffer buf. -/// A BB with setjmp is split into two after setjmp call in order to make the -/// post-setjmp BB the possible destination of longjmp BB. +/// setTempRet0(). saveSetjmp also stores the setjmp's ID into the buffer +/// buf. A BB with setjmp is split into two after setjmp call in order to +/// make the post-setjmp BB the possible destination of longjmp BB. /// -/// 9) Lower -/// longjmp(buf, value) -/// into -/// emscripten_longjmp_jmpbuf(buf, value) -/// emscripten_longjmp_jmpbuf will be lowered to emscripten_longjmp later. /// -/// 10) Lower every call that might longjmp into +/// 4) Lower every call that might longjmp into /// __THREW__ = 0; /// call @__invoke_SIG(func, arg1, arg2) /// %__THREW__.val = __THREW__; @@ -178,32 +179,32 @@ /// setjmpTableSize); /// if (%label == 0) /// emscripten_longjmp(%__THREW__.val, __threwValue); -/// __tempRet0 = __threwValue; +/// setTempRet0(__threwValue); /// } else { /// %label = -1; /// } -/// longjmp_result = __tempRet0; +/// longjmp_result = getTempRet0(); /// switch label { /// label 1: goto post-setjmp BB 1 /// label 2: goto post-setjmp BB 2 /// ... /// default: goto splitted next BB /// } -/// testSetjmp examines setjmpTable to see if there is a matching setjmp -/// call. After calling an invoke wrapper, if a longjmp occurred, __THREW__ -/// will be the address of matching jmp_buf buffer and __threwValue be the -/// second argument to longjmp. mem[__THREW__.val] is a setjmp ID that is -/// stored in saveSetjmp. testSetjmp returns a setjmp label, a unique ID to -/// each setjmp callsite. Label 0 means this longjmp buffer does not -/// correspond to one of the setjmp callsites in this function, so in this -/// case we just chain the longjmp to the caller. (Here we call -/// emscripten_longjmp, which is different from emscripten_longjmp_jmpbuf. -/// emscripten_longjmp_jmpbuf takes jmp_buf as its first argument, while -/// emscripten_longjmp takes an int. Both of them will eventually be lowered -/// to emscripten_longjmp in s2wasm, but here we need two signatures - we -/// can't translate an int value to a jmp_buf.) -/// Label -1 means no longjmp occurred. Otherwise we jump to the right -/// post-setjmp BB based on the label. +/// testSetjmp examines setjmpTable to see if there is a matching setjmp +/// call. After calling an invoke wrapper, if a longjmp occurred, __THREW__ +/// will be the address of matching jmp_buf buffer and __threwValue be the +/// second argument to longjmp. mem[__THREW__.val] is a setjmp ID that is +/// stored in saveSetjmp. testSetjmp returns a setjmp label, a unique ID to +/// each setjmp callsite. Label 0 means this longjmp buffer does not +/// correspond to one of the setjmp callsites in this function, so in this +/// case we just chain the longjmp to the caller. (Here we call +/// emscripten_longjmp, which is different from emscripten_longjmp_jmpbuf. +/// emscripten_longjmp_jmpbuf takes jmp_buf as its first argument, while +/// emscripten_longjmp takes an int. Both of them will eventually be lowered +/// to emscripten_longjmp in s2wasm, but here we need two signatures - we +/// can't translate an int value to a jmp_buf.) +/// Label -1 means no longjmp occurred. Otherwise we jump to the right +/// post-setjmp BB based on the label. /// ///===----------------------------------------------------------------------===// @@ -241,7 +242,8 @@ class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass { GlobalVariable *ThrewGV; GlobalVariable *ThrewValueGV; - GlobalVariable *TempRet0GV; + Function *GetTempRet0Func; + Function *SetTempRet0Func; Function *ResumeF; Function *EHTypeIDF; Function *EmLongjmpF; @@ -281,9 +283,10 @@ public: WebAssemblyLowerEmscriptenEHSjLj(bool EnableEH = true, bool EnableSjLj = true) : ModulePass(ID), EnableEH(EnableEH), EnableSjLj(EnableSjLj), - ThrewGV(nullptr), ThrewValueGV(nullptr), TempRet0GV(nullptr), - ResumeF(nullptr), EHTypeIDF(nullptr), EmLongjmpF(nullptr), - EmLongjmpJmpbufF(nullptr), SaveSetjmpF(nullptr), TestSetjmpF(nullptr) { + ThrewGV(nullptr), ThrewValueGV(nullptr), GetTempRet0Func(nullptr), + SetTempRet0Func(nullptr), ResumeF(nullptr), EHTypeIDF(nullptr), + EmLongjmpF(nullptr), EmLongjmpJmpbufF(nullptr), SaveSetjmpF(nullptr), + TestSetjmpF(nullptr) { EHWhitelistSet.insert(EHWhitelist.begin(), EHWhitelist.end()); } bool runOnModule(Module &M) override; @@ -509,7 +512,8 @@ bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M, Function *ThrowF = M.getFunction("__cxa_throw"); Function *TerminateF = M.getFunction("__clang_call_terminate"); if (Callee == BeginCatchF || Callee == EndCatchF || - Callee == AllocExceptionF || Callee == ThrowF || Callee == TerminateF) + Callee == AllocExceptionF || Callee == ThrowF || Callee == TerminateF || + Callee == GetTempRet0Func || Callee == SetTempRet0Func) return false; // Otherwise we don't know @@ -522,11 +526,11 @@ bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M, // %label = _testSetjmp(mem[%__THREW__.val], setjmpTable, setjmpTableSize); // if (%label == 0) // emscripten_longjmp(%__THREW__.val, threwValue); -// __tempRet0 = threwValue; +// setTempRet0(threwValue); // } else { // %label = -1; // } -// %longjmp_result = __tempRet0; +// %longjmp_result = getTempRet0(); // // As output parameters. returns %label, %longjmp_result, and the BB the last // instruction (%longjmp_result = ...) is in. @@ -570,15 +574,15 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp( IRB.CreateCall(EmLongjmpF, {Threw, ThrewValue}); IRB.CreateUnreachable(); - // __tempRet0 = threwValue; + // setTempRet0(threwValue); IRB.SetInsertPoint(EndBB2); - IRB.CreateStore(ThrewValue, TempRet0GV); + IRB.CreateCall(SetTempRet0Func, ThrewValue); IRB.CreateBr(EndBB1); IRB.SetInsertPoint(ElseBB1); IRB.CreateBr(EndBB1); - // longjmp_result = __tempRet0; + // longjmp_result = getTempRet0(); IRB.SetInsertPoint(EndBB1); PHINode *LabelPHI = IRB.CreatePHI(IRB.getInt32Ty(), 2, "label"); LabelPHI->addIncoming(ThenLabel, EndBB2); @@ -588,7 +592,7 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp( // Output parameter assignment Label = LabelPHI; EndBB = EndBB1; - LongjmpResult = IRB.CreateLoad(TempRet0GV, "longjmp_result"); + LongjmpResult = IRB.CreateCall(GetTempRet0Func, None, "longjmp_result"); } void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) { @@ -628,12 +632,19 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { bool LongjmpUsed = LongjmpF && !LongjmpF->use_empty(); bool DoSjLj = EnableSjLj && (SetjmpUsed || LongjmpUsed); - // Declare (or get) global variables __THREW__, __threwValue, and __tempRet0, - // which are used in common for both exception handling and setjmp/longjmp - // handling + // Declare (or get) global variables __THREW__, __threwValue, and + // getTempRet0/setTempRet0 function which are used in common for both + // exception handling and setjmp/longjmp handling ThrewGV = getGlobalVariableI32(M, IRB, "__THREW__"); ThrewValueGV = getGlobalVariableI32(M, IRB, "__threwValue"); - TempRet0GV = getGlobalVariableI32(M, IRB, "__tempRet0"); + GetTempRet0Func = + Function::Create(FunctionType::get(IRB.getInt32Ty(), false), + GlobalValue::ExternalLinkage, "getTempRet0", &M); + SetTempRet0Func = Function::Create( + FunctionType::get(IRB.getVoidTy(), IRB.getInt32Ty(), false), + GlobalValue::ExternalLinkage, "setTempRet0", &M); + GetTempRet0Func->setDoesNotThrow(); + SetTempRet0Func->setDoesNotThrow(); bool Changed = false; @@ -662,22 +673,6 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { if (DoSjLj) { Changed = true; // We have setjmp or longjmp somewhere - // Register saveSetjmp function - FunctionType *SetjmpFTy = SetjmpF->getFunctionType(); - SmallVector Params = {SetjmpFTy->getParamType(0), - IRB.getInt32Ty(), Type::getInt32PtrTy(C), - IRB.getInt32Ty()}; - FunctionType *FTy = - FunctionType::get(Type::getInt32PtrTy(C), Params, false); - SaveSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage, - SaveSetjmpFName, &M); - - // Register testSetjmp function - Params = {IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()}; - FTy = FunctionType::get(IRB.getInt32Ty(), Params, false); - TestSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage, - TestSetjmpFName, &M); - if (LongjmpF) { // Replace all uses of longjmp with emscripten_longjmp_jmpbuf, which is // defined in JS code @@ -687,20 +682,39 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { LongjmpF->replaceAllUsesWith(EmLongjmpJmpbufF); } - FTy = FunctionType::get(IRB.getVoidTy(), - {IRB.getInt32Ty(), IRB.getInt32Ty()}, false); - EmLongjmpF = - Function::Create(FTy, GlobalValue::ExternalLinkage, EmLongjmpFName, &M); - - // Only traverse functions that uses setjmp in order not to insert - // unnecessary prep / cleanup code in every function - SmallPtrSet SetjmpUsers; - for (User *U : SetjmpF->users()) { - auto *UI = cast(U); - SetjmpUsers.insert(UI->getFunction()); + + if (SetjmpF) { + // Register saveSetjmp function + FunctionType *SetjmpFTy = SetjmpF->getFunctionType(); + SmallVector Params = {SetjmpFTy->getParamType(0), + IRB.getInt32Ty(), Type::getInt32PtrTy(C), + IRB.getInt32Ty()}; + FunctionType *FTy = + FunctionType::get(Type::getInt32PtrTy(C), Params, false); + SaveSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage, + SaveSetjmpFName, &M); + + // Register testSetjmp function + Params = {IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()}; + FTy = FunctionType::get(IRB.getInt32Ty(), Params, false); + TestSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage, + TestSetjmpFName, &M); + + FTy = FunctionType::get(IRB.getVoidTy(), + {IRB.getInt32Ty(), IRB.getInt32Ty()}, false); + EmLongjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage, + EmLongjmpFName, &M); + + // Only traverse functions that uses setjmp in order not to insert + // unnecessary prep / cleanup code in every function + SmallPtrSet SetjmpUsers; + for (User *U : SetjmpF->users()) { + auto *UI = cast(U); + SetjmpUsers.insert(UI->getFunction()); + } + for (Function *F : SetjmpUsers) + runSjLjOnFunction(*F); } - for (Function *F : SetjmpUsers) - runSjLjOnFunction(*F); } if (!Changed) { @@ -840,8 +854,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) { CallInst *FMCI = IRB.CreateCall(FMCF, FMCArgs, "fmc"); Value *Undef = UndefValue::get(LPI->getType()); Value *Pair0 = IRB.CreateInsertValue(Undef, FMCI, 0, "pair0"); - Value *TempRet0 = - IRB.CreateLoad(TempRet0GV, TempRet0GV->getName() + ".val"); + Value *TempRet0 = IRB.CreateCall(GetTempRet0Func, None, "tempret0"); Value *Pair1 = IRB.CreateInsertValue(Pair0, TempRet0, 1, "pair1"); LPI->replaceAllUsesWith(Pair1); @@ -922,7 +935,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { Instruction *NewSetjmpTable = IRB.CreateCall(SaveSetjmpF, Args, "setjmpTable"); Instruction *NewSetjmpTableSize = - IRB.CreateLoad(TempRet0GV, "setjmpTableSize"); + IRB.CreateCall(GetTempRet0Func, None, "setjmpTableSize"); SetjmpTableInsts.push_back(NewSetjmpTable); SetjmpTableSizeInsts.push_back(NewSetjmpTableSize); ToErase.push_back(CI); @@ -1044,7 +1057,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { // ... // somebb: // setjmpTable = saveSetjmp(buf, label, setjmpTable, setjmpTableSize); - // setjmpTableSize = __tempRet0; + // setjmpTableSize = getTempRet0(); // So we need to make sure the SSA for these variables is valid so that every // saveSetjmp and testSetjmp calls have the correct arguments. SSAUpdater SetjmpTableSSA; diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index 1dad7b8a2890f9644c2369cb9491d84f81683435..fa862fbaa634c2b3772f9958af28049ed00d2983 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -75,10 +75,10 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( cast(Printer.GetExternalSymbolSymbol(Name)); const WebAssemblySubtarget &Subtarget = Printer.getSubtarget(); - // __stack_pointer is a global variable; all other external symbols used by - // CodeGen are functions. It's OK to hardcode knowledge of specific symbols - // here; this method is precisely there for fetching the signatures of known - // Clang-provided symbols. + // Except for the two exceptions (__stack_pointer and __cpp_exception), all + // other external symbols used by CodeGen are functions. It's OK to hardcode + // knowledge of specific symbols here; this method is precisely there for + // fetching the signatures of known Clang-provided symbols. if (strcmp(Name, "__stack_pointer") == 0) { WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); WasmSym->setGlobalType(wasm::WasmGlobalType{ @@ -90,24 +90,45 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( SmallVector Returns; SmallVector Params; - GetLibcallSignature(Subtarget, Name, Returns, Params); + if (strcmp(Name, "__cpp_exception") == 0) { + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_EVENT); + // We can't confirm its signature index for now because there can be + // imported exceptions. Set it to be 0 for now. + WasmSym->setEventType( + {wasm::WASM_EVENT_ATTRIBUTE_EXCEPTION, /* SigIndex */ 0}); + // We may have multiple C++ compilation units to be linked together, each of + // which defines the exception symbol. To resolve them, we declare them as + // weak. + WasmSym->setWeak(true); + WasmSym->setExternal(true); + + // All C++ exceptions are assumed to have a single i32 (for wasm32) or i64 + // (for wasm64) param type and void return type. The reaon is, all C++ + // exception values are pointers, and to share the type section with + // functions, exceptions are assumed to have void return type. + Params.push_back(Subtarget.hasAddr64() ? wasm::ValType::I64 + : wasm::ValType::I32); + } else { // Function symbols + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + GetLibcallSignature(Subtarget, Name, Returns, Params); + } auto Signature = make_unique(std::move(Returns), std::move(Params)); WasmSym->setSignature(Signature.get()); Printer.addSignature(std::move(Signature)); - WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); return WasmSym; } MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym, int64_t Offset, - bool IsFunc, - bool IsGlob) const { + bool IsFunc, bool IsGlob, + bool IsEvent) const { MCSymbolRefExpr::VariantKind VK = IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION : IsGlob ? MCSymbolRefExpr::VK_WebAssembly_GLOBAL - : MCSymbolRefExpr::VK_None; + : IsEvent ? MCSymbolRefExpr::VK_WebAssembly_EVENT + : MCSymbolRefExpr::VK_None; const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx); @@ -116,6 +137,8 @@ MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym, report_fatal_error("Function addresses with offsets not supported"); if (IsGlob) report_fatal_error("Global indexes with offsets not supported"); + if (IsEvent) + report_fatal_error("Event indexes with offsets not supported"); Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx); } @@ -218,7 +241,7 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, "WebAssembly does not use target flags on GlobalAddresses"); MCOp = LowerSymbolOperand(GetGlobalAddressSymbol(MO), MO.getOffset(), MO.getGlobal()->getValueType()->isFunctionTy(), - false); + false, false); break; case MachineOperand::MO_ExternalSymbol: // The target flag indicates whether this is a symbol for a @@ -228,14 +251,16 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, MCOp = LowerSymbolOperand( GetExternalSymbolSymbol(MO), /*Offset=*/0, (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_FUNCTION) != 0, - (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_GLOBAL) != 0); + (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_GLOBAL) != 0, + (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_EVENT) != 0); break; case MachineOperand::MO_MCSymbol: // This is currently used only for LSDA symbols (GCC_except_table), // because global addresses or other external symbols are handled above. assert(MO.getTargetFlags() == 0 && "WebAssembly does not use target flags on MCSymbol"); - MCOp = LowerSymbolOperand(MO.getMCSymbol(), /*Offset=*/0, false, false); + MCOp = LowerSymbolOperand(MO.getMCSymbol(), /*Offset=*/0, false, false, + false); break; } diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h index 54a55796415488953771314daa96acf8731fef18..fa7a0ea61b3b386a95a455a2174227823899f2ed 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h @@ -34,7 +34,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower { MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; MCOperand LowerSymbolOperand(MCSymbol *Sym, int64_t Offset, bool IsFunc, - bool IsGlob) const; + bool IsGlob, bool IsEvent) const; public: WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer) diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index 073706e567e74caae137c8bec3dacc757646f305..0157af0f85109dbaa85d30d8d041f1876f1fe881 100644 --- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -64,13 +64,17 @@ void llvm::ComputeSignatureVTs(const FunctionType *Ty, const Function &F, Params.push_back(PtrVT); } +void llvm::ValTypesFromMVTs(const ArrayRef &In, + SmallVectorImpl &Out) { + for (MVT Ty : In) + Out.push_back(WebAssembly::toValType(Ty)); +} + std::unique_ptr llvm::SignatureFromMVTs(const SmallVectorImpl &Results, const SmallVectorImpl &Params) { auto Sig = make_unique(); - for (MVT Ty : Results) - Sig->Returns.push_back(WebAssembly::toValType(Ty)); - for (MVT Ty : Params) - Sig->Params.push_back(WebAssembly::toValType(Ty)); + ValTypesFromMVTs(Results, Sig->Returns); + ValTypesFromMVTs(Params, Sig->Params); return Sig; } diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index cde44d24599a209119fc720068c0c412bb941f57..4be4beb85d048b560d9809b7ca021c82faa9eecd 100644 --- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -129,6 +129,9 @@ void ComputeSignatureVTs(const FunctionType *Ty, const Function &F, const TargetMachine &TM, SmallVectorImpl &Params, SmallVectorImpl &Results); +void ValTypesFromMVTs(const ArrayRef &In, + SmallVectorImpl &Out); + std::unique_ptr SignatureFromMVTs(const SmallVectorImpl &Results, const SmallVectorImpl &Params); diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 954c8a6b50b8e585443c32fd7c1d46af8491feff..a6fb78280dad82cd22c99d5dd1a5e3df7906a3d3 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -82,8 +82,12 @@ extern "C" void LLVMInitializeWebAssemblyTarget() { //===----------------------------------------------------------------------===// static Reloc::Model getEffectiveRelocModel(Optional RM) { - if (!RM.hasValue()) - return Reloc::PIC_; + if (!RM.hasValue()) { + // Default to static relocation model. This should always be more optimial + // than PIC since the static linker can determine all global addresses and + // assume direct function calls. + return Reloc::Static; + } return *RM; } @@ -97,7 +101,7 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine( TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128" : "e-m:e-p:32:32-i64:64-n32:64-S128", TT, CPU, FS, Options, getEffectiveRelocModel(RM), - CM ? *CM : CodeModel::Large, OL), + getEffectiveCodeModel(CM, CodeModel::Large), OL), TLOF(new WebAssemblyTargetObjectFile()) { // WebAssembly type-checks instructions, but a noreturn function with a return // type that doesn't match the context will cause a check failure. So we lower diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 5ded1f971a035c4a997db32d145f897e9d1e8e80..524b4ae53be12f8df06ba4718ee3929ca108c560 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -30,6 +30,7 @@ set(sources X86CmovConversion.cpp X86CondBrFolding.cpp X86DomainReassignment.cpp + X86DiscriminateMemOps.cpp X86ExpandPseudo.cpp X86FastISel.cpp X86FixupBWInsts.cpp @@ -44,6 +45,7 @@ set(sources X86ISelLowering.cpp X86IndirectBranchTracking.cpp X86InterleavedAccess.cpp + X86InsertPrefetch.cpp X86InstrFMA3Info.cpp X86InstrFoldTables.cpp X86InstrInfo.cpp diff --git a/lib/Target/X86/LLVMBuild.txt b/lib/Target/X86/LLVMBuild.txt index 2062163afb054fbba74b2ce79b49201cd0847e83..055336baac195e30b1e50813f507e7e7eefb1234 100644 --- a/lib/Target/X86/LLVMBuild.txt +++ b/lib/Target/X86/LLVMBuild.txt @@ -31,5 +31,5 @@ has_jit = 1 type = Library name = X86CodeGen parent = X86 -required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target X86AsmPrinter X86Desc X86Info X86Utils GlobalISel +required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target X86AsmPrinter X86Desc X86Info X86Utils GlobalISel ProfileData add_to_library_groups = X86 diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 362ca96ac0939472f15e22c957f8df81f91f7256..ea4aaf14223d1ff8545218a79e9966d8bca104df 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -39,7 +39,7 @@ using namespace llvm; #include "X86GenRegisterInfo.inc" #define GET_INSTRINFO_MC_DESC -#define GET_GENINSTRINFO_MC_HELPERS +#define GET_INSTRINFO_MC_HELPERS #include "X86GenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 595c26d31e3f7920cb8656799ad8341e145fffa8..4e9f5ba60d2e4ebc16837d4a90467c7c4451222c 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -134,7 +134,7 @@ unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned, // Defines symbolic names for the X86 instructions. // #define GET_INSTRINFO_ENUM -#define GET_GENINSTRINFO_MC_DECL +#define GET_INSTRINFO_MC_HELPER_DECLS #include "X86GenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/lib/Target/X86/Utils/LLVMBuild.txt b/lib/Target/X86/Utils/LLVMBuild.txt index de0a30fa19c8082c7d3bb5df0f2035ebad8d6da0..fdb886f53a082030076fb6ed61b96ba57c43718c 100644 --- a/lib/Target/X86/Utils/LLVMBuild.txt +++ b/lib/Target/X86/Utils/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = X86Utils parent = X86 -required_libraries = Core Support +required_libraries = Support add_to_library_groups = X86 diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 19f8e35ade04bf70cc1e3982b9164ec307791872..1c8813815b863a339732d69feb4e163a5de9ed05 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -122,6 +122,13 @@ FunctionPass *createX86EvexToVexInsts(); /// This pass creates the thunks for the retpoline feature. FunctionPass *createX86RetpolineThunksPass(); +/// This pass ensures instructions featuring a memory operand +/// have distinctive (with respect to eachother) +FunctionPass *createX86DiscriminateMemOpsPass(); + +/// This pass applies profiling information to insert cache prefetches. +FunctionPass *createX86InsertPrefetchPass(); + InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &, X86RegisterBankInfo &); @@ -136,6 +143,7 @@ void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); +void initializeX86CondBrFoldingPassPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 74135656528d5a73a7130f972335b70a6f5002a3..6b1749fc75005965e9e6908cc869ce965070031b 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -883,6 +883,17 @@ class SkylakeServerProc : ProcModel; def : SkylakeServerProc<"skx">; // Legacy alias. +def CLXFeatures : ProcessorFeatures; + +class CascadelakeProc : ProcModel; +def : CascadelakeProc<"cascadelake">; + def CNLFeatures : ProcessorFeatures(OutStreamer->getTargetStreamer()); - unsigned ParamsSize = - MF->getInfo()->getArgumentStackSize(); - XTS->emitFPOProc(CurrentFnSym, ParamsSize); + if (auto *XTS = + static_cast(OutStreamer->getTargetStreamer())) + XTS->emitFPOProc( + CurrentFnSym, + MF->getInfo()->getArgumentStackSize()); } } void X86AsmPrinter::EmitFunctionBodyEnd() { if (EmitFPOData) { - X86TargetStreamer *XTS = - static_cast(OutStreamer->getTargetStreamer()); - XTS->emitFPOEndProc(); + if (auto *XTS = + static_cast(OutStreamer->getTargetStreamer())) + XTS->emitFPOEndProc(); } } @@ -674,7 +674,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { emitNonLazyStubs(MMI, *OutStreamer); // Emit stack and fault map information. - SM.serializeToStackMapSection(); + emitStackMaps(SM); FM.serializeToFaultMapSection(); // This flag tells the linker that no global symbols contain code that fall @@ -695,12 +695,12 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } if (TT.isOSBinFormatCOFF()) { - SM.serializeToStackMapSection(); + emitStackMaps(SM); return; } if (TT.isOSBinFormatELF()) { - SM.serializeToStackMapSection(); + emitStackMaps(SM); FM.serializeToFaultMapSection(); return; } diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index eb9c4b3e5977e4a0b45e4c9b45659bc6bb9d6370..2850baf7a65e7a898190f1d5d7943e4085fd43cc 100644 --- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -586,7 +586,7 @@ void X86AvoidSFBPass::breakBlockedCopies( StDisp2 += OverlapDelta; Size2 -= OverlapDelta; } - Size1 = std::abs(std::abs(LdDisp2) - std::abs(LdDisp1)); + Size1 = LdDisp2 - LdDisp1; // Build a copy for the point until the current blocking store's // displacement. diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp index 1d221930c2a46b3a4bd5eac60794181b0c95a2e9..7ce443c4656af40827ff32455a51b0eb558705cc 100644 --- a/lib/Target/X86/X86CondBrFolding.cpp +++ b/lib/Target/X86/X86CondBrFolding.cpp @@ -62,8 +62,9 @@ STATISTIC(NumFixedCondBrs, "Number of x86 condbr folded"); namespace { class X86CondBrFoldingPass : public MachineFunctionPass { public: - X86CondBrFoldingPass() : MachineFunctionPass(ID) {} - + X86CondBrFoldingPass() : MachineFunctionPass(ID) { + initializeX86CondBrFoldingPassPass(*PassRegistry::getPassRegistry()); + } StringRef getPassName() const override { return "X86 CondBr Folding"; } bool runOnMachineFunction(MachineFunction &MF) override; @@ -73,12 +74,13 @@ public: AU.addRequired(); } -private: +public: static char ID; }; +} // namespace char X86CondBrFoldingPass::ID = 0; -} // namespace +INITIALIZE_PASS(X86CondBrFoldingPass, "X86CondBrFolding", "X86CondBrFolding", false, false) FunctionPass *llvm::createX86CondBrFolding() { return new X86CondBrFoldingPass(); @@ -466,7 +468,8 @@ bool X86CondBrFolding::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, break; } SrcReg = MI.getOperand(SrcRegIndex).getReg(); - assert(MI.getOperand(ValueIndex).isImm() && "Expecting Imm operand"); + if (!MI.getOperand(ValueIndex).isImm()) + return false; CmpValue = MI.getOperand(ValueIndex).getImm(); return true; } diff --git a/lib/Target/X86/X86DiscriminateMemOps.cpp b/lib/Target/X86/X86DiscriminateMemOps.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5653babedf8eccd67fcb41be29fd61c377c6000a --- /dev/null +++ b/lib/Target/X86/X86DiscriminateMemOps.cpp @@ -0,0 +1,145 @@ +//===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// This pass aids profile-driven cache prefetch insertion by ensuring all +/// instructions that have a memory operand are distinguishible from each other. +/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/ProfileData/SampleProfReader.h" +#include "llvm/Transforms/IPO/SampleProfile.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-discriminate-memops" + +namespace { + +using Location = std::pair; + +Location diToLocation(const DILocation *Loc) { + return std::make_pair(Loc->getFilename(), Loc->getLine()); +} + +/// Ensure each instruction having a memory operand has a distinct pair. +void updateDebugInfo(MachineInstr *MI, const DILocation *Loc) { + DebugLoc DL(Loc); + MI->setDebugLoc(DL); +} + +class X86DiscriminateMemOps : public MachineFunctionPass { + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { + return "X86 Discriminate Memory Operands"; + } + +public: + static char ID; + + /// Default construct and initialize the pass. + X86DiscriminateMemOps(); +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// Implementation +//===----------------------------------------------------------------------===// + +char X86DiscriminateMemOps::ID = 0; + +/// Default construct and initialize the pass. +X86DiscriminateMemOps::X86DiscriminateMemOps() : MachineFunctionPass(ID) {} + +bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { + DISubprogram *FDI = MF.getFunction().getSubprogram(); + if (!FDI || !FDI->getUnit()->getDebugInfoForProfiling()) + return false; + + // Have a default DILocation, if we find instructions with memops that don't + // have any debug info. + const DILocation *ReferenceDI = + DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI); + + DenseMap MemOpDiscriminators; + MemOpDiscriminators[diToLocation(ReferenceDI)] = 0; + + // Figure out the largest discriminator issued for each Location. When we + // issue new discriminators, we can thus avoid issuing discriminators + // belonging to instructions that don't have memops. This isn't a requirement + // for the goals of this pass, however, it avoids unnecessary ambiguity. + for (auto &MBB : MF) { + for (auto &MI : MBB) { + const auto &DI = MI.getDebugLoc(); + if (!DI) + continue; + Location Loc = diToLocation(DI); + MemOpDiscriminators[Loc] = + std::max(MemOpDiscriminators[Loc], DI->getBaseDiscriminator()); + } + } + + // Keep track of the discriminators seen at each Location. If an instruction's + // DebugInfo has a Location and discriminator we've already seen, replace its + // discriminator with a new one, to guarantee uniqueness. + DenseMap> Seen; + + bool Changed = false; + for (auto &MBB : MF) { + for (auto &MI : MBB) { + if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0) + continue; + const DILocation *DI = MI.getDebugLoc(); + if (!DI) { + DI = ReferenceDI; + } + DenseSet &Set = Seen[diToLocation(DI)]; + const std::pair::iterator, bool> TryInsert = + Set.insert(DI->getBaseDiscriminator()); + if (!TryInsert.second) { + DI = DI->setBaseDiscriminator(++MemOpDiscriminators[diToLocation(DI)]); + updateDebugInfo(&MI, DI); + Changed = true; + const std::pair::iterator, bool> MustInsert = + Set.insert(DI->getBaseDiscriminator()); + // FIXME (mtrofin): check if the to-be inserted base discriminator can + // be added. This requires a new API on DILocation. + // The assumption is that this scenario is infrequent/OK not to support. + // If evidence points otherwise, we can explore synthesize unique DIs by + // adding fake line numbers. + if (!MustInsert.second) { + LLVM_DEBUG(dbgs() + << "Unable to create a unique discriminator in " + << DI->getFilename() << " Line: " << DI->getLine() + << " Column: " << DI->getColumn() + << ". This is likely due to a large macro expansion.\n"); + } + } + + // Bump the reference DI to avoid cramming discriminators on line 0. + // FIXME(mtrofin): pin ReferenceDI on blocks or first instruction with DI + // in a block. It's more consistent than just relying on the last memop + // instruction we happened to see. + ReferenceDI = DI; + } + } + return Changed; +} + +FunctionPass *llvm::createX86DiscriminateMemOpsPass() { + return new X86DiscriminateMemOps(); +} diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index a49ad8bd59dfc840c7a56d8c0fed2132db8bd72d..cbfdc4b3b93b3cb7b478ac58896ec484f4388b20 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -3998,7 +3998,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, } Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); - MI->eraseFromParent(); + MachineBasicBlock::iterator I(MI); + removeDeadCode(I, std::next(I)); return true; } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index e40b0f81e3306b8a690272c7429f320753192385..d722c75c4f0a516a924bca273c89483190c02caf 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -2270,9 +2270,15 @@ void X86FrameLowering::adjustForSegmentedStacks( // Do not generate a prologue for leaf functions with a stack of size zero. // For non-leaf functions we have to allow for the possibility that the - // call is to a non-split function, as in PR37807. - if (StackSize == 0 && !MFI.hasTailCall()) + // callis to a non-split function, as in PR37807. This function could also + // take the address of a non-split function. When the linker tries to adjust + // its non-existent prologue, it would fail with an error. Mark the object + // file so that such failures are not errors. See this Go language bug-report + // https://go-review.googlesource.com/c/go/+/148819/ + if (StackSize == 0 && !MFI.hasTailCall()) { + MF.getMMI().setHasNosplitStack(true); return; + } MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 16819f4451c2d2b23d311042281c8591b51e14ee..43d0538c4cda660d9646c6335e802ac22890029b 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2192,17 +2192,17 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { } /// Test whether the given X86ISD::CMP node has any uses which require the SF -/// or OF bits to be accurate. -static bool hasNoSignedComparisonUses(SDNode *N) { +/// flag to be accurate. +static bool hasNoSignFlagUses(SDValue Flags) { // Examine each user of the node. - for (SDNode::use_iterator UI = N->use_begin(), - UE = N->use_end(); UI != UE; ++UI) { - // Only examine CopyToReg uses. - if (UI->getOpcode() != ISD::CopyToReg) - return false; + for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); + UI != UE; ++UI) { + // Only check things that use the flags. + if (UI.getUse().getResNo() != Flags.getResNo()) + continue; // Only examine CopyToReg uses that copy to EFLAGS. - if (cast(UI->getOperand(1))->getReg() != - X86::EFLAGS) + if (UI->getOpcode() != ISD::CopyToReg || + cast(UI->getOperand(1))->getReg() != X86::EFLAGS) return false; // Examine each user of the CopyToReg use. for (SDNode::use_iterator FlagUI = UI->use_begin(), @@ -2215,11 +2215,14 @@ static bool hasNoSignedComparisonUses(SDNode *N) { switch (FlagUI->getMachineOpcode()) { // These comparisons don't treat the most significant bit specially. case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr: - case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr: + case X86::SETEr: case X86::SETNEr: case X86::SETOr: case X86::SETNOr: + case X86::SETPr: case X86::SETNPr: case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm: - case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm: + case X86::SETEm: case X86::SETNEm: case X86::SETOm: case X86::SETNOm: + case X86::SETPm: case X86::SETNPm: case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1: - case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1: + case X86::JE_1: case X86::JNE_1: case X86::JO_1: case X86::JNO_1: + case X86::JP_1: case X86::JNP_1: case X86::CMOVA16rr: case X86::CMOVA16rm: case X86::CMOVA32rr: case X86::CMOVA32rm: case X86::CMOVA64rr: case X86::CMOVA64rm: @@ -2241,6 +2244,9 @@ static bool hasNoSignedComparisonUses(SDNode *N) { case X86::CMOVNP16rr: case X86::CMOVNP16rm: case X86::CMOVNP32rr: case X86::CMOVNP32rm: case X86::CMOVNP64rr: case X86::CMOVNP64rm: + case X86::CMOVO16rr: case X86::CMOVO16rm: + case X86::CMOVO32rr: case X86::CMOVO32rm: + case X86::CMOVO64rr: case X86::CMOVO64rm: case X86::CMOVP16rr: case X86::CMOVP16rm: case X86::CMOVP32rr: case X86::CMOVP32rm: case X86::CMOVP64rr: case X86::CMOVP64rm: @@ -2255,18 +2261,16 @@ static bool hasNoSignedComparisonUses(SDNode *N) { /// Test whether the given node which sets flags has any uses which require the /// CF flag to be accurate. -static bool hasNoCarryFlagUses(SDNode *N) { +static bool hasNoCarryFlagUses(SDValue Flags) { // Examine each user of the node. - for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; - ++UI) { + for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); + UI != UE; ++UI) { // Only check things that use the flags. - if (UI.getUse().getResNo() != 1) + if (UI.getUse().getResNo() != Flags.getResNo()) continue; - // Only examine CopyToReg uses. - if (UI->getOpcode() != ISD::CopyToReg) - return false; // Only examine CopyToReg uses that copy to EFLAGS. - if (cast(UI->getOperand(1))->getReg() != X86::EFLAGS) + if (UI->getOpcode() != ISD::CopyToReg || + cast(UI->getOperand(1))->getReg() != X86::EFLAGS) return false; // Examine each user of the CopyToReg use. for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); @@ -2631,7 +2635,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { (-OperandV).getMinSignedBits() <= 8) || (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 && (-OperandV).getMinSignedBits() <= 32)) && - hasNoCarryFlagUses(StoredVal.getNode())) { + hasNoCarryFlagUses(StoredVal.getValue(1))) { OperandV = -OperandV; Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; } @@ -2858,10 +2862,28 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 // Shift NBits left by 8 bits, thus producing 'control'. + // This makes the low 8 bits to be zero. SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); SDValue Control = CurDAG->getNode(ISD::SHL, DL, NVT, NBits, C8); insertDAGNode(*CurDAG, OrigNBits, Control); - // NOTE: could also try to extract start from (x >> start) + + // If the 'X' is *logically* shifted, we can fold that shift into 'control'. + if (X.getOpcode() == ISD::SRL) { + SDValue ShiftAmt = X.getOperand(1); + X = X.getOperand(0); + + assert(ShiftAmt.getValueType() == MVT::i8 && + "Expected shift amount to be i8"); + + // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! + SDValue OrigShiftAmt = ShiftAmt; + ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, NVT, ShiftAmt); + insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt); + + // And now 'or' these low 8 bits of shift amount into the 'control'. + Control = CurDAG->getNode(ISD::OR, DL, NVT, Control, ShiftAmt); + insertDAGNode(*CurDAG, OrigNBits, Control); + } // And finally, form the BEXTR itself. SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, NVT, X, Control); @@ -3392,14 +3414,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) { unsigned Opc, MOpc; bool isSigned = Opcode == ISD::SMUL_LOHI; - bool hasBMI2 = Subtarget->hasBMI2(); if (!isSigned) { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); - case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r; - MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break; - case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r; - MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break; + case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break; + case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break; } } else { switch (NVT.SimpleTy) { @@ -3420,12 +3439,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case X86::MUL64r: SrcReg = LoReg = X86::RAX; HiReg = X86::RDX; break; - case X86::MULX32rr: - SrcReg = X86::EDX; LoReg = HiReg = 0; - break; - case X86::MULX64rr: - SrcReg = X86::RDX; LoReg = HiReg = 0; - break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; @@ -3439,26 +3452,15 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg, N0, SDValue()).getValue(1); - SDValue ResHi, ResLo; - if (foldedLoad) { SDValue Chain; MachineSDNode *CNode = nullptr; SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; - if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) { - SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue); - CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); - ResHi = SDValue(CNode, 0); - ResLo = SDValue(CNode, 1); - Chain = SDValue(CNode, 2); - InFlag = SDValue(CNode, 3); - } else { - SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); - CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); - Chain = SDValue(CNode, 0); - InFlag = SDValue(CNode, 1); - } + SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); + CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + Chain = SDValue(CNode, 0); + InFlag = SDValue(CNode, 1); // Update the chain. ReplaceUses(N1.getValue(1), Chain); @@ -3466,39 +3468,27 @@ void X86DAGToDAGISel::Select(SDNode *Node) { CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); } else { SDValue Ops[] = { N1, InFlag }; - if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) { - SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue); - SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); - ResHi = SDValue(CNode, 0); - ResLo = SDValue(CNode, 1); - InFlag = SDValue(CNode, 2); - } else { - SDVTList VTs = CurDAG->getVTList(MVT::Glue); - SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); - InFlag = SDValue(CNode, 0); - } + SDVTList VTs = CurDAG->getVTList(MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + InFlag = SDValue(CNode, 0); } // Copy the low half of the result, if it is needed. if (!SDValue(Node, 0).use_empty()) { - if (!ResLo.getNode()) { - assert(LoReg && "Register for low half is not defined!"); - ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, - InFlag); - InFlag = ResLo.getValue(2); - } + assert(LoReg && "Register for low half is not defined!"); + SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, + NVT, InFlag); + InFlag = ResLo.getValue(2); ReplaceUses(SDValue(Node, 0), ResLo); LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the high half of the result, if it is needed. if (!SDValue(Node, 1).use_empty()) { - if (!ResHi.getNode()) { - assert(HiReg && "Register for high half is not defined!"); - ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, - InFlag); - InFlag = ResHi.getValue(2); - } + assert(HiReg && "Register for high half is not defined!"); + SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, + NVT, InFlag); + InFlag = ResHi.getValue(2); ReplaceUses(SDValue(Node, 1), ResHi); LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); @@ -3731,7 +3721,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (isUInt<8>(Mask) && (!(Mask & 0x80) || CmpVT == MVT::i8 || - hasNoSignedComparisonUses(Node))) { + hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, convert "testl %eax, $8" to "testb %al, $8" VT = MVT::i8; SubRegOp = X86::sub_8bit; @@ -3739,7 +3729,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { MOpc = X86::TEST8mi; } else if (OptForMinSize && isUInt<16>(Mask) && (!(Mask & 0x8000) || CmpVT == MVT::i16 || - hasNoSignedComparisonUses(Node))) { + hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, "testl %eax, $32776" to "testw %ax, $32776". // NOTE: We only want to form TESTW instructions if optimizing for // min size. Otherwise we only save one byte and possibly get a length @@ -3753,7 +3743,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Without minsize 16-bit Cmps can get here so we need to // be sure we calculate the correct sign flag if needed. (CmpVT != MVT::i16 || !(Mask & 0x8000))) || - CmpVT == MVT::i32 || hasNoSignedComparisonUses(Node))) { + CmpVT == MVT::i32 || + hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, "testq %rax, $268468232" to "testl %eax, $268468232". // NOTE: We only want to run that transform if N0 is 32 or 64 bits. // Otherwize, we find ourselves in a position where we have to do diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 38d3a30cb19e8f6be885a339ae8957a63cfdfb33..6eb21ab7087d9c001aac3bf9e227fe76894d262e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -19,7 +19,6 @@ #include "X86InstrBuilder.h" #include "X86IntrinsicsInfo.h" #include "X86MachineFunctionInfo.h" -#include "X86ShuffleDecodeConstantPool.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" @@ -196,6 +195,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ABS , MVT::i64 , Custom); } + // Funnel shifts. + for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { + setOperationAction(ShiftOp , MVT::i16 , Custom); + setOperationAction(ShiftOp , MVT::i32 , Custom); + if (Subtarget.is64Bit()) + setOperationAction(ShiftOp , MVT::i64 , Custom); + } + // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); @@ -786,14 +793,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); - setOperationAction(ISD::SDIV, MVT::v2i32, Custom); - setOperationAction(ISD::SREM, MVT::v2i32, Custom); - setOperationAction(ISD::UDIV, MVT::v2i32, Custom); - setOperationAction(ISD::UREM, MVT::v2i32, Custom); + for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8, + MVT::v2i16, MVT::v4i16, MVT::v2i32 }) { + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::SREM, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + setOperationAction(ISD::UREM, VT, Custom); + } setOperationAction(ISD::MUL, MVT::v2i8, Custom); setOperationAction(ISD::MUL, MVT::v2i16, Custom); setOperationAction(ISD::MUL, MVT::v2i32, Custom); + setOperationAction(ISD::MUL, MVT::v4i8, Custom); + setOperationAction(ISD::MUL, MVT::v4i16, Custom); + setOperationAction(ISD::MUL, MVT::v8i8, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); @@ -816,6 +829,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom); } + setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal); + setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal); + setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal); + setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal); + setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal); + setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); + setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); + setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); + // Use widening instead of promotion. + for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8, + MVT::v4i16, MVT::v2i16 }) { + setOperationAction(ISD::UADDSAT, VT, Custom); + setOperationAction(ISD::SADDSAT, VT, Custom); + setOperationAction(ISD::USUBSAT, VT, Custom); + setOperationAction(ISD::SSUBSAT, VT, Custom); + } + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); @@ -850,9 +880,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // scalars) and extend in-register to a legal 128-bit vector type. For sext // loads these must work with a single scalar load. for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); + if (!ExperimentalVectorWideningLegalization) { + // We don't want narrow result types here when widening. + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); + } setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); @@ -861,6 +894,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); } + if (ExperimentalVectorWideningLegalization && + !Subtarget.hasSSE41() && Subtarget.is64Bit()) { + // This lets DAG combine create sextloads that get split and scalarized. + // TODO: Does this make sense? What about v2i8->v2i64? + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Custom); + } + for (auto VT : { MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); @@ -883,10 +924,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); - // Custom legalize these to avoid over promotion. + + // Custom legalize these to avoid over promotion or custom promotion. setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); + + // By marking FP_TO_SINT v8i16 as Custom, will trick type legalization into + // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is + // split again based on the input type, this will cause an AssertSExt i16 to + // be emitted instead of an AssertZExt. This will allow packssdw followed by + // packuswb to be used to truncate to v8i8. This is necessary since packusdw + // isn't available until sse4.1. + setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); @@ -906,7 +963,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. setOperationAction(ISD::LOAD, MVT::v2f32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i16, Custom); + setOperationAction(ISD::LOAD, MVT::v8i8, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i16, Custom); + setOperationAction(ISD::STORE, MVT::v8i8, Custom); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); @@ -918,6 +981,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); + if (ExperimentalVectorWideningLegalization) { + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + + setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); + } + // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { @@ -928,7 +1002,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTL, MVT::v4i32, Custom); setOperationAction(ISD::ROTL, MVT::v8i16, Custom); - setOperationAction(ISD::ROTL, MVT::v16i8, Custom); + + // With BWI, expanding (and promoting the shifts) is the better. + if (!Subtarget.hasBWI()) + setOperationAction(ISD::ROTL, MVT::v16i8, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { @@ -974,17 +1051,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); + if (!ExperimentalVectorWideningLegalization) { + // Avoid narrow result types when widening. The legal types are listed + // in the next loop. + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); + } } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); - setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); + if (!ExperimentalVectorWideningLegalization) + setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); @@ -1060,9 +1142,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRA, VT, Custom); } + if (ExperimentalVectorWideningLegalization) { + // These types need custom splitting if their input is a 128-bit vector. + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); + } + setOperationAction(ISD::ROTL, MVT::v8i32, Custom); setOperationAction(ISD::ROTL, MVT::v16i16, Custom); - setOperationAction(ISD::ROTL, MVT::v32i8, Custom); + + // With BWI, expanding (and promoting the shifts) is the better. + if (!Subtarget.hasBWI()) + setOperationAction(ISD::ROTL, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); @@ -1124,6 +1217,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SMIN, MVT::v4i64, Custom); setOperationAction(ISD::UMIN, MVT::v4i64, Custom); + setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); + setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); + setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); + setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); + setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); + for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); @@ -1241,6 +1343,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::UADDSAT, VT, Custom); + setOperationAction(ISD::SADDSAT, VT, Custom); + setOperationAction(ISD::USUBSAT, VT, Custom); + setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); @@ -1324,6 +1430,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + if (ExperimentalVectorWideningLegalization) { + // Need to custom widen this if we don't have AVX512BW. + setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); + } + for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); @@ -1332,12 +1445,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, VT, Legal); } - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom); - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom); - // Without BWI we need to use custom lowering to handle MVT::v64i8 input. - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); - setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); + for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); + } setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); @@ -1495,6 +1607,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::UADDSAT, VT, Custom); + setOperationAction(ISD::SADDSAT, VT, Custom); + setOperationAction(ISD::USUBSAT, VT, Custom); + setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); @@ -1555,6 +1671,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); @@ -1574,6 +1691,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::UADDSAT, VT, Legal); + setOperationAction(ISD::SADDSAT, VT, Legal); + setOperationAction(ISD::USUBSAT, VT, Legal); + setOperationAction(ISD::SSUBSAT, VT, Legal); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -1736,8 +1857,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); - setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); @@ -4709,6 +4828,14 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } +bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const { + // If we are using XMM registers in the ABI and the condition of the select is + // a floating-point compare and we have blendv or conditional move, then it is + // cheaper to select instead of doing a cross-register move and creating a + // load that depends on the compare result. + return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX(); +} + bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const { // TODO: It might be a win to ease or lift this restriction, but the generic // folds in DAGCombiner conflict with vector folds for an AVX512 target. @@ -4737,6 +4864,12 @@ bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const { (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2(); } +bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, + bool IsSigned) const { + // f80 UINT_TO_FP is more efficient using Strict code if FCMOV is available. + return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov(); +} + bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) @@ -4763,7 +4896,11 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const { - if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1) + if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() && + BitcastVT.getVectorElementType() == MVT::i1) + return false; + + if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8) return false; return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT); @@ -5456,25 +5593,29 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { return DAG.getBitcast(VT, Vec); } -static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In, +static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); - assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode"); - - if (VT.is128BitVector() && InVT.is128BitVector()) - return DAG.getNode(X86ISD::VSEXT == Opc ? ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG, - DL, VT, In); + assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. - if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) { - int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); + if (InVT.getSizeInBits() > 128) { + assert(VT.getSizeInBits() == InVT.getSizeInBits() && + "Expected VTs to be the same size!"); + unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); In = extractSubVector(In, 0, DAG, DL, - std::max(128, (int)VT.getSizeInBits() / Scale)); + std::max(128U, VT.getSizeInBits() / Scale)); + InVT = In.getValueType(); } - return DAG.getNode(Opc, DL, VT, In); + if (VT.getVectorNumElements() == InVT.getVectorNumElements()) + return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, + DL, VT, In); + + return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG + : ISD::ZERO_EXTEND_VECTOR_INREG, + DL, VT, In); } /// Returns a vector_shuffle node for an unpackl operation. @@ -5869,6 +6010,31 @@ static void createPackShuffleMask(MVT VT, SmallVectorImpl &Mask, } } +// Split the demanded elts of a PACKSS/PACKUS node between its operands. +static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, + APInt &DemandedLHS, APInt &DemandedRHS) { + int NumLanes = VT.getSizeInBits() / 128; + int NumElts = DemandedElts.getBitWidth(); + int NumInnerElts = NumElts / 2; + int NumEltsPerLane = NumElts / NumLanes; + int NumInnerEltsPerLane = NumInnerElts / NumLanes; + + DemandedLHS = APInt::getNullValue(NumInnerElts); + DemandedRHS = APInt::getNullValue(NumInnerElts); + + // Map DemandedElts to the packed operands. + for (int Lane = 0; Lane != NumLanes; ++Lane) { + for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) { + int OuterIdx = (Lane * NumEltsPerLane) + Elt; + int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt; + if (DemandedElts[OuterIdx]) + DemandedLHS.setBit(InnerIdx); + if (DemandedElts[OuterIdx + NumInnerEltsPerLane]) + DemandedRHS.setBit(InnerIdx); + } + } +} + /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. @@ -6146,8 +6312,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, Ops.push_back(N->getOperand(0)); Ops.push_back(N->getOperand(2)); SDValue MaskNode = N->getOperand(1); - if (auto *C = getTargetConstantFromNode(MaskNode)) { - DecodeVPERMV3Mask(C, MaskEltSize, VT.getSizeInBits(), Mask); + if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, + RawUndefs)) { + DecodeVPERMV3Mask(RawMask, RawUndefs, Mask); break; } return false; @@ -6529,7 +6696,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, return true; } case ISD::ZERO_EXTEND_VECTOR_INREG: - case X86ISD::VZEXT: { + case ISD::ZERO_EXTEND: { // TODO - add support for VPMOVZX with smaller input vector types. SDValue Src = N.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); @@ -6586,8 +6753,7 @@ static bool resolveTargetShuffleInputs(SDValue Op, return false; resolveTargetShuffleInputsAndMask(Inputs, Mask); - // TODO - Add support for more than 2 inputs. - return Inputs.size() <= 2; + return true; } /// Returns the scalar element that will make up the ith @@ -8576,9 +8742,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If we are inserting one variable into a vector of non-zero constants, try // to avoid loading each constant element as a scalar. Load the constants as a // vector and then insert the variable scalar element. If insertion is not - // supported, we assume that we will fall back to a shuffle to get the scalar - // blended with the constants. Insertion into a zero vector is handled as a - // special-case somewhere below here. + // supported, fall back to a shuffle to get the scalar blended with the + // constants. Insertion into a zero vector is handled as a special-case + // somewhere below here. if (NumConstants == NumElems - 1 && NumNonZero != 1 && (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) || isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) { @@ -8616,7 +8782,21 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF); SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI); - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); + unsigned InsertC = cast(InsIndex)->getZExtValue(); + unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits(); + if (InsertC < NumEltsInLow128Bits) + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); + + // There's no good way to insert into the high elements of a >128-bit + // vector, so use shuffles to avoid an extract/insert sequence. + assert(VT.getSizeInBits() > 128 && "Invalid insertion index?"); + assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector"); + SmallVector ShuffleMask; + unsigned NumElts = VT.getVectorNumElements(); + for (unsigned i = 0; i != NumElts; ++i) + ShuffleMask.push_back(i == InsertC ? NumElts : i); + SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt); + return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask); } // Special case for single non-zero, non-undef, element. @@ -10051,6 +10231,11 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, assert((VT.is128BitVector() || Subtarget.hasAVX2()) && "256-bit byte-blends require AVX2 support!"); + // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) + return Masked; + if (Subtarget.hasBWI() && Subtarget.hasVLX()) { MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); @@ -10058,11 +10243,6 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } - // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. - if (SDValue Masked = - lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) - return Masked; - // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; @@ -10129,7 +10309,8 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, - SelectionDAG &DAG) { + SelectionDAG &DAG, + bool ImmBlends = false) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. SmallVector BlendMask(Mask.size(), -1); @@ -10149,6 +10330,12 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, PermuteMask[i] = Mask[i] % Size; } + // If only immediate blends, then bail if the blend mask can't be widened to + // i16. + unsigned EltSize = VT.getScalarSizeInBits(); + if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask)) + return SDValue(); + SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } @@ -10219,6 +10406,92 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask); } +/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then +/// permuting the elements of the result in place. +static SDValue lowerVectorShuffleAsByteRotateAndPermute( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { + if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || + (VT.is256BitVector() && !Subtarget.hasAVX2()) || + (VT.is512BitVector() && !Subtarget.hasBWI())) + return SDValue(); + + // We don't currently support lane crossing permutes. + if (is128BitLaneCrossingShuffleMask(VT, Mask)) + return SDValue(); + + int Scale = VT.getScalarSizeInBits() / 8; + int NumLanes = VT.getSizeInBits() / 128; + int NumElts = VT.getVectorNumElements(); + int NumEltsPerLane = NumElts / NumLanes; + + // Determine range of mask elts. + bool Blend1 = true; + bool Blend2 = true; + std::pair Range1 = std::make_pair(INT_MAX, INT_MIN); + std::pair Range2 = std::make_pair(INT_MAX, INT_MIN); + for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { + for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { + int M = Mask[Lane + Elt]; + if (M < 0) + continue; + if (M < NumElts) { + Blend1 &= (M == (Lane + Elt)); + assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); + M = M % NumEltsPerLane; + Range1.first = std::min(Range1.first, M); + Range1.second = std::max(Range1.second, M); + } else { + M -= NumElts; + Blend2 &= (M == (Lane + Elt)); + assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); + M = M % NumEltsPerLane; + Range2.first = std::min(Range2.first, M); + Range2.second = std::max(Range2.second, M); + } + } + } + + // Bail if we don't need both elements. + // TODO - it might be worth doing this for unary shuffles if the permute + // can be widened. + if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) || + !(0 <= Range2.first && Range2.second < NumEltsPerLane)) + return SDValue(); + + if (VT.getSizeInBits() > 128 && (Blend1 || Blend2)) + return SDValue(); + + // Rotate the 2 ops so we can access both ranges, then permute the result. + auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) { + MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); + SDValue Rotate = DAG.getBitcast( + VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi), + DAG.getBitcast(ByteVT, Lo), + DAG.getConstant(Scale * RotAmt, DL, MVT::i8))); + SmallVector PermMask(NumElts, SM_SentinelUndef); + for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { + for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { + int M = Mask[Lane + Elt]; + if (M < 0) + continue; + if (M < NumElts) + PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane); + else + PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane); + } + } + return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask); + }; + + // Check if the ranges are small enough to rotate from either direction. + if (Range2.second < Range1.first) + return RotateAndPermute(V1, V2, Range1.first, 0); + if (Range1.second < Range2.first) + return RotateAndPermute(V2, V1, Range2.first, NumElts); + return SDValue(); +} + /// Generic routine to decompose a shuffle and blend into independent /// blends and permutes. /// @@ -10226,11 +10499,9 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. -static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL, - MVT VT, SDValue V1, - SDValue V2, - ArrayRef Mask, - SelectionDAG &DAG) { +static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and // blend them together. SmallVector V1Mask(Mask.size(), -1); @@ -10245,18 +10516,26 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL, BlendMask[i] = i + Size; } - // Try to lower with the simpler initial blend/unpack strategies unless one of - // the input shuffles would be a no-op. We prefer to shuffle inputs as the - // shuffle may be able to fold with a load or other benefit. However, when - // we'll have to do 2x as many shuffles in order to achieve this, - // blending/unpacking first is a better strategy. + // Try to lower with the simpler initial blend/unpack/rotate strategies unless + // one of the input shuffles would be a no-op. We prefer to shuffle inputs as + // the shuffle may be able to fold with a load or other benefit. However, when + // we'll have to do 2x as many shuffles in order to achieve this, a 2-input + // pre-shuffle first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { - if (SDValue BlendPerm = - lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) + // Only prefer immediate blends to unpack/rotate. + if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( + DL, VT, V1, V2, Mask, DAG, true)) return BlendPerm; if (SDValue UnpackPerm = lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) return UnpackPerm; + if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute( + DL, VT, V1, V2, Mask, Subtarget, DAG)) + return RotatePerm; + // Unpack/rotate failed - try again with variable blends. + if (SDValue BlendPerm = + lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) + return BlendPerm; } V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); @@ -10767,7 +11046,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG); + InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -11305,7 +11584,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue BC = peekThroughBitcasts(V); // Also check the simpler case, where we can directly reuse the scalar. - if (V.getOpcode() == ISD::BUILD_VECTOR || + if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); @@ -11791,7 +12070,7 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, - Mask, DAG); + Mask, Subtarget, DAG); // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. @@ -12101,7 +12380,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, - Mask, DAG); + Mask, Subtarget, DAG); // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( @@ -12816,7 +13095,7 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, - Mask, DAG); + Mask, Subtarget, DAG); } /// Check whether a compaction lowering can be done by dropping even @@ -12949,6 +13228,10 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + return V; + // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies @@ -13092,6 +13375,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); + + // Use PALIGNR+Permute if possible - permute might become PSHUFB but the + // PALIGNR will be cheaper than the second PSHUFB+OR. + if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute( + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + return V; } return PSHUFB; @@ -13147,7 +13436,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Handle multi-input cases by blending single-input shuffles. if (NumV2Elements > 0) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, - Mask, DAG); + Mask, Subtarget, DAG); // The fallback path for single-input shuffles widens this into two v8i16 // vectors with unpacks, shuffles those, and then pulls them back together @@ -13360,6 +13649,7 @@ static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This routine must not be used to lower single-input " "shuffles as it could then recurse on itself."); @@ -13386,7 +13676,7 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, }; if (DoBothBroadcast()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, - DAG); + Subtarget, DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to @@ -13404,7 +13694,8 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, // Otherwise, just fall back to decomposed shuffles and a blend. This requires // that the decomposed single-input shuffles don't end up here. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, + Subtarget, DAG); } /// Lower a vector shuffle crossing multiple 128-bit lanes as @@ -14247,10 +14538,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, // can fully permute the elements. if (Subtarget.hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, - Mask, DAG); + Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 4-lane 64-bit integer shuffles. @@ -14344,7 +14636,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, - Mask, DAG); + Mask, Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit floating point shuffles. @@ -14433,17 +14725,18 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // vpunpckhwd instrs than vblend. if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, - Mask, DAG)) + Mask, Subtarget, DAG)) return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, - Mask, DAG); + Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit integer shuffles. @@ -14472,8 +14765,8 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // vpunpcklwd and vpunpckhwd instrs. if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && !Subtarget.hasAVX512()) - if (SDValue V = - lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG)) + if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, + Mask, Subtarget, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, @@ -14556,7 +14849,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, - Mask, DAG); + Mask, Subtarget, DAG); } /// Handle lowering of 16-lane 16-bit integer shuffles. @@ -14657,7 +14950,8 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, return V; // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 32-lane 8-bit integer shuffles. @@ -14747,7 +15041,8 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, return V; // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG); } /// High-level routine to lower various 256-bit x86 vector shuffles. @@ -15249,6 +15544,11 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) return V; + // Use dedicated pack instructions for masks that match their pattern. + if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, + Subtarget)) + return V; + // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -15609,6 +15909,14 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SmallVector WidenedMask; if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(ZeroableMask, WidenedMask)) { + // Shuffle mask widening should not interfere with a broadcast opportunity + // by obfuscating the operands with bitcasts. + // TODO: Avoid lowering directly from this top-level function: make this + // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. + if (SDValue Broadcast = + lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) + return Broadcast; + MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); @@ -15726,7 +16034,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { // Build a mask by testing the condition against zero. MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond, - getZeroVector(CondVT, Subtarget, DAG, dl), + DAG.getConstant(0, dl, CondVT), ISD::SETNE); // Now return a new VSELECT using the mask. return DAG.getSelect(dl, VT, Mask, LHS, RHS); @@ -16716,6 +17024,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { /// Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. +/// TODO: Can this be moved to general expansion code? static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); MVT VT = Op.getSimpleValueType(); @@ -16725,8 +17034,8 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the - // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away + // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and + // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away // during isel. SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits - 1, dl, MVT::i8)); @@ -16736,10 +17045,10 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { - Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); + Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); } else { - Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); + Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); } @@ -16763,6 +17072,37 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { return DAG.getMergeValues({ Lo, Hi }, dl); } +static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) && + "Unexpected funnel shift opcode!"); + assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && + "Unexpected funnel shift type!"); + + SDLoc DL(Op); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Amt = Op.getOperand(2); + + // Expand slow SHLD/SHRD cases if we are not optimizing for size. + bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + if (!OptForSize && Subtarget.isSHLDSlow()) + return SDValue(); + + bool IsFSHR = Op.getOpcode() == ISD::FSHR; + if (IsFSHR) + std::swap(Op0, Op1); + + // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo. + if (VT == MVT::i16) + Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, + DAG.getConstant(15, DL, Amt.getValueType())); + + unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD); + return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt); +} + // Try to use a packed vector operation to handle i64 on 32-bit targets when // AVX512DQ is enabled. static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, @@ -17450,8 +17790,19 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); + // Custom legalize v8i8->v8i64 on CPUs without avx512bw. + if (InVT == MVT::v8i8) { + if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) + return SDValue(); + + In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), + MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); + // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input. + return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In); + } + if (Subtarget.hasInt256()) - return DAG.getNode(X86ISD::VZEXT, dl, VT, In); + return Op; // Optimize vectors in AVX mode: // @@ -17534,7 +17885,7 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op, } SDValue One = DAG.getConstant(1, DL, WideVT); - SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL); + SDValue Zero = DAG.getConstant(0, DL, WideVT); SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero); @@ -17742,10 +18093,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, } // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m. if (Subtarget.hasDQI()) - return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), - In, ISD::SETGT); - return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL), - ISD::SETNE); + return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT); + return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE); } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { @@ -17758,20 +18107,22 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); + // If called by the legalizer just return. + if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) + return SDValue(); + if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget.hasAVX512()) { - // word to byte only under BWI - if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8 - // Make sure we're allowed to promote 512-bits. - if (Subtarget.canExtendTo512DQ()) - return DAG.getNode(ISD::TRUNCATE, DL, VT, - DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); - } else { + // word to byte only under BWI. Otherwise we have to promoted to v16i32 + // and then truncate that. But we should only do that if we haven't been + // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be + // handled by isel patterns. + if (InVT != MVT::v16i16 || Subtarget.hasBWI() || + Subtarget.canExtendTo512DQ()) return Op; - } } unsigned NumPackedSignBits = std::min(VT.getScalarSizeInBits(), 16); @@ -17859,6 +18210,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { return DAG.getBitcast(MVT::v8i16, res); } + if (VT == MVT::v16i8 && InVT == MVT::v16i16) { + // Use an AND to zero uppper bits for PACKUS. + In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT)); + + SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In, + DAG.getIntPtrConstant(0, DL)); + SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In, + DAG.getIntPtrConstant(8, DL)); + return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi); + } + // Handle truncation of V256 to V128 using shuffles. assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); @@ -18213,8 +18575,8 @@ static bool hasNonFlagsUse(SDValue Op) { /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. -SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, - SelectionDAG &DAG) const { +static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { // CF and OF aren't always set the way we want. Determine which // of these we need. bool NeedCF = false; @@ -18257,27 +18619,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, unsigned Opcode = 0; unsigned NumOperands = 0; - // Truncate operations may prevent the merge of the SETCC instruction - // and the arithmetic instruction before it. Attempt to truncate the operands - // of the arithmetic instruction and use a reduced bit-width instruction. - bool NeedTruncation = false; SDValue ArithOp = Op; - if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { - SDValue Arith = Op->getOperand(0); - // Both the trunc and the arithmetic op need to have one user each. - if (Arith->hasOneUse()) - switch (Arith.getOpcode()) { - default: break; - case ISD::ADD: - case ISD::SUB: - case ISD::AND: - case ISD::OR: - case ISD::XOR: { - NeedTruncation = true; - ArithOp = Arith; - } - } - } // Sometimes flags can be set either with an AND or with an SRL/SHL // instruction. SRL/SHL variant should be preferred for masks longer than this @@ -18326,27 +18668,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, Opcode = X86ISD::ADD; NumOperands = 2; break; - case ISD::SHL: - case ISD::SRL: - // If we have a constant logical shift that's only used in a comparison - // against zero turn it into an equivalent AND. This allows turning it into - // a TEST instruction later. - if (ZeroCheck && Op->hasOneUse() && - isa(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { - EVT VT = Op.getValueType(); - unsigned BitWidth = VT.getSizeInBits(); - unsigned ShAmt = Op->getConstantOperandVal(1); - if (ShAmt >= BitWidth) // Avoid undefined shifts. - break; - APInt Mask = ArithOp.getOpcode() == ISD::SRL - ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) - : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); - if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) - break; - Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), - DAG.getConstant(Mask, dl, VT)); - } - break; case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, @@ -18389,8 +18710,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, if (LeadingOnes + TrailingZeros == BitWidth) { assert(TrailingZeros < VT.getSizeInBits() && "Shift amount should be less than the type width"); - MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); - SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy); + SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, MVT::i8); Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt); break; } @@ -18401,8 +18721,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, if (LeadingZeros + TrailingOnes == BitWidth) { assert(LeadingZeros < VT.getSizeInBits() && "Shift amount should be less than the type width"); - MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); - SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy); + SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, MVT::i8); Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt); break; } @@ -18447,36 +18766,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, break; } - // If we found that truncation is beneficial, perform the truncation and - // update 'Op'. - if (NeedTruncation) { - EVT VT = Op.getValueType(); - SDValue WideVal = Op->getOperand(0); - EVT WideVT = WideVal.getValueType(); - unsigned ConvertedOp = 0; - // Use a target machine opcode to prevent further DAGCombine - // optimizations that may separate the arithmetic operations - // from the setcc node. - switch (WideVal.getOpcode()) { - default: break; - case ISD::ADD: ConvertedOp = X86ISD::ADD; break; - case ISD::SUB: ConvertedOp = X86ISD::SUB; break; - case ISD::AND: ConvertedOp = X86ISD::AND; break; - case ISD::OR: ConvertedOp = X86ISD::OR; break; - case ISD::XOR: ConvertedOp = X86ISD::XOR; break; - } - - if (ConvertedOp) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { - SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); - SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1); - } - } - } - if (Opcode == 0) { // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, @@ -18495,10 +18784,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG) const { if (isNullConstant(Op1)) - return EmitTest(Op0, X86CC, dl, DAG); - - assert(!(isa(Op1) && Op0.getValueType() == MVT::i1) && - "Unexpected comparison operation for MVT::i1 operands"); + return EmitTest(Op0, X86CC, dl, DAG, Subtarget); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { @@ -18521,6 +18807,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); return SDValue(Sub.getNode(), 1); } + assert(Op0.getValueType().isFloatingPoint() && "Unexpected VT!"); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); } @@ -18830,34 +19117,32 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode); } -/// Try to turn a VSETULT into a VSETULE by modifying its second -/// operand \p Op1. If non-trivial (for example because it's not constant) -/// return an empty value. -static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1, - SelectionDAG &DAG) { - BuildVectorSDNode *BV = dyn_cast(Op1.getNode()); +/// Given a simple buildvector constant, return a new vector constant with each +/// element decremented. If decrementing would result in underflow or this +/// is not a simple vector constant, return an empty value. +static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) { + auto *BV = dyn_cast(V.getNode()); if (!BV) return SDValue(); - MVT VT = Op1.getSimpleValueType(); - MVT EVT = VT.getVectorElementType(); - unsigned n = VT.getVectorNumElements(); - SmallVector ULTOp1; - - for (unsigned i = 0; i < n; ++i) { - ConstantSDNode *Elt = dyn_cast(BV->getOperand(i)); - if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT) + MVT VT = V.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + SmallVector NewVecC; + SDLoc DL(V); + for (unsigned i = 0; i < NumElts; ++i) { + auto *Elt = dyn_cast(BV->getOperand(i)); + if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT) return SDValue(); // Avoid underflow. - APInt Val = Elt->getAPIntValue(); - if (Val == 0) + if (Elt->getAPIntValue().isNullValue()) return SDValue(); - ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT)); + NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT)); } - return DAG.getBuildVector(VT, dl, ULTOp1); + return DAG.getBuildVector(VT, DL, NewVecC); } /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for @@ -18886,7 +19171,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget.hasAVX()) return SDValue(); - SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG); + SDValue ULEOp1 = decrementVectorConstant(Op1, DAG); if (!ULEOp1) return SDValue(); Op1 = ULEOp1; @@ -18900,9 +19185,9 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, break; } - SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1); + SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1); return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, - getZeroVector(VT, Subtarget, DAG, dl)); + DAG.getConstant(0, dl, VT)); } static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, @@ -19065,13 +19350,26 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) && !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1)); - // Special case: Use min/max operations for unsigned compares. We only want - // to do this for unsigned compares if we need to flip signs or if it allows - // use to avoid an invert. + // Special case: Use min/max operations for unsigned compares. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (ISD::isUnsignedIntSetCC(Cond) && (FlipSigns || ISD::isTrueWhenEqual(Cond)) && TLI.isOperationLegal(ISD::UMIN, VT)) { + // If we have a constant operand, increment/decrement it and change the + // condition to avoid an invert. + // TODO: This could be extended to handle a non-splat constant by checking + // that each element of the constant is not the max/null value. + APInt C; + if (Cond == ISD::SETUGT && isConstantSplat(Op1, C) && !C.isMaxValue()) { + // X > C --> X >= (C+1) --> X == umax(X, C+1) + Op1 = DAG.getConstant(C + 1, dl, VT); + Cond = ISD::SETUGE; + } + if (Cond == ISD::SETULT && isConstantSplat(Op1, C) && !C.isNullValue()) { + // X < C --> X <= (C-1) --> X == umin(X, C-1) + Op1 = DAG.getConstant(C - 1, dl, VT); + Cond = ISD::SETULE; + } bool Invert = false; unsigned Opc; switch (Cond) { @@ -19381,7 +19679,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // of 3 logic instructions for size savings and potentially speed. // Unfortunately, there is no scalar form of VBLENDV. - // If either operand is a constant, don't try this. We can expect to + // If either operand is a +0.0 constant, don't try this. We can expect to // optimize away at least one of the logic instructions later in that // case, so that sequence would be faster than a variable blend. @@ -19389,13 +19687,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // uses XMM0 as the selection register. That may need just as many // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. - - if (Subtarget.hasAVX() && - !isa(Op1) && !isa(Op2)) { - + if (Subtarget.hasAVX() && !isNullFPConstant(Op1) && + !isNullFPConstant(Op2)) { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. - MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); @@ -19489,22 +19784,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x == 0), 0, -1) -> neg & sbb if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { - SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); - SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0); - SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, DL, MVT::i8), - SDValue(Neg.getNode(), 1)); - return Res; + SDValue Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + Zero = DAG.getConstant(0, DL, Op.getValueType()); + return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp); } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); Cmp = ConvertCmpIfNecessary(Cmp, DAG); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SDValue Zero = DAG.getConstant(0, DL, Op.getValueType()); SDValue Res = // Res = 0 or -1. - DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); + DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); @@ -19632,7 +19926,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (AddTest) { CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); - Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); + Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()), + X86::COND_NE, DL, DAG); } // a < b ? -1 : 0 -> RES = ~setcc_carry @@ -19723,8 +20018,8 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) { V = DAG.getNode(Op.getOpcode(), dl, WideVT, In); } else { - SDValue NegOne = getOnesVector(WideVT, DAG, dl); - SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl); + SDValue NegOne = DAG.getConstant(-1, dl, WideVT); + SDValue Zero = DAG.getConstant(0, dl, WideVT); V = DAG.getSelect(dl, WideVT, In, NegOne, Zero); } @@ -19764,7 +20059,6 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); - assert(VT.getSizeInBits() == InVT.getSizeInBits()); MVT SVT = VT.getVectorElementType(); MVT InSVT = InVT.getVectorElementType(); @@ -19785,11 +20079,12 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, // For 256-bit vectors, we only need the lower (128-bit) half of the input. // For 512-bit vectors, we need 128-bits or 256-bits. - if (VT.getSizeInBits() > 128) { + if (InVT.getSizeInBits() > 128) { // Input needs to be at least the same number of elements as output, and // at least 128-bits. int InSize = InSVT.getSizeInBits() * NumElts; In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); + InVT = In.getSimpleValueType(); } // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results, @@ -19797,8 +20092,15 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, // need to be handled here for 256/512-bit results. if (Subtarget.hasInt256()) { assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); + + if (InVT.getVectorNumElements() != NumElts) + return DAG.getNode(Op.getOpcode(), dl, VT, In); + + // FIXME: Apparently we create inreg operations that could be regular + // extends. unsigned ExtOpc = - Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? X86ISD::VSEXT : X86ISD::VZEXT; + Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; return DAG.getNode(ExtOpc, dl, VT, In); } @@ -19808,7 +20110,6 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, int HalfNumElts = NumElts / 2; MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts); - InVT = In.getSimpleValueType(); unsigned NumSrcElts = InVT.getVectorNumElements(); SmallVector HiMask(NumSrcElts, SM_SentinelUndef); for (int i = 0; i != HalfNumElts; ++i) @@ -19822,39 +20123,46 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, // We should only get here for sign extend. assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!"); + assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs"); // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. SDValue Curr = In; - MVT CurrVT = InVT; + SDValue SignExt = Curr; // As SRAI is only available on i16/i32 types, we expand only up to i32 // and handle i64 separately. - while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { - Curr = getUnpackl(DAG, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); - MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); - CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); - Curr = DAG.getBitcast(CurrVT, Curr); - } + if (InVT != MVT::v4i32) { + MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT; - SDValue SignExt = Curr; - if (CurrVT != InVT) { - unsigned SignExtShift = - CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits(); - SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, + unsigned DestWidth = DestVT.getScalarSizeInBits(); + unsigned Scale = DestWidth / InSVT.getSizeInBits(); + + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned DestElts = DestVT.getVectorNumElements(); + + // Build a shuffle mask that takes each input element and places it in the + // MSBs of the new element size. + SmallVector Mask(InNumElts, SM_SentinelUndef); + for (unsigned i = 0; i != DestElts; ++i) + Mask[i * Scale + (Scale - 1)] = i; + + Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask); + Curr = DAG.getBitcast(DestVT, Curr); + + unsigned SignExtShift = DestWidth - InSVT.getSizeInBits(); + SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr, DAG.getConstant(SignExtShift, dl, MVT::i8)); } - if (CurrVT == VT) - return SignExt; - - if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { - SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, - DAG.getConstant(31, dl, MVT::i8)); - SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); - return DAG.getBitcast(VT, Ext); + if (VT == MVT::v2i64) { + assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT"); + SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32); + SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT); + SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5}); + SignExt = DAG.getBitcast(VT, SignExt); } - return SDValue(); + return SignExt; } static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, @@ -19879,8 +20187,18 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); + // Custom legalize v8i8->v8i64 on CPUs without avx512bw. + if (InVT == MVT::v8i8) { + if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) + return SDValue(); + + In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), + MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); + return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In); + } + if (Subtarget.hasInt256()) - return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + return Op; // Optimize vectors in AVX mode // Sign extend v8i16 to v8i32 and @@ -19913,7 +20231,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(St); SDValue StoredVal = St->getValue(); - // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. + // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores. if (StoredVal.getValueType().isVector() && StoredVal.getValueType().getVectorElementType() == MVT::i1) { assert(StoredVal.getValueType().getVectorNumElements() <= 8 && @@ -19935,14 +20253,23 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, if (St->isTruncatingStore()) return SDValue(); - assert(StoredVal.getValueType() == MVT::v2f32 && "Unexpected VT"); + MVT StoreVT = StoredVal.getSimpleValueType(); + assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && + "Unexpected VT"); + if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != + TargetLowering::TypeWidenVector) + return SDValue(); - // Widen the vector, cast to a v2x64 type, extract the single 64-bit - // element and store it. - StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, StoredVal, - DAG.getUNDEF(MVT::v2f32)); - StoredVal = DAG.getBitcast(MVT::v2f64, StoredVal); - StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, StoredVal, + // Widen the vector, cast to a v2x64 type, extract the single 64-bit element + // and store it. + MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), + StoreVT.getVectorNumElements() * 2); + StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, + DAG.getUNDEF(StoreVT)); + MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; + MVT CastVT = MVT::getVectorVT(StVT, 2); + StoredVal = DAG.getBitcast(CastVT, StoredVal); + StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, DAG.getIntPtrConstant(0, dl)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), @@ -19952,7 +20279,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, // Lower vector extended loads using a shuffle. If SSSE3 is not available we // may emit an illegal shuffle but the expansion is still better than scalar -// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise +// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. // TODO: It is possible to support ZExt by zeroing the undef values during @@ -19960,16 +20287,16 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT RegVT = Op.getSimpleValueType(); - assert(RegVT.isVector() && "We only custom lower vector sext loads."); + assert(RegVT.isVector() && "We only custom lower vector loads."); assert(RegVT.isInteger() && - "We only custom lower integer vector sext loads."); + "We only custom lower integer vector loads."); LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); EVT MemVT = Ld->getMemoryVT(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. - if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) { + if (RegVT.getVectorElementType() == MVT::i1) { assert(EVT(RegVT) == MemVT && "Expected non-extending load"); assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && @@ -20072,26 +20399,26 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && "Can only lower sext loads with a single scalar load!"); - unsigned loadRegZize = RegSz; + unsigned loadRegSize = RegSz; if (Ext == ISD::SEXTLOAD && RegSz >= 256) - loadRegZize = 128; + loadRegSize = 128; // If we don't have BWI we won't be able to create the shuffle needed for // v8i8->v8i64. if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) - loadRegZize = 128; + loadRegSize = 128; // Represent our vector as a sequence of elements which are the // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT( - *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); + *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits()); // Represent the data using the same element type that is stored in // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - loadRegZize / MemVT.getScalarSizeInBits()); + loadRegSize / MemVT.getScalarSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); @@ -20137,25 +20464,13 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { - // If we have SSE4.1, we can directly emit a VSEXT node. - if (Subtarget.hasSSE41()) { - SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG); - return DAG.getMergeValues({Sext, TF}, dl); - } - - // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest - // lanes. - assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && - "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"); - - SDValue Shuff = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, RegVT, - SlicedVec); - return DAG.getMergeValues({Shuff, TF}, dl); + SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG); + return DAG.getMergeValues({Sext, TF}, dl); } if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) { - SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG); + SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } @@ -20459,7 +20774,8 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; CC = DAG.getConstant(X86Cond, dl, MVT::i8); - Cond = EmitTest(Cond, X86Cond, dl, DAG); + Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()), + X86Cond, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), @@ -20967,7 +21283,9 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDLoc dl(Op); assert(Mask.getValueType() == MVT::i8 && "Unexpect type"); - SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask); + SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1, + DAG.getBitcast(MVT::v8i1, Mask), + DAG.getIntPtrConstant(0, dl)); if (Op.getOpcode() == X86ISD::FSETCCM || Op.getOpcode() == X86ISD::FSETCCM_RND || Op.getOpcode() == X86ISD::VFPCLASSS) @@ -21595,10 +21913,19 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case ADX: { SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32); - SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1), - DAG.getConstant(-1, dl, MVT::i8)); - SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2), - Op.getOperand(3), GenCF.getValue(1)); + + SDValue Res; + // If the carry in is zero, then we should just use ADD/SUB instead of + // ADC/SBB. + if (isNullConstant(Op.getOperand(1))) { + Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2), + Op.getOperand(3)); + } else { + SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1), + DAG.getConstant(-1, dl, MVT::i8)); + Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2), + Op.getOperand(3), GenCF.getValue(1)); + } SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG); SDValue Results[] = { SetCC, Res }; return DAG.getMergeValues(Results, dl); @@ -22859,11 +23186,10 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, // we just take the hi result (by masking the lo result to zero before the // add). SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0)); - SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL); + SDValue Zero = DAG.getConstant(0, DL, CurrVT); - SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT); SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT); - SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask); + SDValue Lo = Op0; SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift); SDValue HiZ; if (CurrVT.is512BitVector()) { @@ -23078,6 +23404,28 @@ static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) { return split256IntArith(Op, DAG); } +static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + if (VT.getScalarType() == MVT::i1) { + SDLoc dl(Op); + switch (Op.getOpcode()) { + default: llvm_unreachable("Expected saturated arithmetic opcode"); + case ISD::UADDSAT: + case ISD::SADDSAT: + return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1)); + case ISD::USUBSAT: + case ISD::SSUBSAT: + return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), + DAG.getNOT(dl, Op.getOperand(1), VT)); + } + } + + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return split256IntArith(Op, DAG); +} + static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) { @@ -23155,53 +23503,49 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16 // vector pairs, multiply and truncate. if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) { - if (Subtarget.hasInt256()) { - // For 512-bit vectors, split into 256-bit vectors to allow the - // sign-extension to occur. - if (VT == MVT::v64i8) - return split512IntArith(Op, DAG); - - // For 256-bit vectors, split into 128-bit vectors to allow the - // sign-extension to occur. We don't need this on AVX512BW as we can - // safely sign-extend to v32i16. - if (VT == MVT::v32i8 && !Subtarget.hasBWI()) - return split256IntArith(Op, DAG); + unsigned NumElts = VT.getVectorNumElements(); + if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || + (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); return DAG.getNode( ISD::TRUNCATE, dl, VT, DAG.getNode(ISD::MUL, dl, ExVT, - DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A), - DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B))); + DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A), + DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B))); } - assert(VT == MVT::v16i8 && - "Pre-AVX2 support only supports v16i8 multiplication"); - MVT ExVT = MVT::v8i16; + MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); - // Extract the lo parts and sign extend to i16 - // We're going to mask off the low byte of each result element of the - // pmullw, so it doesn't matter what's in the high byte of each 16-bit - // element. - const int LoShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1, - 4, -1, 5, -1, 6, -1, 7, -1}; - SDValue ALo = DAG.getVectorShuffle(VT, dl, A, A, LoShufMask); - SDValue BLo = DAG.getVectorShuffle(VT, dl, B, B, LoShufMask); - ALo = DAG.getBitcast(ExVT, ALo); - BLo = DAG.getBitcast(ExVT, BLo); - - // Extract the hi parts and sign extend to i16 + // Extract the lo/hi parts to any extend to i16. // We're going to mask off the low byte of each result element of the // pmullw, so it doesn't matter what's in the high byte of each 16-bit // element. - const int HiShufMask[] = {8, -1, 9, -1, 10, -1, 11, -1, - 12, -1, 13, -1, 14, -1, 15, -1}; - SDValue AHi = DAG.getVectorShuffle(VT, dl, A, A, HiShufMask); - SDValue BHi = DAG.getVectorShuffle(VT, dl, B, B, HiShufMask); - AHi = DAG.getBitcast(ExVT, AHi); - BHi = DAG.getBitcast(ExVT, BHi); - - // Multiply, mask the lower 8bits of the lo/hi results and pack + SDValue Undef = DAG.getUNDEF(VT); + SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef)); + SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef)); + + SDValue BLo, BHi; + if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { + // If the LHS is a constant, manually unpackl/unpackh. + SmallVector LoOps, HiOps; + for (unsigned i = 0; i != NumElts; i += 16) { + for (unsigned j = 0; j != 8; ++j) { + LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, + MVT::i16)); + HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, + MVT::i16)); + } + } + + BLo = DAG.getBuildVector(ExVT, dl, LoOps); + BHi = DAG.getBuildVector(ExVT, dl, HiOps); + } else { + BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef)); + BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef)); + } + + // Multiply, mask the lower 8bits of the lo/hi results and pack. SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); @@ -23262,7 +23606,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero); bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero); - SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); + SDValue Zero = DAG.getConstant(0, dl, VT); // Only multiply lo/hi halves that aren't known to be zero. SDValue AloBlo = Zero; @@ -23352,11 +23696,11 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // If we have a signed multiply but no PMULDQ fix up the result of an // unsigned multiply. if (IsSigned && !Subtarget.hasSSE41()) { - SDValue ShAmt = DAG.getConstant(31, dl, VT); + SDValue Zero = DAG.getConstant(0, dl, VT); SDValue T1 = DAG.getNode(ISD::AND, dl, VT, - DAG.getNode(ISD::SRA, dl, VT, A, ShAmt), B); + DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B); SDValue T2 = DAG.getNode(ISD::AND, dl, VT, - DAG.getNode(ISD::SRA, dl, VT, B, ShAmt), A); + DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A); SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup); @@ -23375,113 +23719,138 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack // and then ashr/lshr the upper bits down to the lower bits before multiply. - unsigned ExShift = IsSigned ? X86ISD::VSRAI : X86ISD::VSRLI; unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - // For 512-bit vectors, split into 256-bit vectors to allow the + if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || + (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { + MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts); + SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A); + SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B); + SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB); + Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); + } + + // For signed 512-bit vectors, split into 256-bit vectors to allow the // sign-extension to occur. - if (VT == MVT::v64i8) + if (VT == MVT::v64i8 && IsSigned) return split512IntArith(Op, DAG); - // AVX2 implementations - extend xmm subvectors to ymm. - if (Subtarget.hasInt256()) { + // Signed AVX2 implementation - extend xmm subvectors to ymm. + if (VT == MVT::v32i8 && IsSigned) { SDValue Lo = DAG.getIntPtrConstant(0, dl); SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl); - if (VT == MVT::v32i8) { - if (Subtarget.canExtendTo512BW()) { - MVT ExVT = MVT::v32i16; - SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A); - SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B); - SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB); - Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); + MVT ExVT = MVT::v16i16; + SDValue ALo = extract128BitVector(A, 0, DAG, dl); + SDValue BLo = extract128BitVector(B, 0, DAG, dl); + SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl); + SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl); + ALo = DAG.getNode(ExAVX, dl, ExVT, ALo); + BLo = DAG.getNode(ExAVX, dl, ExVT, BLo); + AHi = DAG.getNode(ExAVX, dl, ExVT, AHi); + BHi = DAG.getNode(ExAVX, dl, ExVT, BHi); + Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); + Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); + Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG); + Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG); + + // Bitcast back to VT and then pack all the even elements from Lo and Hi. + // Shuffle lowering should turn this into PACKUS+PERMQ + Lo = DAG.getBitcast(VT, Lo); + Hi = DAG.getBitcast(VT, Hi); + return DAG.getVectorShuffle(VT, dl, Lo, Hi, + { 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62}); + } + + // For signed v16i8 and all unsigned vXi8 we will unpack the low and high + // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies, + // shift the results and pack the half lane results back together. + + MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); + + static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1}; + + // Extract the lo parts and zero/sign extend to i16. + // Only use SSE4.1 instructions for signed v16i8 where using unpack requires + // shifts to sign extend. Using unpack for unsigned only requires an xor to + // create zeros and a copy due to tied registers contraints pre-avx. But using + // zero_extend_vector_inreg would require an additional pshufd for the high + // part. + + SDValue ALo, AHi; + if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) { + ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A); + + AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask); + AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi); + } else if (IsSigned) { + ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A)); + AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A)); + + ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG); + AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG); + } else { + ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, + DAG.getConstant(0, dl, VT))); + AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, + DAG.getConstant(0, dl, VT))); + } + + SDValue BLo, BHi; + if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { + // If the LHS is a constant, manually unpackl/unpackh and extend. + SmallVector LoOps, HiOps; + for (unsigned i = 0; i != NumElts; i += 16) { + for (unsigned j = 0; j != 8; ++j) { + SDValue LoOp = B.getOperand(i + j); + SDValue HiOp = B.getOperand(i + j + 8); + + if (IsSigned) { + LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16); + HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16); + } else { + LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16); + HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16); + } + + LoOps.push_back(LoOp); + HiOps.push_back(HiOp); } - MVT ExVT = MVT::v16i16; - SDValue ALo = extract128BitVector(A, 0, DAG, dl); - SDValue BLo = extract128BitVector(B, 0, DAG, dl); - SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl); - SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl); - ALo = DAG.getNode(ExAVX, dl, ExVT, ALo); - BLo = DAG.getNode(ExAVX, dl, ExVT, BLo); - AHi = DAG.getNode(ExAVX, dl, ExVT, AHi); - BHi = DAG.getNode(ExAVX, dl, ExVT, BHi); - Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); - Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); - Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG); - Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG); - - SDValue Res = DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); - // The ymm variant of PACKUS treats the 128-bit lanes separately, so we - // need to permute the final result into place. - Res = DAG.getBitcast(MVT::v4i64, Res); - Res = DAG.getVectorShuffle(MVT::v4i64, dl, Res, Res, { 0, 2, 1, 3 }); - return DAG.getBitcast(VT, Res); } - assert(VT == MVT::v16i8 && "Unexpected VT"); + BLo = DAG.getBuildVector(ExVT, dl, LoOps); + BHi = DAG.getBuildVector(ExVT, dl, HiOps); + } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) { + BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B); - SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A); - SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B); - SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); - Mul = - getTargetVShiftByConstNode(X86ISD::VSRLI, dl, MVT::v16i16, Mul, 8, DAG); - // If we have BWI we can use truncate instruction. - if (Subtarget.hasBWI()) - return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); - Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo); - Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi); - return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); - } + BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask); + BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi); + } else if (IsSigned) { + BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B)); + BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B)); - assert(VT == MVT::v16i8 && - "Pre-AVX2 support only supports v16i8 multiplication"); - MVT ExVT = MVT::v8i16; - unsigned ExSSE41 = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG; - - // Extract the lo parts and zero/sign extend to i16. - SDValue ALo, BLo; - if (Subtarget.hasSSE41()) { - ALo = DAG.getNode(ExSSE41, dl, ExVT, A); - BLo = DAG.getNode(ExSSE41, dl, ExVT, B); - } else { - const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, - -1, 4, -1, 5, -1, 6, -1, 7}; - ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); - BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - ALo = DAG.getBitcast(ExVT, ALo); - BLo = DAG.getBitcast(ExVT, BLo); - ALo = getTargetVShiftByConstNode(ExShift, dl, ExVT, ALo, 8, DAG); - BLo = getTargetVShiftByConstNode(ExShift, dl, ExVT, BLo, 8, DAG); - } - - // Extract the hi parts and zero/sign extend to i16. - SDValue AHi, BHi; - if (Subtarget.hasSSE41()) { - const int ShufMask[] = { 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1}; - AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); - BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi); - BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi); + BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG); + BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG); } else { - const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, - -1, 12, -1, 13, -1, 14, -1, 15}; - AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); - BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getBitcast(ExVT, AHi); - BHi = DAG.getBitcast(ExVT, BHi); - AHi = getTargetVShiftByConstNode(ExShift, dl, ExVT, AHi, 8, DAG); - BHi = getTargetVShiftByConstNode(ExShift, dl, ExVT, BHi, 8, DAG); + BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, + DAG.getConstant(0, dl, VT))); + BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, + DAG.getConstant(0, dl, VT))); } // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and - // pack back to v16i8. + // pack back to vXi8. SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG); RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG); + + // Bitcast back to VT and then pack all the even elements from Lo and Hi. return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } @@ -23605,8 +23974,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, if (ShiftAmt == 63 && Subtarget.hasSSE42()) { assert((VT != MVT::v4i64 || Subtarget.hasInt256()) && "Unsupported PCMPGT op"); - return DAG.getNode(X86ISD::PCMPGT, dl, VT, - getZeroVector(VT, Subtarget, DAG, dl), R); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R); } if (ShiftAmt >= 32) { @@ -23621,7 +23989,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {9, 1, 11, 3, 13, 5, 15, 7}); } else { - // SRA upper i32, SHL whole i64 and select lower i32. + // SRA upper i32, SRL whole i64 and select lower i32. SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, ShiftAmt, DAG); SDValue Lower = @@ -23662,7 +24030,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // ashr(R, 7) === cmp_slt(R, 0) if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { - SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + SDValue Zeros = DAG.getConstant(0, dl, VT); if (VT.is512BitVector()) { assert(VT == MVT::v64i8 && "Unexpected element type!"); SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT); @@ -23708,60 +24076,52 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, return SDValue(); } -// Determine if V is a splat value, and return the scalar. -static SDValue IsSplatValue(MVT VT, SDValue V, const SDLoc &dl, - SelectionDAG &DAG, const X86Subtarget &Subtarget, - unsigned Opcode) { - V = peekThroughEXTRACT_SUBVECTORs(V); +// If V is a splat value, return the source vector and splat index; +static SDValue IsSplatVector(SDValue V, int &SplatIdx, SelectionDAG &DAG) { + V = peekThroughEXTRACT_SUBVECTORs(V); - // Check if this is a splat build_vector node. - if (BuildVectorSDNode *BV = dyn_cast(V)) { - SDValue SplatAmt = BV->getSplatValue(); - if (SplatAmt && SplatAmt.isUndef()) - return SDValue(); - return SplatAmt; - } - - // Check for SUB(SPLAT_BV, SPLAT) cases from rotate patterns. - if (V.getOpcode() == ISD::SUB && - !SupportedVectorVarShift(VT, Subtarget, Opcode)) { - SDValue LHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(0)); - SDValue RHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(1)); - - // Ensure that the corresponding splat BV element is not UNDEF. - BitVector UndefElts; - BuildVectorSDNode *BV0 = dyn_cast(LHS); - ShuffleVectorSDNode *SVN1 = dyn_cast(RHS); - if (BV0 && SVN1 && BV0->getSplatValue(&UndefElts) && SVN1->isSplat()) { - unsigned SplatIdx = (unsigned)SVN1->getSplatIndex(); - if (!UndefElts[SplatIdx]) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - VT.getVectorElementType(), V, - DAG.getIntPtrConstant(SplatIdx, dl)); + EVT VT = V.getValueType(); + unsigned Opcode = V.getOpcode(); + switch (Opcode) { + default: { + APInt UndefElts; + APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); + if (DAG.isSplatValue(V, DemandedElts, UndefElts)) { + // Handle case where all demanded elements are UNDEF. + if (DemandedElts.isSubsetOf(UndefElts)) { + SplatIdx = 0; + return DAG.getUNDEF(VT); + } + SplatIdx = (UndefElts & DemandedElts).countTrailingOnes(); + return V; } + break; } - - // Check if this is a shuffle node doing a splat. - ShuffleVectorSDNode *SVN = dyn_cast(V); - if (!SVN || !SVN->isSplat()) - return SDValue(); - - unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); - SDValue InVec = V.getOperand(0); - if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - assert((SplatIdx < VT.getVectorNumElements()) && - "Unexpected shuffle index found!"); - return InVec.getOperand(SplatIdx); - } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { - if (ConstantSDNode *C = dyn_cast(InVec.getOperand(2))) - if (C->getZExtValue() == SplatIdx) - return InVec.getOperand(1); + case ISD::VECTOR_SHUFFLE: { + // Check if this is a shuffle node doing a splat. + // TODO - remove this and rely purely on SelectionDAG::isSplatValue, + // getTargetVShiftNode currently struggles without the splat source. + auto *SVN = cast(V); + if (!SVN->isSplat()) + break; + int Idx = SVN->getSplatIndex(); + int NumElts = V.getValueType().getVectorNumElements(); + SplatIdx = Idx % NumElts; + return V.getOperand(Idx / NumElts); } + } + + return SDValue(); +} - // Avoid introducing an extract element from a shuffle. - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - VT.getVectorElementType(), InVec, - DAG.getIntPtrConstant(SplatIdx, dl)); +static SDValue GetSplatValue(SDValue V, const SDLoc &dl, + SelectionDAG &DAG) { + int SplatIdx; + if (SDValue SrcVector = IsSplatVector(V, SplatIdx, DAG)) + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + SrcVector.getValueType().getScalarType(), SrcVector, + DAG.getIntPtrConstant(SplatIdx, dl)); + return SDValue(); } static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, @@ -23774,9 +24134,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false); unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true); - Amt = peekThroughEXTRACT_SUBVECTORs(Amt); - - if (SDValue BaseShAmt = IsSplatValue(VT, Amt, dl, DAG, Subtarget, Opcode)) { + if (SDValue BaseShAmt = GetSplatValue(Amt, dl, DAG)) { if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) { MVT EltVT = VT.getVectorElementType(); assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); @@ -23901,7 +24259,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, // AVX2 can more effectively perform this as a zext/trunc to/from v8i32. if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) { - SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); + SDValue Z = DAG.getConstant(0, dl, VT); SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z)); SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z)); Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG); @@ -24095,7 +24453,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, // just zero-extending, but for SSE just duplicating the top 16-bits is // cheaper and has the same effect for out of range values. if (Subtarget.hasAVX()) { - SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); + SDValue Z = DAG.getConstant(0, dl, VT); Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); @@ -24230,7 +24588,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. - SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); + SDValue Z = DAG.getConstant(0, dl, SelVT); SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); return DAG.getSelect(dl, SelVT, C, V0, V1); }; @@ -24267,10 +24625,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, // For SRA we need to unpack each byte to the higher byte of a i16 vector // so we can correctly sign extend. We don't care what happens to the // lower byte. - SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt); - SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt); - SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R); - SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R); + SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt); + SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt); + SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R); + SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R); ALo = DAG.getBitcast(ExtVT, ALo); AHi = DAG.getBitcast(ExtVT, AHi); RLo = DAG.getBitcast(ExtVT, RLo); @@ -24312,11 +24670,11 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; - SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); - SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); - SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z); - SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R); - SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R); + SDValue Z = DAG.getConstant(0, dl, VT); + SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z); + SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z); + SDValue RLo = getUnpackl(DAG, dl, VT, Z, R); + SDValue RHi = getUnpackh(DAG, dl, VT, Z, R); ALo = DAG.getBitcast(ExtVT, ALo); AHi = DAG.getBitcast(ExtVT, AHi); RLo = DAG.getBitcast(ExtVT, RLo); @@ -24408,20 +24766,31 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SDValue Amt = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); + int NumElts = VT.getVectorNumElements(); + + // Check for constant splat rotation amount. + APInt UndefElts; + SmallVector EltBits; + int CstSplatIndex = -1; + if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) + for (int i = 0; i != NumElts; ++i) + if (!UndefElts[i]) { + if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) { + CstSplatIndex = i; + continue; + } + CstSplatIndex = -1; + break; + } + // AVX512 implicitly uses modulo rotation amounts. if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) { // Attempt to rotate by immediate. - APInt UndefElts; - SmallVector EltBits; - if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) { - if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) { - return EltBits[0] == V; - })) { - unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); - uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits); - return DAG.getNode(Op, DL, VT, R, - DAG.getConstant(RotateAmt, DL, MVT::i8)); - } + if (0 <= CstSplatIndex) { + unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); + uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); + return DAG.getNode(Op, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); } // Else, fall-back on VPROLV/VPRORV. @@ -24432,19 +24801,17 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL. + // XOP implicitly uses modulo rotation amounts. if (Subtarget.hasXOP()) { if (VT.is256BitVector()) return split256IntArith(Op, DAG); assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); // Attempt to rotate by immediate. - if (auto *BVAmt = dyn_cast(Amt)) { - if (auto *RotateConst = BVAmt->getConstantSplatNode()) { - uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); - assert(RotateAmt < EltSizeInBits && "Rotation out of range"); - return DAG.getNode(X86ISD::VROTLI, DL, VT, R, - DAG.getConstant(RotateAmt, DL, MVT::i8)); - } + if (0 <= CstSplatIndex) { + uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); + return DAG.getNode(X86ISD::VROTLI, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); } // Use general rotate by variable (per-element). @@ -24461,44 +24828,19 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, "Only vXi32/vXi16/vXi8 vector rotates supported"); // Rotate by an uniform constant - expand back to shifts. - // TODO - legalizers should be able to handle this. - if (auto *BVAmt = dyn_cast(Amt)) { - if (auto *RotateConst = BVAmt->getConstantSplatNode()) { - uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); - assert(RotateAmt < EltSizeInBits && "Rotation out of range"); - if (RotateAmt == 0) - return R; - - SDValue AmtR = DAG.getConstant(EltSizeInBits - RotateAmt, DL, VT); - SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt); - SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR); - return DAG.getNode(ISD::OR, DL, VT, SHL, SRL); - } - } + if (0 <= CstSplatIndex) + return SDValue(); - // Rotate by splat - expand back to shifts. - // TODO - legalizers should be able to handle this. - if ((EltSizeInBits >= 16 || Subtarget.hasBWI()) && - IsSplatValue(VT, Amt, DL, DAG, Subtarget, Opcode)) { - SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT); - AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt); - SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt); - SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR); - return DAG.getNode(ISD::OR, DL, VT, SHL, SRL); - } + bool IsSplatAmt = DAG.isSplatValue(Amt); // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by // the amount bit. - if (EltSizeInBits == 8) { - if (Subtarget.hasBWI()) { - SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT); - AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt); - SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt); - SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR); - return DAG.getNode(ISD::OR, DL, VT, SHL, SRL); - } + if (EltSizeInBits == 8 && !IsSplatAmt) { + if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) + return SDValue(); - MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); + // We don't need ModuloAmt here as we just peek at individual bits. + MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { if (Subtarget.hasSSE41()) { @@ -24512,7 +24854,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. - SDValue Z = getZeroVector(SelVT, Subtarget, DAG, DL); + SDValue Z = DAG.getConstant(0, DL, SelVT); SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel); return DAG.getSelect(DL, SelVT, C, V0, V1); }; @@ -24553,14 +24895,17 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return SignBitSelect(VT, Amt, M, R); } + // ISD::ROT* uses modulo rotate amounts. + Amt = DAG.getNode(ISD::AND, DL, VT, Amt, + DAG.getConstant(EltSizeInBits - 1, DL, VT)); + bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) && SupportedVectorVarShift(VT, Subtarget, ISD::SRL); - // Best to fallback for all supported variable shifts. - // AVX2 - best to fallback for non-constants as well. - // TODO - legalizers should be able to handle this. - if (LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) { + // Fallback for splats + all supported variable shifts. + // Fallback for non-constants AVX2 vXi16 as well. + if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) { SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT); AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt); SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt); @@ -24938,40 +25283,32 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || SrcVT == MVT::i64) { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - if (DstVT != MVT::f64) + if (DstVT != MVT::f64 && DstVT != MVT::i64 && + !(DstVT == MVT::x86mmx && SrcVT.isVector())) // This conversion needs to be expanded. return SDValue(); - SmallVector Elts; SDLoc dl(Op); - unsigned NumElts; - MVT SVT; if (SrcVT.isVector()) { - NumElts = SrcVT.getVectorNumElements(); - SVT = SrcVT.getVectorElementType(); - // Widen the vector in input in the case of MVT::v2i32. // Example: from MVT::v2i32 to MVT::v4i32. - for (unsigned i = 0, e = NumElts; i != e; ++i) - Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src, - DAG.getIntPtrConstant(i, dl))); + MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(), + SrcVT.getVectorNumElements() * 2); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, + DAG.getUNDEF(SrcVT)); } else { assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && "Unexpected source type in LowerBITCAST"); - Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src, - DAG.getIntPtrConstant(0, dl))); - Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src, - DAG.getIntPtrConstant(1, dl))); - NumElts = 2; - SVT = MVT::i32; - } - // Explicitly mark the extra elements as Undef. - Elts.append(NumElts, DAG.getUNDEF(SVT)); - - EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); - SDValue BV = DAG.getBuildVector(NewVT, dl, Elts); - SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, + Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src); + } + + MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64; + Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src); + + if (DstVT == MVT::x86mmx) + return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src, DAG.getIntPtrConstant(0, dl)); } @@ -25014,7 +25351,7 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // PSADBW instruction horizontally add all bytes and leave the result in i64 // chunks, thus directly computes the pop count for v2i64 and v4i64. if (EltVT == MVT::i64) { - SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT); MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); return DAG.getBitcast(VT, V); @@ -25026,13 +25363,13 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // this is that it lines up the results of two PSADBW instructions to be // two v2i64 vectors which concatenated are the 4 population counts. We can // then use PACKUSWB to shrink and concatenate them into a v4i32 again. - SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL); + SDValue Zeros = DAG.getConstant(0, DL, VT); SDValue V32 = DAG.getBitcast(VT, V); - SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros); - SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros); + SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros); + SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros); // Do the horizontal sums into two v2i64s. - Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + Zeros = DAG.getConstant(0, DL, ByteVecVT); MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, Low), Zeros); @@ -25805,6 +26142,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SHL_PARTS: case ISD::SRA_PARTS: case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); + case ISD::FSHL: + case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); @@ -25872,6 +26211,10 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::ADD: case ISD::SUB: return LowerADD_SUB(Op, DAG); + case ISD::UADDSAT: + case ISD::SADDSAT: + case ISD::USUBSAT: + case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG); case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: @@ -25922,8 +26265,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, llvm_unreachable("Do not know how to custom type legalize this operation!"); case ISD::MUL: { EVT VT = N->getValueType(0); - assert(VT.isVector() && VT.getVectorNumElements() == 2 && "Unexpected VT"); - if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) { + assert(VT.isVector() && "Unexpected VT"); + if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger && + VT.getVectorNumElements() == 2) { // Promote to a pattern that will be turned into PMULUDQ. SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, N->getOperand(0)); @@ -25935,33 +26279,55 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, DAG.getConstant(0xffffffff, dl, MVT::v2i64)); SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v2i64, N0, N1); Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul)); + } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + VT.getVectorElementType() == MVT::i8) { + // Pre-promote these to vXi16 to avoid op legalization thinking all 16 + // elements are needed. + MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); + SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); + SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + unsigned NumConcats = 16 / VT.getVectorNumElements(); + SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); + ConcatOps[0] = Res; + Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); + Results.push_back(Res); } return; } - case X86ISD::ADDUS: - case X86ISD::SUBUS: + case ISD::UADDSAT: + case ISD::SADDSAT: + case ISD::USUBSAT: + case ISD::SSUBSAT: + case X86ISD::VPMADDWD: case X86ISD::AVG: { - // Legalize types for X86ISD::AVG/ADDUS/SUBUS by widening. + // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and + // X86ISD::AVG/VPMADDWD by widening. assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - auto InVT = N->getValueType(0); - assert(InVT.getSizeInBits() < 128); - assert(128 % InVT.getSizeInBits() == 0); + EVT VT = N->getValueType(0); + EVT InVT = N->getOperand(0).getValueType(); + assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && + "Expected a VT that divides into 128 bits."); unsigned NumConcat = 128 / InVT.getSizeInBits(); - EVT RegVT = EVT::getVectorVT(*DAG.getContext(), - InVT.getVectorElementType(), - NumConcat * InVT.getVectorNumElements()); + EVT InWideVT = EVT::getVectorVT(*DAG.getContext(), + InVT.getVectorElementType(), + NumConcat * InVT.getVectorNumElements()); + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), + VT.getVectorElementType(), + NumConcat * VT.getVectorNumElements()); SmallVector Ops(NumConcat, DAG.getUNDEF(InVT)); Ops[0] = N->getOperand(0); - SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); Ops[0] = N->getOperand(1); - SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); - SDValue Res = DAG.getNode(N->getOpcode(), dl, RegVT, InVec0, InVec1); - if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, + SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; @@ -26004,25 +26370,155 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SDIV: case ISD::UDIV: case ISD::SREM: - case ISD::UREM: - if (N->getValueType(0) == MVT::v2i32) { - // If we're already legalizing via widening, we don't need this since - // that will scalarize div/rem. - if (getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector) - return; + case ISD::UREM: { + EVT VT = N->getValueType(0); + if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) { + // If this RHS is a constant splat vector we can widen this and let + // division/remainder by constant optimize it. + // TODO: Can we do something for non-splat? + APInt SplatVal; + if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) { + unsigned NumConcats = 128 / VT.getSizeInBits(); + SmallVector Ops0(NumConcats, DAG.getUNDEF(VT)); + Ops0[0] = N->getOperand(0); + EVT ResVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0); + SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT); + SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1); + Results.push_back(Res); + } + return; + } + + if (VT == MVT::v2i32) { // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the // v2i64 and unroll later. But then we create i64 scalar ops which // might be slow in 64-bit mode or require a libcall in 32-bit mode. Results.push_back(DAG.UnrollVectorOp(N)); return; } + + if (VT.isVector()) + return; + LLVM_FALLTHROUGH; + } case ISD::SDIVREM: case ISD::UDIVREM: { SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); Results.push_back(V); return; } + case ISD::TRUNCATE: { + MVT VT = N->getSimpleValueType(0); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + return; + + // The generic legalizer will try to widen the input type to the same + // number of elements as the widened result type. But this isn't always + // the best thing so do some custom legalization to avoid some cases. + MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT(); + SDValue In = N->getOperand(0); + EVT InVT = In.getValueType(); + + unsigned InBits = InVT.getSizeInBits(); + if (128 % InBits == 0) { + // 128 bit and smaller inputs should avoid truncate all together and + // just use a build_vector that will become a shuffle. + // TODO: Widen and use a shuffle directly? + MVT InEltVT = InVT.getSimpleVT().getVectorElementType(); + EVT EltVT = VT.getVectorElementType(); + unsigned WidenNumElts = WidenVT.getVectorNumElements(); + SmallVector Ops(WidenNumElts, DAG.getUNDEF(EltVT)); + // Use the original element count so we don't do more scalar opts than + // necessary. + unsigned MinElts = VT.getVectorNumElements(); + for (unsigned i=0; i < MinElts; ++i) { + SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In, + DAG.getIntPtrConstant(i, dl)); + Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val); + } + Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops)); + return; + } + // With AVX512 there are some cases that can use a target specific + // truncate node to go from 256/512 to less than 128 with zeros in the + // upper elements of the 128 bit result. + if (Subtarget.hasAVX512() && isTypeLegal(InVT)) { + // We can use VTRUNC directly if for 256 bits with VLX or for any 512. + if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) { + Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In)); + return; + } + // There's one case we can widen to 512 bits and use VTRUNC. + if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) { + In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In, + DAG.getUNDEF(MVT::v4i64)); + Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In)); + return; + } + } + return; + } + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: { + if (!ExperimentalVectorWideningLegalization) + return; + + EVT VT = N->getValueType(0); + SDValue In = N->getOperand(0); + EVT InVT = In.getValueType(); + if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && + (InVT == MVT::v4i16 || InVT == MVT::v4i8)) { + // Custom split this so we can extend i8/i16->i32 invec. This is better + // since sign_extend_inreg i8/i16->i64 requires two sra operations. So + // this allows the first to be shared. + In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In); + + // Fill a vector with sign bits for each element. + SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32); + SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT); + + // Create an unpackl and unpackh to interleave the sign bits then bitcast + // to v2i64. + SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, + {0, 4, 1, 5}); + Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo); + SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, + {2, 6, 3, 7}); + Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi); + + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Results.push_back(Res); + return; + } + + if ((VT == MVT::v16i32 || VT == MVT::v8i64) && InVT.is128BitVector()) { + // Perform custom splitting instead of the two stage extend we would get + // by default. + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + assert(isTypeLegal(LoVT) && "Split VT not legal?"); + + bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND; + + SDValue Lo = getExtendInVec(IsSigned, dl, LoVT, In, DAG); + + // We need to shift the input over by half the number of elements. + unsigned NumElts = InVT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + SmallVector ShufMask(NumElts, SM_SentinelUndef); + for (unsigned i = 0; i != HalfNumElts; ++i) + ShufMask[i] = i + HalfNumElts; + + SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); + Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG); + + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Results.push_back(Res); + } + return; + } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; @@ -26033,7 +26529,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // Promote these manually to avoid over promotion to v2i64. Type // legalization will revisit the v2i32 operation for more cleanup. if ((VT == MVT::v2i8 || VT == MVT::v2i16) && - getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { + getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) { // AVX512DQ provides instructions that produce a v2i64 result. if (Subtarget.hasDQI()) return; @@ -26048,6 +26544,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } + if (VT.isVector() && VT.getScalarSizeInBits() < 32) { + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + return; + + // Try to create a 128 bit vector, but don't exceed a 32 bit element. + unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); + MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth), + VT.getVectorNumElements()); + SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); + + // Preserve what we know about the size of the original result. Except + // when the result is v2i32 since we can't widen the assert. + if (PromoteVT != MVT::v2i32) + Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext + : ISD::AssertSext, + dl, PromoteVT, Res, + DAG.getValueType(VT.getVectorElementType())); + + // Truncate back to the original width. + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + + // Now widen to 128 bits. + unsigned NumConcats = 128 / VT.getSizeInBits(); + MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(), + VT.getVectorNumElements() * NumConcats); + SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); + ConcatOps[0] = Res; + Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps); + Results.push_back(Res); + return; + } + + if (VT == MVT::v2i32) { assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); @@ -26400,44 +26929,53 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Chain); return; } - EVT IndexVT = Index.getValueType(); - EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), - IndexVT.getScalarType(), 4); - // Otherwise we need to custom widen everything to avoid promotion. - Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, - DAG.getUNDEF(IndexVT)); - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getConstant(0, dl, MVT::v2i1)); - SDValue Ops[] = { Gather->getChain(), PassThru, Mask, - Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), - Gather->getMemoryVT(), dl, Ops, - Gather->getMemOperand()); - SDValue Chain = Res.getValue(1); - if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - Results.push_back(Chain); - return; + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { + EVT IndexVT = Index.getValueType(); + EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), + IndexVT.getScalarType(), 4); + // Otherwise we need to custom widen everything to avoid promotion. + Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, + DAG.getUNDEF(IndexVT)); + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getConstant(0, dl, MVT::v2i1)); + SDValue Ops[] = { Gather->getChain(), PassThru, Mask, + Gather->getBasePtr(), Index, Gather->getScale() }; + SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), + Gather->getMemoryVT(), dl, Ops, + Gather->getMemOperand()); + SDValue Chain = Res.getValue(1); + if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Chain); + return; + } } - break; + return; } case ISD::LOAD: { - // Use an f64 load and a scalar_to_vector for v2f32 loads. This avoids - // scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp cast - // since type legalization will try to use an i64 load. - assert(N->getValueType(0) == MVT::v2f32 && "Unexpected VT"); + // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This + // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp + // cast since type legalization will try to use an i64 load. + MVT VT = N->getSimpleValueType(0); + assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT"); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + return; if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast(N); - SDValue Res = DAG.getLoad(MVT::f64, dl, Ld->getChain(), Ld->getBasePtr(), + MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64; + SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Res); - Res = DAG.getBitcast(MVT::v4f32, Res); + MVT WideVT = MVT::getVectorVT(LdVT, 2); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); + MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() * 2); + Res = DAG.getBitcast(CastVT, Res); Results.push_back(Res); Results.push_back(Chain); return; @@ -26499,8 +27037,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; - case X86ISD::ADDUS: return "X86ISD::ADDUS"; - case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; @@ -26547,8 +27083,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::LDEC: return "X86ISD::LDEC"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; - case X86ISD::VZEXT: return "X86ISD::VZEXT"; - case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; @@ -26716,8 +27250,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND"; case X86ISD::SCALEF: return "X86ISD::SCALEF"; case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; - case X86ISD::ADDS: return "X86ISD::ADDS"; - case X86ISD::SUBS: return "X86ISD::SUBS"; case X86ISD::AVG: return "X86ISD::AVG"; case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; @@ -29493,34 +30025,29 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } case X86ISD::PACKUS: { // PACKUS is just a truncation if the upper half is zero. - // TODO: Add DemandedElts support. + APInt DemandedLHS, DemandedRHS; + getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); + + Known.One = APInt::getAllOnesValue(BitWidth * 2); + Known.Zero = APInt::getAllOnesValue(BitWidth * 2); + KnownBits Known2; - DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1); - DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1); - Known.One &= Known2.One; - Known.Zero &= Known2.Zero; + if (!!DemandedLHS) { + DAG.computeKnownBits(Op.getOperand(0), Known2, DemandedLHS, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; + } + if (!!DemandedRHS) { + DAG.computeKnownBits(Op.getOperand(1), Known2, DemandedRHS, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; + } + if (Known.countMinLeadingZeros() < BitWidth) Known.resetAll(); Known = Known.trunc(BitWidth); break; } - case X86ISD::VZEXT: { - // TODO: Add DemandedElts support. - SDValue N0 = Op.getOperand(0); - unsigned NumElts = VT.getVectorNumElements(); - - EVT SrcVT = N0.getValueType(); - unsigned InNumElts = SrcVT.getVectorNumElements(); - unsigned InBitWidth = SrcVT.getScalarSizeInBits(); - assert(InNumElts >= NumElts && "Illegal VZEXT input"); - - Known = KnownBits(InBitWidth); - APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts); - DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1); - Known = Known.zext(BitWidth); - Known.Zero.setBitsFrom(InBitWidth); - break; - } case X86ISD::CMOV: { DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1); // If we don't know any bits, early out. @@ -29598,14 +30125,6 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( // SETCC_CARRY sets the dest to ~0 for true or 0 for false. return VTBits; - case X86ISD::VSEXT: { - // TODO: Add DemandedElts support. - SDValue Src = Op.getOperand(0); - unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); - Tmp += VTBits - Src.getScalarValueSizeInBits(); - return Tmp; - } - case X86ISD::VTRUNC: { // TODO: Add DemandedElts support. SDValue Src = Op.getOperand(0); @@ -29619,10 +30138,16 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( case X86ISD::PACKSS: { // PACKSS is just a truncation if the sign bits extend to the packed size. - // TODO: Add DemandedElts support. + APInt DemandedLHS, DemandedRHS; + getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS, + DemandedRHS); + unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits(); - unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); - unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1); + unsigned Tmp0 = SrcBits, Tmp1 = SrcBits; + if (!!DemandedLHS) + Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1); + if (!!DemandedRHS) + Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1); unsigned Tmp = std::min(Tmp0, Tmp1); if (Tmp > (SrcBits - VTBits)) return Tmp - (SrcBits - VTBits); @@ -29676,21 +30201,6 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const { return N; } -/// Returns true (and the GlobalValue and the offset) if the node is a -/// GlobalAddress + offset. -bool X86TargetLowering::isGAPlusOffset(SDNode *N, - const GlobalValue* &GA, - int64_t &Offset) const { - if (N->getOpcode() == X86ISD::Wrapper) { - if (isa(N->getOperand(0))) { - GA = cast(N->getOperand(0))->getGlobal(); - Offset = cast(N->getOperand(0))->getOffset(); - return true; - } - } - return TargetLowering::isGAPlusOffset(N, GA, Offset); -} - // Attempt to match a combined shuffle mask against supported unary shuffle // instructions. // TODO: Investigate sharing more of this with shuffle lowering. @@ -29729,10 +30239,12 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, MVT::getIntegerVT(MaskEltSize); SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize); - if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) { + if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); - Shuffle = unsigned(X86ISD::VZEXT); - } else + + if (SrcVT.getVectorNumElements() == NumDstElts) + Shuffle = unsigned(ISD::ZERO_EXTEND); + else Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); @@ -30756,7 +31268,10 @@ static SDValue combineX86ShufflesRecursively( if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) return SDValue(); - assert(OpInputs.size() <= 2 && "Too many shuffle inputs"); + // TODO - Add support for more than 2 inputs. + if (2 < OpInputs.size()) + return SDValue(); + SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue()); SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue()); @@ -31773,11 +32288,69 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // Handle special case opcodes. switch (Opc) { + case X86ISD::VSHL: + case X86ISD::VSRL: + case X86ISD::VSRA: { + // We only need the bottom 64-bits of the (128-bit) shift amount. + SDValue Amt = Op.getOperand(1); + MVT AmtVT = Amt.getSimpleValueType(); + assert(AmtVT.is128BitVector() && "Unexpected value type"); + APInt AmtUndef, AmtZero; + unsigned NumAmtElts = AmtVT.getVectorNumElements(); + APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2); + if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO, + Depth + 1)) + return true; + LLVM_FALLTHROUGH; + } + case X86ISD::VSHLI: + case X86ISD::VSRLI: + case X86ISD::VSRAI: { + SDValue Src = Op.getOperand(0); + APInt SrcUndef; + if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO, + Depth + 1)) + return true; + // TODO convert SrcUndef to KnownUndef. + break; + } + case X86ISD::CVTSI2P: + case X86ISD::CVTUI2P: { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + APInt SrcUndef, SrcZero; + APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); + if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, + Depth + 1)) + return true; + break; + } + case X86ISD::PACKSS: + case X86ISD::PACKUS: { + APInt DemandedLHS, DemandedRHS; + getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); + + APInt SrcUndef, SrcZero; + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef, + SrcZero, TLO, Depth + 1)) + return true; + if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef, + SrcZero, TLO, Depth + 1)) + return true; + break; + } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); if (!SrcVT.isVector()) return false; + // Don't bother broadcasting if we just need the 0'th element. + if (DemandedElts == 1) { + if(Src.getValueType() != VT) + Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG, + SDLoc(Op)); + return TLO.CombineTo(Op, Src); + } APInt SrcUndef, SrcZero; APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0); if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, @@ -31797,13 +32370,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } // Simplify target shuffles. - if (!isTargetShuffle(Opc)) + if (!isTargetShuffle(Opc) || !VT.isSimple()) return false; // Get target shuffle mask. + bool IsUnary; SmallVector OpMask; SmallVector OpInputs; - if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, TLO.DAG)) + if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs, + OpMask, IsUnary)) return false; // Shuffle inputs must be the same type as the result. @@ -31811,11 +32386,19 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( [VT](SDValue V) { return VT != V.getValueType(); })) return false; + // Clear known elts that might have been set above. + KnownZero.clearAllBits(); + KnownUndef.clearAllBits(); + // Check if shuffle mask can be simplified to undef/zero/identity. int NumSrcs = OpInputs.size(); - for (int i = 0; i != NumElts; ++i) + for (int i = 0; i != NumElts; ++i) { + int &M = OpMask[i]; if (!DemandedElts[i]) - OpMask[i] = SM_SentinelUndef; + M = SM_SentinelUndef; + else if (0 <= M && OpInputs[M / NumElts].isUndef()) + M = SM_SentinelUndef; + } if (isUndefInRange(OpMask, 0, NumElts)) { KnownUndef.setAllBits(); @@ -31860,8 +32443,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( - SDValue Op, const APInt &OriginalDemandedBits, KnownBits &Known, - TargetLoweringOpt &TLO, unsigned Depth) const { + SDValue Op, const APInt &OriginalDemandedBits, + const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, + unsigned Depth) const { + EVT VT = Op.getValueType(); unsigned BitWidth = OriginalDemandedBits.getBitWidth(); unsigned Opc = Op.getOpcode(); switch(Opc) { @@ -31884,41 +32469,129 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( if (ShiftImm->getAPIntValue().uge(BitWidth)) break; - KnownBits KnownOp; unsigned ShAmt = ShiftImm->getZExtValue(); APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt); - if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO, - Depth + 1)) + + if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, + OriginalDemandedElts, Known, TLO, Depth + 1)) return true; + + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero <<= ShAmt; + Known.One <<= ShAmt; + + // Low bits known zero. + Known.Zero.setLowBits(ShAmt); } break; } - case X86ISD::VSRAI: case X86ISD::VSRLI: { if (auto *ShiftImm = dyn_cast(Op.getOperand(1))) { if (ShiftImm->getAPIntValue().uge(BitWidth)) break; - KnownBits KnownOp; unsigned ShAmt = ShiftImm->getZExtValue(); APInt DemandedMask = OriginalDemandedBits << ShAmt; + if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, + OriginalDemandedElts, Known, TLO, Depth + 1)) + return true; + + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); + + // High bits known zero. + Known.Zero.setHighBits(ShAmt); + } + break; + } + case X86ISD::VSRAI: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + if (auto *ShiftImm = dyn_cast(Op1)) { + if (ShiftImm->getAPIntValue().uge(BitWidth)) + break; + + unsigned ShAmt = ShiftImm->getZExtValue(); + APInt DemandedMask = OriginalDemandedBits << ShAmt; + + // If we just want the sign bit then we don't need to shift it. + if (OriginalDemandedBits.isSignMask()) + return TLO.CombineTo(Op, Op0); + + // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 + if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) { + SDValue Op00 = Op0.getOperand(0); + unsigned NumSignBits = + TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts); + if (ShAmt < NumSignBits) + return TLO.CombineTo(Op, Op00); + } + // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. - if (Opc == X86ISD::VSRAI && - OriginalDemandedBits.countLeadingZeros() < ShAmt) + if (OriginalDemandedBits.countLeadingZeros() < ShAmt) DemandedMask.setSignBit(); - if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO, - Depth + 1)) + if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, + TLO, Depth + 1)) return true; + + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); + + // If the input sign bit is known to be zero, or if none of the top bits + // are demanded, turn this into an unsigned shift right. + if (Known.Zero[BitWidth - ShAmt - 1] || + OriginalDemandedBits.countLeadingZeros() >= ShAmt) + return TLO.CombineTo( + Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1)); + + // High bits are known one. + if (Known.One[BitWidth - ShAmt - 1]) + Known.One.setHighBits(ShAmt); } break; } + case X86ISD::MOVMSK: { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + unsigned SrcBits = SrcVT.getScalarSizeInBits(); + unsigned NumElts = SrcVT.getVectorNumElements(); + + // If we don't need the sign bits at all just return zero. + if (OriginalDemandedBits.countTrailingZeros() >= NumElts) + return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); + + // Only demand the vector elements of the sign bits we need. + APInt KnownUndef, KnownZero; + APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts); + if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero, + TLO, Depth + 1)) + return true; + + Known.Zero = KnownZero.zextOrSelf(BitWidth); + Known.Zero.setHighBits(BitWidth - NumElts); + + // MOVMSK only uses the MSB from each vector element. + KnownBits KnownSrc; + if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts, + KnownSrc, TLO, Depth + 1)) + return true; + + if (KnownSrc.One[SrcBits - 1]) + Known.One.setLowBits(NumElts); + else if (KnownSrc.Zero[SrcBits - 1]) + Known.Zero.setLowBits(NumElts); + return false; + } } return TargetLowering::SimplifyDemandedBitsForTargetNode( - Op, OriginalDemandedBits, Known, TLO, Depth); + Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } /// Check if a vector extract from a target-specific shuffle of a load can be @@ -31972,9 +32645,13 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (Idx == SM_SentinelUndef) return DAG.getUNDEF(EltVT); + // Bail if any mask element is SM_SentinelZero - getVectorShuffle below + // won't handle it. + if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; })) + return SDValue(); + assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); - SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] - : ShuffleOps[1]; + SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1]; // If inputs to shuffle are the same for both ops, then allow 2 uses unsigned AllowedUses = @@ -32584,7 +33261,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, // ready for the PHMINPOS. if (ExtractVT == MVT::i8) { SDValue Upper = DAG.getVectorShuffle( - SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL), + SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8), {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16}); MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper); } @@ -33165,9 +33842,14 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue Cond = N->getOperand(0); - // Get the LHS/RHS of the select. SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); + + // Try simplification again because we use this function to optimize + // SHRUNKBLEND nodes that are not handled by the generic combiner. + if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS)) + return V; + EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -33362,7 +34044,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Since SKX these selects have a proper lowering. if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1 && - VT.getVectorNumElements() > 4 && + (ExperimentalVectorWideningLegalization || + VT.getVectorNumElements() > 4) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); @@ -33427,33 +34110,27 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); SDValue CondRHS = Cond->getOperand(1); - auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef Ops) { - return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops); - }; - // Look for a general sub with unsigned saturation first. // x >= y ? x-y : 0 --> subus x, y // x > y ? x-y : 0 --> subus x, y if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && Other->getOpcode() == ISD::SUB && OpRHS == CondRHS) - return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS }, - SUBUSBuilder); + return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); if (auto *OpRHSBV = dyn_cast(OpRHS)) { if (isa(CondRHS)) { // If the RHS is a constant we have to reverse the const // canonicalization. // x > C-1 ? x+-C : 0 --> subus x, C - auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) { + // TODO: Handle build_vectors with undef elements. + auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1); }; if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && - ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) { + ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT)) { OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), OpRHS); - return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS }, - SUBUSBuilder); + return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); } // Another special case: If C was a sign bit, the sub has been @@ -33465,11 +34142,10 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR && ISD::isBuildVectorAllZeros(CondRHS.getNode()) && OpRHSConst->getAPIntValue().isSignMask()) { - OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT); // Note that we have to rebuild the RHS constant here to ensure we // don't rely on particular values of undef lanes. - return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS }, - SUBUSBuilder); + OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT); + return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); } } } @@ -33502,11 +34178,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, if (Other.getNode() && Other.getOpcode() == ISD::ADD) { SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1); - auto ADDUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef Ops) { - return DAG.getNode(X86ISD::ADDUS, DL, Ops[0].getValueType(), Ops); - }; - // Canonicalize condition operands. if (CC == ISD::SETUGE) { std::swap(CondLHS, CondRHS); @@ -33518,21 +34189,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // x+y >= x ? x+y : ~0 --> addus x, y if (CC == ISD::SETULE && Other == CondRHS && (OpLHS == CondLHS || OpRHS == CondLHS)) - return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS }, - ADDUSBuilder); + return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS); if (isa(OpRHS) && isa(CondRHS) && CondLHS == OpLHS) { // If the RHS is a constant we have to reverse the const // canonicalization. // x > ~C ? x+C : ~0 --> addus x, C - auto MatchADDUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) { + auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { return Cond->getAPIntValue() == ~Op->getAPIntValue(); }; if (CC == ISD::SETULE && - ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchADDUS)) - return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS }, - ADDUSBuilder); + ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT)) + return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS); } } } @@ -34106,24 +34775,8 @@ static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { for (unsigned i = 0; i < 2; i++) { SDValue Opd = N->getOperand(i); - // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to - // compute signbits for it separately. - if (Opd.getOpcode() == ISD::ANY_EXTEND) { - // For anyextend, it is safe to assume an appropriate number of leading - // sign/zero bits. - if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8) - SignBits[i] = 25; - else if (Opd.getOperand(0).getValueType().getVectorElementType() == - MVT::i16) - SignBits[i] = 17; - else - return false; - IsPositive[i] = true; - } else { - SignBits[i] = DAG.ComputeNumSignBits(Opd); - if (DAG.SignBitIsZero(Opd)) - IsPositive[i] = true; - } + SignBits[i] = DAG.ComputeNumSignBits(Opd); + IsPositive[i] = DAG.SignBitIsZero(Opd); } bool AllPositive = IsPositive[0] && IsPositive[1]; @@ -34208,7 +34861,8 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); - if (NumElts >= OpsVT.getVectorNumElements()) { + if (ExperimentalVectorWideningLegalization || + NumElts >= OpsVT.getVectorNumElements()) { // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the // lower part is needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); @@ -34397,12 +35051,24 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, return SDValue(); // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case. + // Also allow v2i32 if it will be widened. MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); - if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT)) + if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) || + DAG.getTargetLoweringInfo().isTypeLegal(WVT))) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + + // If we are zero extending two steps without SSE4.1, its better to reduce + // the vmul width instead. + if (!Subtarget.hasSSE41() && + (N0.getOpcode() == ISD::ZERO_EXTEND && + N0.getOperand(0).getScalarValueSizeInBits() <= 8) && + (N1.getOpcode() == ISD::ZERO_EXTEND && + N1.getOperand(0).getScalarValueSizeInBits() <= 8)) + return SDValue(); + APInt Mask17 = APInt::getHighBitsSet(32, 17); if (!DAG.MaskedValueIsZero(N1, Mask17) || !DAG.MaskedValueIsZero(N0, Mask17)) @@ -34819,6 +35485,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, N1.getScalarValueSizeInBits() == SrcBitsPerElt && "Unexpected PACKSS/PACKUS input type"); + bool IsSigned = (X86ISD::PACKSS == Opcode); + // Constant Folding. APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; @@ -34831,7 +35499,6 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, unsigned NumSrcElts = NumDstElts / 2; unsigned NumDstEltsPerLane = NumDstElts / NumLanes; unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; - bool IsSigned = (X86ISD::PACKSS == Opcode); APInt Undefs(NumDstElts, 0); SmallVector Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt)); @@ -34875,6 +35542,25 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N)); } + // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular + // truncate to create a larger truncate. + if (Subtarget.hasAVX512() && + N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 && + N0.getOperand(0).getValueType() == MVT::v8i32) { + if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) || + (!IsSigned && + DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) { + if (Subtarget.hasVLX()) + return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0)); + + // Widen input to v16i32 so we can truncate that. + SDLoc dl(N); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32, + N0.getOperand(0), DAG.getUNDEF(MVT::v8i32)); + return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat); + } + } + // Attempt to combine as shuffle. SDValue Op(N, 0); if (SDValue Res = @@ -34886,6 +35572,28 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() || + X86ISD::VSRL == N->getOpcode()) && + "Unexpected shift opcode"); + EVT VT = N->getValueType(0); + + // Shift zero -> zero. + if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + return DAG.getConstant(0, SDLoc(N), VT); + + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, + KnownZero, DCI)) + return SDValue(N, 0); + + return SDValue(); +} + static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -34900,13 +35608,14 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, unsigned NumBitsPerElt = VT.getScalarSizeInBits(); assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && "Unexpected value type"); + assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type"); // Out of range logical bit shifts are guaranteed to be zero. // Out of range arithmetic bit shifts splat the sign bit. - APInt ShiftVal = cast(N1)->getAPIntValue(); - if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) { + unsigned ShiftVal = cast(N1)->getZExtValue(); + if (ShiftVal >= NumBitsPerElt) { if (LogicalShift) - return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + return DAG.getConstant(0, SDLoc(N), VT); else ShiftVal = NumBitsPerElt - 1; } @@ -34917,26 +35626,21 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, // Shift zero -> zero. if (ISD::isBuildVectorAllZeros(N0.getNode())) - return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); - - // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31). - // This VSRLI only looks at the sign bit, which is unmodified by VSRAI. - // TODO - support other sra opcodes as needed. - if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt && - N0.getOpcode() == X86ISD::VSRAI) - return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1); - - // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 - if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI && - N1 == N0.getOperand(1)) { - SDValue N00 = N0.getOperand(0); - unsigned NumSignBits = DAG.ComputeNumSignBits(N00); - if (ShiftVal.ult(NumSignBits)) - return N00; + return DAG.getConstant(0, SDLoc(N), VT); + + // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2) + // clamped to (NumBitsPerElt - 1). + if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) { + unsigned ShiftVal2 = cast(N0.getOperand(1))->getZExtValue(); + unsigned NewShiftVal = ShiftVal + ShiftVal2; + if (NewShiftVal >= NumBitsPerElt) + NewShiftVal = NumBitsPerElt - 1; + return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0), + DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8)); } // We can decode 'whole byte' logical bit shifts as shuffles. - if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) { + if (LogicalShift && (ShiftVal % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, @@ -34951,14 +35655,13 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) { assert(EltBits.size() == VT.getVectorNumElements() && "Unexpected shift value type"); - unsigned ShiftImm = ShiftVal.getZExtValue(); for (APInt &Elt : EltBits) { if (X86ISD::VSHLI == Opcode) - Elt <<= ShiftImm; + Elt <<= ShiftVal; else if (X86ISD::VSRAI == Opcode) - Elt.ashrInPlace(ShiftImm); + Elt.ashrInPlace(ShiftVal); else - Elt.lshrInPlace(ShiftImm); + Elt.lshrInPlace(ShiftVal); } return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); } @@ -35130,8 +35833,8 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { // some of the transition sequences. // Even with AVX-512 this is still useful for removing casts around logical // operations on vXi1 mask types. -static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); assert(VT.isVector() && "Expected vector type"); @@ -35871,10 +36574,10 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C ) // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C ) // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C ) - unsigned Bits = VT.getSizeInBits(); + unsigned Bits = VT.getScalarSizeInBits(); if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); - if (ConstantSDNode *SumC = dyn_cast(Sum)) { + if (auto *SumC = dyn_cast(Sum)) { SDValue ShAmt1Op1 = ShAmt1.getOperand(1); if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) ShAmt1Op1 = ShAmt1Op1.getOperand(0); @@ -35884,8 +36587,8 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } - } else if (ConstantSDNode *ShAmt1C = dyn_cast(ShAmt1)) { - ConstantSDNode *ShAmt0C = dyn_cast(ShAmt0); + } else if (auto *ShAmt1C = dyn_cast(ShAmt1)) { + auto *ShAmt0C = dyn_cast(ShAmt0); if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) return DAG.getNode(Opc, DL, VT, N0.getOperand(0), N1.getOperand(0), @@ -35893,7 +36596,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, MVT::i8, ShAmt0)); } else if (ShAmt1.getOpcode() == ISD::XOR) { SDValue Mask = ShAmt1.getOperand(1); - if (ConstantSDNode *MaskC = dyn_cast(Mask)) { + if (auto *MaskC = dyn_cast(Mask)) { unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL); SDValue ShAmt1Op0 = ShAmt1.getOperand(0); if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE) @@ -36592,7 +37295,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, Mld->getBasePtr(), NewMask, WidePassThru, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); - SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG); + SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); } @@ -36740,6 +37443,18 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // Convert a store of vXi1 into a store of iX and a bitcast. + if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() && + VT.getVectorElementType() == MVT::i1) { + + EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements()); + StoredVal = DAG.getBitcast(NewVT, StoredVal); + + return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); + } + // If this is a store of a scalar_to_vector to v1i1, just use a scalar store. // This will avoid a copy to k-register. if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() && @@ -37078,7 +37793,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { if (!LHS.getOperand(1).isUndef()) B = LHS.getOperand(1); ArrayRef Mask = cast(LHS.getNode())->getMask(); - std::copy(Mask.begin(), Mask.end(), LMask.begin()); + llvm::copy(Mask, LMask.begin()); } else { A = LHS; for (unsigned i = 0; i != NumElts; ++i) @@ -37095,7 +37810,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { if (!RHS.getOperand(1).isUndef()) D = RHS.getOperand(1); ArrayRef Mask = cast(RHS.getNode())->getMask(); - std::copy(Mask.begin(), Mask.end(), RMask.begin()); + llvm::copy(Mask, RMask.begin()); } else { C = RHS; for (unsigned i = 0; i != NumElts; ++i) @@ -37184,6 +37899,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify /// the codegen. /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) +/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove +/// anything that is guaranteed to be transformed by DAGCombiner. static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { @@ -37434,9 +38151,11 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, if (!Subtarget.hasSSE2()) return SDValue(); - // Only handle vXi16 types that are at least 128-bits. + // Only handle vXi16 types that are at least 128-bits unless they will be + // widened. if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 || - VT.getVectorNumElements() < 8) + (!ExperimentalVectorWideningLegalization && + VT.getVectorNumElements() < 8)) return SDValue(); // Input type should be vXi32. @@ -38059,6 +38778,20 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax); } +static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + APInt KnownUndef, KnownZero; + APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, + KnownZero, DCI)) + return SDValue(N, 0); + + return SDValue(); +} + /// Do target-specific dag combines on X86ISD::ANDNP nodes. static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -38071,7 +38804,7 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, // ANDNP(x, 0) -> 0 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode())) - return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); + return DAG.getConstant(0, SDLoc(N), VT); // Turn ANDNP back to AND if input is inverted. if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR && @@ -38401,6 +39134,9 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + if (ExperimentalVectorWideningLegalization) + return SDValue(); + unsigned Opcode = N->getOpcode(); if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) return SDValue(); @@ -38415,6 +39151,22 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, EVT InVT = N0.getValueType(); EVT InSVT = InVT.getScalarType(); + // FIXME: Generic DAGCombiner previously had a bug that would cause a + // sign_extend of setcc to sometimes return the original node and tricked it + // into thinking CombineTo was used which prevented the target combines from + // running. + // Earlying out here to avoid regressions like this + // (v4i32 (sext (v4i1 (setcc (v4i16))))) + // Becomes + // (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef)))) + // Type legalized to + // (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32))))))) + // Leading to a packssdw+pmovsxwd + // We could write a DAG combine to fix this, but really we shouldn't be + // creating sext_invec that's forcing v8i16 into the DAG. + if (N0.getOpcode() == ISD::SETCC) + return SDValue(); + // Input type must be a vector and we must be extending legal integer types. if (!VT.isVector() || VT.getVectorNumElements() < 2) return SDValue(); @@ -38575,7 +39327,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, return V; if (VT.isVector()) - if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget)) + if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget)) return R; if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) @@ -38746,7 +39498,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, return V; if (VT.isVector()) - if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget)) + if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget)) return R; if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) @@ -38899,7 +39651,9 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, // NOTE: The element count check is to ignore operand types that need to // go through type promotion to a 128-bit vector. if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() && - VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 && + VT.getVectorElementType() == MVT::i1 && + (ExperimentalVectorWideningLegalization || + VT.getVectorNumElements() > 4) && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16)) { SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS, @@ -38920,10 +39674,11 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue Src = N->getOperand(0); MVT SrcVT = Src.getSimpleValueType(); + MVT VT = N->getSimpleValueType(0); // Perform constant folding. if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) { - assert(N->getValueType(0) == MVT::i32 && "Unexpected result type"); + assert(VT== MVT::i32 && "Unexpected result type"); APInt Imm(32, 0); for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) { SDValue In = Src.getOperand(Idx); @@ -38931,7 +39686,7 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, cast(In)->getAPIntValue().isNegative()) Imm.setBit(Idx); } - return DAG.getConstant(Imm, SDLoc(N), N->getValueType(0)); + return DAG.getConstant(Imm, SDLoc(N), VT); } // Look through int->fp bitcasts that don't change the element width. @@ -38941,11 +39696,10 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, EVT(SrcVT).changeVectorElementTypeToInteger()) Src = Src.getOperand(0); + // Simplify the inputs. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - - // MOVMSK only uses the MSB from each vector element. - APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits())); - if (TLI.SimplifyDemandedBits(Src, DemandedMask, DCI)) + APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) return SDValue(N, 0); // Combine (movmsk (setne (and X, (1 << C)), 0)) -> (movmsk (X << C)). @@ -39232,6 +39986,159 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static bool needCarryOrOverflowFlag(SDValue Flags) { + assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); + + for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + + X86::CondCode CC; + switch (User->getOpcode()) { + default: + // Be conservative. + return true; + case X86ISD::SETCC: + case X86ISD::SETCC_CARRY: + CC = (X86::CondCode)User->getConstantOperandVal(0); + break; + case X86ISD::BRCOND: + CC = (X86::CondCode)User->getConstantOperandVal(2); + break; + case X86ISD::CMOV: + CC = (X86::CondCode)User->getConstantOperandVal(2); + break; + } + + switch (CC) { + default: break; + case X86::COND_A: case X86::COND_AE: + case X86::COND_B: case X86::COND_BE: + case X86::COND_O: case X86::COND_NO: + case X86::COND_G: case X86::COND_GE: + case X86::COND_L: case X86::COND_LE: + return true; + } + } + + return false; +} + +static bool onlyZeroFlagUsed(SDValue Flags) { + assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); + + for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + + unsigned CCOpNo; + switch (User->getOpcode()) { + default: + // Be conservative. + return false; + case X86ISD::SETCC: CCOpNo = 0; break; + case X86ISD::SETCC_CARRY: CCOpNo = 0; break; + case X86ISD::BRCOND: CCOpNo = 2; break; + case X86ISD::CMOV: CCOpNo = 2; break; + } + + X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo); + if (CC != X86::COND_E && CC != X86::COND_NE) + return false; + } + + return true; +} + +static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { + // Only handle test patterns. + if (!isNullConstant(N->getOperand(1))) + return SDValue(); + + // If we have a CMP of a truncated binop, see if we can make a smaller binop + // and use its flags directly. + // TODO: Maybe we should try promoting compares that only use the zero flag + // first if we can prove the upper bits with computeKnownBits? + SDLoc dl(N); + SDValue Op = N->getOperand(0); + EVT VT = Op.getValueType(); + + // If we have a constant logical shift that's only used in a comparison + // against zero turn it into an equivalent AND. This allows turning it into + // a TEST instruction later. + if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) && + Op.hasOneUse() && isa(Op.getOperand(1)) && + onlyZeroFlagUsed(SDValue(N, 0))) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned ShAmt = Op.getConstantOperandVal(1); + if (ShAmt < BitWidth) { // Avoid undefined shifts. + APInt Mask = Op.getOpcode() == ISD::SRL + ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) + : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); + if (Mask.isSignedIntN(32)) { + Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), + DAG.getConstant(Mask, dl, VT)); + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, dl, VT)); + } + } + } + + + // Look for a truncate with a single use. + if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse()) + return SDValue(); + + Op = Op.getOperand(0); + + // Arithmetic op can only have one use. + if (!Op.hasOneUse()) + return SDValue(); + + unsigned NewOpc; + switch (Op.getOpcode()) { + default: return SDValue(); + case ISD::AND: + // Skip and with constant. We have special handling for and with immediate + // during isel to generate test instructions. + if (isa(Op.getOperand(1))) + return SDValue(); + NewOpc = X86ISD::AND; + break; + case ISD::OR: NewOpc = X86ISD::OR; break; + case ISD::XOR: NewOpc = X86ISD::XOR; break; + case ISD::ADD: + // If the carry or overflow flag is used, we can't truncate. + if (needCarryOrOverflowFlag(SDValue(N, 0))) + return SDValue(); + NewOpc = X86ISD::ADD; + break; + case ISD::SUB: + // If the carry or overflow flag is used, we can't truncate. + if (needCarryOrOverflowFlag(SDValue(N, 0))) + return SDValue(); + NewOpc = X86ISD::SUB; + break; + } + + // We found an op we can narrow. Truncate its inputs. + SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1)); + + // Use a X86 specific opcode to avoid DAG combine messing with it. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1); + + // For AND, keep a CMP so that we can match the test pattern. + if (NewOpc == X86ISD::AND) + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, dl, VT)); + + // Return the flags. + return Op.getValue(1); +} + static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { MVT VT = N->getSimpleValueType(0); @@ -39278,21 +40185,6 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit -/// which is more useful than 0/1 in some cases. -static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) { - SDLoc DL(N); - // "Condition code B" is also known as "the carry flag" (CF). - SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8); - SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS); - MVT VT = N->getSimpleValueType(0); - if (VT == MVT::i8) - return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT)); - - assert(VT == MVT::i1 && "Unexpected type for SETCC node"); - return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB); -} - /// If this is an add or subtract where one operand is produced by a cmp+setcc, /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} /// with CMP+{ADC, SBB}. @@ -39363,13 +40255,11 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { } if (CC == X86::COND_B) { - // X + SETB Z --> X + (mask SBB Z, Z) - // X - SETB Z --> X - (mask SBB Z, Z) - // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY? - SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG); - if (SBB.getValueSizeInBits() != VT.getSizeInBits()) - SBB = DAG.getZExtOrTrunc(SBB, DL, VT); - return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); + // X + SETB Z --> adc X, 0 + // X - SETB Z --> sbb X, 0 + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(0, DL, VT), Y.getOperand(1)); } if (CC == X86::COND_A) { @@ -39387,10 +40277,9 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); - SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG); - if (SBB.getValueSizeInBits() != VT.getSizeInBits()) - SBB = DAG.getZExtOrTrunc(SBB, DL, VT); - return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(0, DL, VT), NewEFLAGS); } } @@ -39728,6 +40617,39 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, PMADDBuilder); } +// Try to turn (add (umax X, C), -C) into (psubus X, C) +static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasSSE2()) + return SDValue(); + + EVT VT = N->getValueType(0); + + // psubus is available in SSE2 for i8 and i16 vectors. + if (!VT.isVector() || VT.getVectorNumElements() < 2 || + !isPowerOf2_32(VT.getVectorNumElements()) || + !(VT.getVectorElementType() == MVT::i8 || + VT.getVectorElementType() == MVT::i16)) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() != ISD::UMAX) + return SDValue(); + + // The add should have a constant that is the negative of the max. + // TODO: Handle build_vectors with undef elements. + auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) { + return Max->getAPIntValue() == (-Op->getAPIntValue()); + }; + if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchUSUBSAT)) + return SDValue(); + + SDLoc DL(N); + return DAG.getNode(ISD::USUBSAT, DL, VT, Op0.getOperand(0), + Op0.getOperand(1)); +} + // Attempt to turn this pattern into PMADDWD. // (mul (add (zext (build_vector)), (zext (build_vector))), // (add (zext (build_vector)), (zext (build_vector))) @@ -39883,6 +40805,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineIncDecVector(N, DAG)) return V; + if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget)) + return V; + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -39928,16 +40853,16 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, } else return SDValue(); - auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef Ops) { - return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops); + auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef Ops) { + return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops); }; // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with // special preprocessing in some cases. if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64) return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, - { SubusLHS, SubusRHS }, SUBUSBuilder); + { SubusLHS, SubusRHS }, USUBSATBuilder); // Special preprocessing case can be only applied // if the value was zero extended from 16 bit, @@ -39969,7 +40894,7 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType); SDValue Psubus = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType, - { NewSubusLHS, NewSubusRHS }, SUBUSBuilder); + { NewSubusLHS, NewSubusRHS }, USUBSATBuilder); // Zero extend the result, it may be used somewhere as 32 bit, // if not zext and following trunc will shrink. return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType); @@ -40022,98 +40947,6 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, return combineAddOrSubToADCOrSBB(N, DAG); } -static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - SDLoc DL(N); - unsigned Opcode = N->getOpcode(); - MVT VT = N->getSimpleValueType(0); - MVT SVT = VT.getVectorElementType(); - unsigned NumElts = VT.getVectorNumElements(); - unsigned EltSizeInBits = SVT.getSizeInBits(); - - SDValue Op = N->getOperand(0); - MVT OpVT = Op.getSimpleValueType(); - MVT OpEltVT = OpVT.getVectorElementType(); - unsigned OpEltSizeInBits = OpEltVT.getSizeInBits(); - unsigned InputBits = OpEltSizeInBits * NumElts; - - // Perform any constant folding. - // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled. - APInt UndefElts; - SmallVector EltBits; - if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) { - APInt Undefs(NumElts, 0); - SmallVector Vals(NumElts, APInt(EltSizeInBits, 0)); - bool IsZEXT = - (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG); - for (unsigned i = 0; i != NumElts; ++i) { - if (UndefElts[i]) { - Undefs.setBit(i); - continue; - } - Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits) - : EltBits[i].sextOrTrunc(EltSizeInBits); - } - return getConstVector(Vals, Undefs, VT, DAG, DL); - } - - // (vzext (bitcast (vzext (x)) -> (vzext x) - // TODO: (vsext (bitcast (vsext (x)) -> (vsext x) - SDValue V = peekThroughBitcasts(Op); - if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) { - MVT InnerVT = V.getSimpleValueType(); - MVT InnerEltVT = InnerVT.getVectorElementType(); - - // If the element sizes match exactly, we can just do one larger vzext. This - // is always an exact type match as vzext operates on integer types. - if (OpEltVT == InnerEltVT) { - assert(OpVT == InnerVT && "Types must match for vzext!"); - return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); - } - - // The only other way we can combine them is if only a single element of the - // inner vzext is used in the input to the outer vzext. - if (InnerEltVT.getSizeInBits() < InputBits) - return SDValue(); - - // In this case, the inner vzext is completely dead because we're going to - // only look at bits inside of the low element. Just do the outer vzext on - // a bitcast of the input to the inner. - return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V)); - } - - // Check if we can bypass extracting and re-inserting an element of an input - // vector. Essentially: - // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) - // TODO: Add X86ISD::VSEXT support - if (Opcode == X86ISD::VZEXT && - V.getOpcode() == ISD::SCALAR_TO_VECTOR && - V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { - SDValue ExtractedV = V.getOperand(0); - SDValue OrigV = ExtractedV.getOperand(0); - if (isNullConstant(ExtractedV.getOperand(1))) { - MVT OrigVT = OrigV.getSimpleValueType(); - // Extract a subvector if necessary... - if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { - int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); - OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), - OrigVT.getVectorNumElements() / Ratio); - OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, - DAG.getIntPtrConstant(0, DL)); - } - Op = DAG.getBitcast(OpVT, OrigV); - return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); - } - } - - return SDValue(); -} - static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); @@ -40121,9 +40954,9 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, if (N->getOperand(0) == N->getOperand(1)) { if (N->getOpcode() == X86ISD::PCMPEQ) - return getOnesVector(VT, DAG, DL); + return DAG.getConstant(-1, DL, VT); if (N->getOpcode() == X86ISD::PCMPGT) - return getZeroVector(VT, Subtarget, DAG, DL); + return DAG.getConstant(0, DL, VT); } return SDValue(); @@ -40364,13 +41197,20 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0)); } } - if ((InOpcode == X86ISD::VZEXT || InOpcode == X86ISD::VSEXT) && + if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) && OpVT.is128BitVector() && InVec.getOperand(0).getSimpleValueType().is128BitVector()) { - unsigned ExtOp = InOpcode == X86ISD::VZEXT ? ISD::ZERO_EXTEND_VECTOR_INREG - : ISD::SIGN_EXTEND_VECTOR_INREG; + unsigned ExtOp = + InOpcode == ISD::ZERO_EXTEND ? ISD::ZERO_EXTEND_VECTOR_INREG + : ISD::SIGN_EXTEND_VECTOR_INREG; return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0)); } + if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || + InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) && + OpVT.is128BitVector() && + InVec.getOperand(0).getSimpleValueType().is128BitVector()) { + return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0)); + } if (InOpcode == ISD::BITCAST) { // TODO - do this for target shuffles in general. SDValue InVecBC = peekThroughOneUseBitcasts(InVec); @@ -40448,6 +41288,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); + case X86ISD::CMP: return combineCMP(N, DAG); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); case X86ISD::SBB: return combineSBB(N, DAG); @@ -40479,6 +41320,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); + case X86ISD::CVTSI2P: + case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); @@ -40489,14 +41332,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); case X86ISD::PACKSS: case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget); + case X86ISD::VSHL: + case X86ISD::VSRA: + case X86ISD::VSRL: + return combineVectorShiftVar(N, DAG, DCI, Subtarget); case X86ISD::VSHLI: case X86ISD::VSRAI: case X86ISD::VSRLI: return combineVectorShiftImm(N, DAG, DCI, Subtarget); - case ISD::SIGN_EXTEND_VECTOR_INREG: - case ISD::ZERO_EXTEND_VECTOR_INREG: - case X86ISD::VSEXT: - case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget); case X86ISD::PINSRB: case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles @@ -40558,10 +41401,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } -/// Return true if the target has native support for the specified value type -/// and it is 'desirable' to use the type for the given node type. e.g. On x86 -/// i16 is legal, but undesirable since i16 instruction encodings are longer and -/// some i16 instructions are slow. bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (!isTypeLegal(VT)) return false; @@ -40570,26 +41409,37 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8) return false; - if (VT != MVT::i16) - return true; - - switch (Opc) { - default: - return true; - case ISD::LOAD: - case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: - case ISD::ANY_EXTEND: - case ISD::SHL: - case ISD::SRL: - case ISD::SUB: - case ISD::ADD: - case ISD::MUL: - case ISD::AND: - case ISD::OR: - case ISD::XOR: + // 8-bit multiply is probably not much cheaper than 32-bit multiply, and + // we have specializations to turn 32-bit multiply into LEA or other ops. + // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally + // check for a constant operand to the multiply. + if (Opc == ISD::MUL && VT == MVT::i8) return false; + + // i16 instruction encodings are longer and some i16 instructions are slow, + // so those are not desirable. + if (VT == MVT::i16) { + switch (Opc) { + default: + break; + case ISD::LOAD: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + case ISD::SHL: + case ISD::SRL: + case ISD::SUB: + case ISD::ADD: + case ISD::MUL: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + return false; + } } + + // Any legal type not explicitly accounted for above here is desirable. + return true; } SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl, @@ -40608,12 +41458,16 @@ SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl, return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG); } -/// This method query the target whether it is beneficial for dag combiner to -/// promote the specified node. If true, it should return the desired promotion -/// type by reference. bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { EVT VT = Op.getValueType(); - if (VT != MVT::i16) + bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL && + isa(Op.getOperand(1)); + + // i16 is legal, but undesirable since i16 instruction encodings are longer + // and some i16 instructions are slow. + // 8-bit multiply-by-constant can usually be expanded to something cheaper + // using LEA and/or other ALU ops. + if (VT != MVT::i16 && !Is8BitMulByConstant) return false; auto IsFoldableRMW = [](SDValue Load, SDValue Op) { diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 7cda0259bf27bdd837cdf1d0314deb816776354c..17fd315a2b4c5f5e50cb9dd083dda5c0068ff14f 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -226,14 +226,6 @@ namespace llvm { SCALEF, SCALEFS, - // Integer add/sub with unsigned saturation. - ADDUS, - SUBUS, - - // Integer add/sub with signed saturation. - ADDS, - SUBS, - // Unsigned Integer average. AVG, @@ -295,11 +287,6 @@ namespace llvm { // Vector move to low scalar and zero higher vector elements. VZEXT_MOVL, - // Vector integer zero-extend. - VZEXT, - // Vector integer signed-extend. - VSEXT, - // Vector integer truncate. VTRUNC, // Vector integer truncate with unsigned/signed saturation. @@ -876,15 +863,13 @@ namespace llvm { bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, + const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override; SDValue unwrapAddress(SDValue N) const override; - bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA, - int64_t &Offset) const override; - SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; bool ExpandInlineAsm(CallInst *CI) const override; @@ -1046,10 +1031,15 @@ namespace llvm { bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const override; + bool convertSelectOfConstantsToMath(EVT VT) const override; bool decomposeMulByConstant(EVT VT, SDValue C) const override; + bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, + bool IsSigned) const override; + /// Return true if EXTRACT_SUBVECTOR is cheap for this result type /// with this index. bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, @@ -1365,11 +1355,6 @@ namespace llvm { MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const; - /// Emit nodes that will be selected as "test Op0,Op0", or something - /// equivalent, for use with the given x86 condition code. - SDValue EmitTest(SDValue Op0, unsigned X86CC, const SDLoc &dl, - SelectionDAG &DAG) const; - /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent, for use with the given x86 condition code. SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, diff --git a/lib/Target/X86/X86InsertPrefetch.cpp b/lib/Target/X86/X86InsertPrefetch.cpp new file mode 100644 index 0000000000000000000000000000000000000000..30b46a09ef0f90131127c47f366eb09ce8ce4530 --- /dev/null +++ b/lib/Target/X86/X86InsertPrefetch.cpp @@ -0,0 +1,253 @@ +//===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass applies cache prefetch instructions based on a profile. The pass +// assumes DiscriminateMemOps ran immediately before, to ensure debug info +// matches the one used at profile generation time. The profile is encoded in +// afdo format (text or binary). It contains prefetch hints recommendations. +// Each recommendation is made in terms of debug info locations, a type (i.e. +// nta, t{0|1|2}) and a delta. The debug info identifies an instruction with a +// memory operand (see X86DiscriminateMemOps). The prefetch will be made for +// a location at that memory operand + the delta specified in the +// recommendation. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/ProfileData/SampleProfReader.h" +#include "llvm/Transforms/IPO/SampleProfile.h" +using namespace llvm; +using namespace sampleprof; + +static cl::opt + PrefetchHintsFile("prefetch-hints-file", + cl::desc("Path to the prefetch hints profile."), + cl::Hidden); +namespace { + +class X86InsertPrefetch : public MachineFunctionPass { + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool doInitialization(Module &) override; + + bool runOnMachineFunction(MachineFunction &MF) override; + struct PrefetchInfo { + unsigned InstructionID; + int64_t Delta; + }; + typedef SmallVectorImpl Prefetches; + bool findPrefetchInfo(const FunctionSamples *Samples, const MachineInstr &MI, + Prefetches &prefetches) const; + +public: + static char ID; + X86InsertPrefetch(const std::string &PrefetchHintsFilename); + StringRef getPassName() const override { + return "X86 Insert Cache Prefetches"; + } + +private: + std::string Filename; + std::unique_ptr Reader; +}; + +using PrefetchHints = SampleRecord::CallTargetMap; + +// Return any prefetching hints for the specified MachineInstruction. The hints +// are returned as pairs (name, delta). +ErrorOr getPrefetchHints(const FunctionSamples *TopSamples, + const MachineInstr &MI) { + if (const auto &Loc = MI.getDebugLoc()) + if (const auto *Samples = TopSamples->findFunctionSamples(Loc)) + return Samples->findCallTargetMapAt(FunctionSamples::getOffset(Loc), + Loc->getBaseDiscriminator()); + return std::error_code(); +} + +// The prefetch instruction can't take memory operands involving vector +// registers. +bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) { + unsigned BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg(); + unsigned IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg(); + return (BaseReg == 0 || + X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) || + X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) && + (IndexReg == 0 || + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)); +} + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// Implementation +//===----------------------------------------------------------------------===// + +char X86InsertPrefetch::ID = 0; + +X86InsertPrefetch::X86InsertPrefetch(const std::string &PrefetchHintsFilename) + : MachineFunctionPass(ID), Filename(PrefetchHintsFilename) {} + +/// Return true if the provided MachineInstruction has cache prefetch hints. In +/// that case, the prefetch hints are stored, in order, in the Prefetches +/// vector. +bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples, + const MachineInstr &MI, + Prefetches &Prefetches) const { + assert(Prefetches.empty() && + "Expected caller passed empty PrefetchInfo vector."); + static const std::pair HintTypes[] = { + {"_nta_", X86::PREFETCHNTA}, + {"_t0_", X86::PREFETCHT0}, + {"_t1_", X86::PREFETCHT1}, + {"_t2_", X86::PREFETCHT2}, + }; + static const char *SerializedPrefetchPrefix = "__prefetch"; + + const ErrorOr T = getPrefetchHints(TopSamples, MI); + if (!T) + return false; + int16_t max_index = -1; + // Convert serialized prefetch hints into PrefetchInfo objects, and populate + // the Prefetches vector. + for (const auto &S_V : *T) { + StringRef Name = S_V.getKey(); + if (Name.consume_front(SerializedPrefetchPrefix)) { + int64_t D = static_cast(S_V.second); + unsigned IID = 0; + for (const auto &HintType : HintTypes) { + if (Name.startswith(HintType.first)) { + Name = Name.drop_front(HintType.first.size()); + IID = HintType.second; + break; + } + } + if (IID == 0) + return false; + uint8_t index = 0; + Name.consumeInteger(10, index); + + if (index >= Prefetches.size()) + Prefetches.resize(index + 1); + Prefetches[index] = {IID, D}; + max_index = std::max(max_index, static_cast(index)); + } + } + assert(max_index + 1 >= 0 && + "Possible overflow: max_index + 1 should be positive."); + assert(static_cast(max_index + 1) == Prefetches.size() && + "The number of prefetch hints received should match the number of " + "PrefetchInfo objects returned"); + return !Prefetches.empty(); +} + +bool X86InsertPrefetch::doInitialization(Module &M) { + if (Filename.empty()) + return false; + + LLVMContext &Ctx = M.getContext(); + ErrorOr> ReaderOrErr = + SampleProfileReader::create(Filename, Ctx); + if (std::error_code EC = ReaderOrErr.getError()) { + std::string Msg = "Could not open profile: " + EC.message(); + Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg, + DiagnosticSeverity::DS_Warning)); + return false; + } + Reader = std::move(ReaderOrErr.get()); + Reader->read(); + return true; +} + +void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired(); +} + +bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) { + if (!Reader) + return false; + const FunctionSamples *Samples = Reader->getSamplesFor(MF.getFunction()); + if (!Samples) + return false; + + bool Changed = false; + + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + SmallVector Prefetches; + for (auto &MBB : MF) { + for (auto MI = MBB.instr_begin(); MI != MBB.instr_end();) { + auto Current = MI; + ++MI; + + int Offset = X86II::getMemoryOperandNo(Current->getDesc().TSFlags); + if (Offset < 0) + continue; + unsigned Bias = X86II::getOperandBias(Current->getDesc()); + int MemOpOffset = Offset + Bias; + // FIXME(mtrofin): ORE message when the recommendation cannot be taken. + if (!IsMemOpCompatibleWithPrefetch(*Current, MemOpOffset)) + continue; + Prefetches.clear(); + if (!findPrefetchInfo(Samples, *Current, Prefetches)) + continue; + assert(!Prefetches.empty() && + "The Prefetches vector should contain at least a value if " + "findPrefetchInfo returned true."); + for (auto &PrefInfo : Prefetches) { + unsigned PFetchInstrID = PrefInfo.InstructionID; + int64_t Delta = PrefInfo.Delta; + const MCInstrDesc &Desc = TII->get(PFetchInstrID); + MachineInstr *PFetch = + MF.CreateMachineInstr(Desc, Current->getDebugLoc(), true); + MachineInstrBuilder MIB(MF, PFetch); + + assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 && + X86::AddrIndexReg == 2 && X86::AddrDisp == 3 && + X86::AddrSegmentReg == 4 && + "Unexpected change in X86 operand offset order."); + + // This assumes X86::AddBaseReg = 0, {...}ScaleAmt = 1, etc. + // FIXME(mtrofin): consider adding a: + // MachineInstrBuilder::set(unsigned offset, op). + MIB.addReg(Current->getOperand(MemOpOffset + X86::AddrBaseReg).getReg()) + .addImm( + Current->getOperand(MemOpOffset + X86::AddrScaleAmt).getImm()) + .addReg( + Current->getOperand(MemOpOffset + X86::AddrIndexReg).getReg()) + .addImm(Current->getOperand(MemOpOffset + X86::AddrDisp).getImm() + + Delta) + .addReg(Current->getOperand(MemOpOffset + X86::AddrSegmentReg) + .getReg()); + + if (!Current->memoperands_empty()) { + MachineMemOperand *CurrentOp = *(Current->memoperands_begin()); + MIB.addMemOperand(MF.getMachineMemOperand( + CurrentOp, CurrentOp->getOffset() + Delta, CurrentOp->getSize())); + } + + // Insert before Current. This is because Current may clobber some of + // the registers used to describe the input memory operand. + MBB.insert(Current, PFetch); + Changed = true; + } + } + } + return Changed; +} + +FunctionPass *llvm::createX86InsertPrefetchPass() { + return new X86InsertPrefetch(PrefetchHintsFile); +} diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index f8ade37f8dfce5351e772587071d72d8b7957426..7e60b9caf05b8d83d7c79853e4983b5997649cca 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -4830,13 +4830,13 @@ defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, SchedWriteVecALU, 1>; defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub, SchedWriteVecALU, 0>; -defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds, +defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", saddsat, SchedWriteVecALU, HasBWI, 1>; -defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs, +defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", ssubsat, SchedWriteVecALU, HasBWI, 0>; -defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus, +defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", uaddsat, SchedWriteVecALU, HasBWI, 1>; -defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus, +defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", usubsat, SchedWriteVecALU, HasBWI, 0>; defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, SchedWritePMULLD, HasAVX512, 1>, T8PD; @@ -8790,7 +8790,7 @@ let Predicates = [HasVLX] in { (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4)))), FR32X)) >; } -// Unordered/Ordered scalar fp compare with Sea and set EFLAGS +// Unordered/Ordered scalar fp compare with Sae and set EFLAGS multiclass avx512_ord_cmp_sae opc, X86VectorVTInfo _, string OpcodeStr, X86FoldableSchedWrite sched> { let hasSideEffects = 0 in @@ -9528,7 +9528,7 @@ multiclass WriteShuffle256_BD opc, string OpcodeStr, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; defm Z256: WriteShuffle256_common, + v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { @@ -9547,12 +9547,12 @@ multiclass WriteShuffle256_BQ opc, string OpcodeStr, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG; defm Z256: WriteShuffle256_common, + v16i8x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { defm Z : WriteShuffle256_common, + v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG; } } @@ -9585,7 +9585,7 @@ multiclass WriteShuffle256_WQ opc, string OpcodeStr, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; defm Z256: WriteShuffle256_common, + v8i16x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { @@ -9615,23 +9615,107 @@ multiclass WriteShuffle256_DQ opc, string OpcodeStr, } } -defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", zext, zext_invec, "z", WriteShuffle256>; + +defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", sext, sext_invec, "s", WriteShuffle256>; + + +// Patterns that we also need any extend versions of. aext_vector_inreg +// is currently legalized to zext_vector_inreg. +multiclass AVX512_pmovx_patterns_base { + // 256-bit patterns + let Predicates = [HasVLX, HasBWI] in { + def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), + (!cast(OpcPrefix#BWZ256rm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWZ256rm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWZ256rm) addr:$src)>; + } + + let Predicates = [HasVLX] in { + def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), + (!cast(OpcPrefix#WDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDZ256rm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), + (!cast(OpcPrefix#DQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQZ256rm) addr:$src)>; + } + + // 512-bit patterns + let Predicates = [HasBWI] in { + def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))), + (!cast(OpcPrefix#BWZrm) addr:$src)>; + } + let Predicates = [HasAVX512] in { + def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))), + (!cast(OpcPrefix#BDZrm) addr:$src)>; + def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))), + (!cast(OpcPrefix#WDZrm) addr:$src)>; + + def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))), + (!cast(OpcPrefix#WQZrm) addr:$src)>; + + def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))), + (!cast(OpcPrefix#DQZrm) addr:$src)>; + } +} + +multiclass AVX512_pmovx_patterns_aext : + AVX512_pmovx_patterns_base { + let Predicates = [HasVLX, HasBWI] in { + def : Pat<(v16i16 (ExtOp (v16i8 VR128X:$src))), + (!cast(OpcPrefix#BWZ256rr) VR128X:$src)>; + } -defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", WriteShuffle256>; + let Predicates = [HasVLX] in { + def : Pat<(v8i32 (ExtOp (v8i16 VR128X:$src))), + (!cast(OpcPrefix#WDZ256rr) VR128X:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128X:$src))), + (!cast(OpcPrefix#DQZ256rr) VR128X:$src)>; + } + + // 512-bit patterns + let Predicates = [HasBWI] in { + def : Pat<(v32i16 (ExtOp (v32i8 VR256X:$src))), + (!cast(OpcPrefix#BWZrr) VR256X:$src)>; + } + let Predicates = [HasAVX512] in { + def : Pat<(v16i32 (ExtOp (v16i8 VR128X:$src))), + (!cast(OpcPrefix#BDZrr) VR128X:$src)>; + def : Pat<(v16i32 (ExtOp (v16i16 VR256X:$src))), + (!cast(OpcPrefix#WDZrr) VR256X:$src)>; + + def : Pat<(v8i64 (ExtOp (v8i16 VR128X:$src))), + (!cast(OpcPrefix#WQZrr) VR128X:$src)>; + + def : Pat<(v8i64 (ExtOp (v8i32 VR256X:$src))), + (!cast(OpcPrefix#DQZrr) VR256X:$src)>; + } +} multiclass AVX512_pmovx_patterns { + SDNode InVecOp> : + AVX512_pmovx_patterns_base { // 128-bit patterns let Predicates = [HasVLX, HasBWI] in { def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), @@ -9695,84 +9779,70 @@ multiclass AVX512_pmovx_patterns(OpcPrefix#DQZ128rm) addr:$src)>; } - // 256-bit patterns - let Predicates = [HasVLX, HasBWI] in { - def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), - (!cast(OpcPrefix#BWZ256rm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWZ256rm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWZ256rm) addr:$src)>; - } let Predicates = [HasVLX] in { - def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))), + def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))), + def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), - (!cast(OpcPrefix#WDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDZ256rm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))), + def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), - (!cast(OpcPrefix#DQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQZ256rm) addr:$src)>; } // 512-bit patterns - let Predicates = [HasBWI] in { - def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))), - (!cast(OpcPrefix#BWZrm) addr:$src)>; - } let Predicates = [HasAVX512] in { - def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))), - (!cast(OpcPrefix#BDZrm) addr:$src)>; - - def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#BQZrm) addr:$src)>; - def : Pat<(v8i64 (ExtOp (loadv16i8 addr:$src))), + def : Pat<(v8i64 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQZrm) addr:$src)>; + } +} - def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))), - (!cast(OpcPrefix#WDZrm) addr:$src)>; - - def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))), - (!cast(OpcPrefix#WQZrm) addr:$src)>; +defm : AVX512_pmovx_patterns<"VPMOVSX", sext, sext_invec>; +defm : AVX512_pmovx_patterns<"VPMOVZX", zext, zext_invec>; +defm : AVX512_pmovx_patterns_aext<"VPMOVZX", anyext>; - def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))), - (!cast(OpcPrefix#DQZrm) addr:$src)>; - } +// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge +// ext+trunc aggresively making it impossible to legalize the DAG to this +// pattern directly. +let Predicates = [HasAVX512, NoBWI] in { +def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))), + (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; +def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))), + (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>; +def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst), + (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; } -defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec>; -defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec>; +// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge +// ext+trunc aggresively making it impossible to legalize the DAG to this +// pattern directly. +let Predicates = [HasAVX512, NoBWI] in { +def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))), + (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; +def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))), + (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>; +def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst), + (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; +} //===----------------------------------------------------------------------===// // GATHER - SCATTER Operations diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 94e460ff36283a2a20f2f2f87a80c7e42cf98d3d..3288fe22f65ac003fa501d6d9712a7f9d3ae90aa 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -913,8 +913,8 @@ multiclass ArithBinOp_RF BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let Defs = [EFLAGS] in { let Constraints = "$src1 = $dst" in { let isCommutable = CommutableRR in { - def NAME#8rr : BinOpRR_RF; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#8rr : BinOpRR_RF; def NAME#16rr : BinOpRR_RF; def NAME#32rr : BinOpRR_RF; def NAME#64rr : BinOpRR_RF; @@ -931,9 +931,9 @@ multiclass ArithBinOp_RF BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32rm : BinOpRM_RF; def NAME#64rm : BinOpRM_RF; - def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; - let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>; diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 71b43a38dc2cd994070cbad16728338cd3995ee6..529d054d081a82c9bacd2cbd335126fdb0f56a12 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -143,7 +143,7 @@ def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size), // These instructions XOR the frame pointer into a GPR. They are used in some // stack protection schemes. These are post-RA pseudos because we only know the // frame register after register allocation. -let Constraints = "$src = $dst", isPseudo = 1, Defs = [EFLAGS] in { +let Constraints = "$src = $dst", isMoveImm = 1, isPseudo = 1, Defs = [EFLAGS] in { def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src), "xorl\t$$FP, $src", []>, Requires<[NotLP64]>, Sched<[WriteALU]>; @@ -270,7 +270,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>; // Alias instruction mapping movr0 to xor. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, - isPseudo = 1, AddedComplexity = 10 in + isPseudo = 1, isMoveImm = 1, AddedComplexity = 10 in def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", [(set GR32:$dst, 0)]>, Sched<[WriteZero]>; @@ -362,29 +362,20 @@ def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), (SETBr)>; -// (add OP, SETB) -> (adc OP, 0) -def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op), - (ADC8ri GR8:$op, 0)>; -def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op), - (ADC32ri8 GR32:$op, 0)>; -def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op), - (ADC64ri8 GR64:$op, 0)>; - -// (sub OP, SETB) -> (sbb OP, 0) -def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)), - (SBB8ri GR8:$op, 0)>; -def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)), - (SBB32ri8 GR32:$op, 0)>; -def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)), - (SBB64ri8 GR64:$op, 0)>; - -// (sub OP, SETCC_CARRY) -> (adc OP, 0) -def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))), - (ADC8ri GR8:$op, 0)>; -def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))), - (ADC32ri8 GR32:$op, 0)>; -def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), - (ADC64ri8 GR64:$op, 0)>; +// Patterns to give priority when both inputs are zero so that we don't use +// an immediate for the RHS. +// TODO: Should we use a 32-bit sbb for 8/16 to push the extract_subreg out? +def : Pat<(X86sbb_flag (i8 0), (i8 0), EFLAGS), + (SBB8rr (EXTRACT_SUBREG (MOV32r0), sub_8bit), + (EXTRACT_SUBREG (MOV32r0), sub_8bit))>; +def : Pat<(X86sbb_flag (i16 0), (i16 0), EFLAGS), + (SBB16rr (EXTRACT_SUBREG (MOV32r0), sub_16bit), + (EXTRACT_SUBREG (MOV32r0), sub_16bit))>; +def : Pat<(X86sbb_flag (i32 0), (i32 0), EFLAGS), + (SBB32rr (MOV32r0), (MOV32r0))>; +def : Pat<(X86sbb_flag (i64 0), (i64 0), EFLAGS), + (SBB64rr (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit), + (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit))>; //===----------------------------------------------------------------------===// // String Pseudo Instructions @@ -1320,6 +1311,15 @@ def : Pat<(i64 (anyext GR16:$src)), def : Pat<(i64 (anyext GR32:$src)), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, sub_32bit)>; +// If this is an anyext of the remainder of an 8-bit sdivrem, use a MOVSX +// instead of a MOVZX. The sdivrem lowering will emit emit a MOVSX to move +// %ah to the lower byte of a register. By using a MOVSX here we allow a +// post-isel peephole to merge the two MOVSX instructions into one. +def anyext_sdiv : PatFrag<(ops node:$lhs), (anyext node:$lhs),[{ + return (N->getOperand(0).getOpcode() == ISD::SDIVREM && + N->getOperand(0).getResNo() == 1); +}]>; +def : Pat<(i32 (anyext_sdiv GR8:$src)), (MOVSX32rr8 GR8:$src)>; // Any instruction that defines a 32-bit result leaves the high half of the // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 7bc8d0aa5309ad385a4e199ba76d12c5a8cd2c48..0b98abaa7a267b4592765fdbd9f9e8099594cfeb 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -103,16 +103,6 @@ def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -def X86vzext : SDNode<"X86ISD::VZEXT", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisOpSmallerThanOp<1, 0>]>>; - -def X86vsext : SDNode<"X86ISD::VSEXT", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisOpSmallerThanOp<1, 0>]>>; - def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>]>; @@ -237,10 +227,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; -def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp, [SDNPCommutative]>; -def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; -def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp, [SDNPCommutative]>; -def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>; def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>; def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index fe26389050c2e4a2b3399dae11ca8fe9983e3641..40de049bfe8bab671a64ff346ff0f8e358ab7c0d 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -718,7 +718,7 @@ bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const { } /// Check whether the shift count for a machine operand is non-zero. -inline static unsigned getTruncatedShiftCount(MachineInstr &MI, +inline static unsigned getTruncatedShiftCount(const MachineInstr &MI, unsigned ShiftAmtOperandIdx) { // The shift count is six bits with the REX.W prefix and five bits without. unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31; @@ -739,8 +739,7 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned Opc, bool AllowSP, unsigned &NewSrc, - bool &isKill, bool &isUndef, - MachineOperand &ImplicitOp, + bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV) const { MachineFunction &MF = *MI.getParent()->getParent(); const TargetRegisterClass *RC; @@ -757,7 +756,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, if (Opc != X86::LEA64_32r) { NewSrc = SrcReg; isKill = Src.isKill(); - isUndef = Src.isUndef(); + assert(!Src.isUndef() && "Undef op doesn't need optimization"); if (TargetRegisterInfo::isVirtualRegister(NewSrc) && !MF.getRegInfo().constrainRegClass(NewSrc, RC)) @@ -774,7 +773,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, NewSrc = getX86SubSuperRegister(Src.getReg(), 64); isKill = Src.isKill(); - isUndef = Src.isUndef(); + assert(!Src.isUndef() && "Undef op doesn't need optimization"); } else { // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. @@ -786,7 +785,6 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, // Which is obviously going to be dead after we're done with it. isKill = true; - isUndef = false; if (LV) LV->replaceKillInstruction(SrcReg, MI, *Copy); @@ -796,88 +794,99 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, return true; } -/// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit -/// LEA to form 3-address code by promoting to a 32-bit superregister and then -/// truncating back down to a 16-bit subregister. MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const { - MachineBasicBlock::iterator MBBI = MI.getIterator(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); - bool isDead = MI.getOperand(0).isDead(); - bool isKill = MI.getOperand(1).isKill(); - + // We handle 8-bit adds and various 16-bit opcodes in the switch below. + bool Is16BitOp = !(MIOpc == X86::ADD8rr || MIOpc == X86::ADD8ri); MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); - unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass); - unsigned Opc, leaInReg; - if (Subtarget.is64Bit()) { - Opc = X86::LEA64_32r; - leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); - } else { - Opc = X86::LEA32r; - leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); - } + assert((!Is16BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits( + *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) && + "Unexpected type for LEA transform"); + + // TODO: For a 32-bit target, we need to adjust the LEA variables with + // something like this: + // Opcode = X86::LEA32r; + // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + // OutRegLEA = + // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass) + // : RegInfo.createVirtualRegister(&X86::GR32RegClass); + if (!Subtarget.is64Bit()) + return nullptr; + + unsigned Opcode = X86::LEA64_32r; + unsigned InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + unsigned OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass); // Build and insert into an implicit UNDEF value. This is OK because - // well be shifting and then extracting the lower 16-bits. + // we will be shifting and then extracting the lower 8/16-bits. // This has the potential to cause partial register stall. e.g. // movw (%rbp,%rcx,2), %dx // leal -65(%rdx), %esi // But testing has shown this *does* help performance in 64-bit mode (at // least on modern x86 machines). - BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + unsigned Dest = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + bool IsDead = MI.getOperand(0).isDead(); + bool IsKill = MI.getOperand(1).isKill(); + unsigned SubReg = Is16BitOp ? X86::sub_16bit : X86::sub_8bit; + assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization"); + BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA); MachineInstr *InsMI = BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY)) - .addReg(leaInReg, RegState::Define, X86::sub_16bit) - .addReg(Src, getKillRegState(isKill)); + .addReg(InRegLEA, RegState::Define, SubReg) + .addReg(Src, getKillRegState(IsKill)); MachineInstrBuilder MIB = - BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opc), leaOutReg); + BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA); switch (MIOpc) { default: llvm_unreachable("Unreachable!"); case X86::SHL16ri: { unsigned ShAmt = MI.getOperand(2).getImm(); MIB.addReg(0).addImm(1ULL << ShAmt) - .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0); + .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0); break; } case X86::INC16r: - addRegOffset(MIB, leaInReg, true, 1); + addRegOffset(MIB, InRegLEA, true, 1); break; case X86::DEC16r: - addRegOffset(MIB, leaInReg, true, -1); + addRegOffset(MIB, InRegLEA, true, -1); break; + case X86::ADD8ri: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri_DB: case X86::ADD16ri8_DB: - addRegOffset(MIB, leaInReg, true, MI.getOperand(2).getImm()); + addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm()); break; + case X86::ADD8rr: case X86::ADD16rr: case X86::ADD16rr_DB: { unsigned Src2 = MI.getOperand(2).getReg(); - bool isKill2 = MI.getOperand(2).isKill(); - unsigned leaInReg2 = 0; + bool IsKill2 = MI.getOperand(2).isKill(); + assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization"); + unsigned InRegLEA2 = 0; MachineInstr *InsMI2 = nullptr; if (Src == Src2) { - // ADD16rr killed %reg1028, %reg1028 + // ADD8rr/ADD16rr killed %reg1028, %reg1028 // just a single insert_subreg. - addRegReg(MIB, leaInReg, true, leaInReg, false); + addRegReg(MIB, InRegLEA, true, InRegLEA, false); } else { if (Subtarget.is64Bit()) - leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); else - leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); // Build and insert into an implicit UNDEF value. This is OK because - // well be shifting and then extracting the lower 16-bits. - BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2); + // we will be shifting and then extracting the lower 8/16-bits. + BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA2); InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY)) - .addReg(leaInReg2, RegState::Define, X86::sub_16bit) - .addReg(Src2, getKillRegState(isKill2)); - addRegReg(MIB, leaInReg, true, leaInReg2, true); + .addReg(InRegLEA2, RegState::Define, SubReg) + .addReg(Src2, getKillRegState(IsKill2)); + addRegReg(MIB, InRegLEA, true, InRegLEA2, true); } - if (LV && isKill2 && InsMI2) + if (LV && IsKill2 && InsMI2) LV->replaceKillInstruction(Src2, MI, *InsMI2); break; } @@ -886,16 +895,16 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( MachineInstr *NewMI = MIB; MachineInstr *ExtMI = BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY)) - .addReg(Dest, RegState::Define | getDeadRegState(isDead)) - .addReg(leaOutReg, RegState::Kill, X86::sub_16bit); + .addReg(Dest, RegState::Define | getDeadRegState(IsDead)) + .addReg(OutRegLEA, RegState::Kill, SubReg); if (LV) { - // Update live variables - LV->getVarInfo(leaInReg).Kills.push_back(NewMI); - LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI); - if (isKill) + // Update live variables. + LV->getVarInfo(InRegLEA).Kills.push_back(NewMI); + LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI); + if (IsKill) LV->replaceKillInstruction(Src, MI, *InsMI); - if (isDead) + if (IsDead) LV->replaceKillInstruction(Dest, MI, *ExtMI); } @@ -926,12 +935,18 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, const MachineOperand &Dest = MI.getOperand(0); const MachineOperand &Src = MI.getOperand(1); + // Ideally, operations with undef should be folded before we get here, but we + // can't guarantee it. Bail out because optimizing undefs is a waste of time. + // Without this, we have to forward undef state to new register operands to + // avoid machine verifier errors. + if (Src.isUndef()) + return nullptr; + if (MI.getNumOperands() > 2) + if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef()) + return nullptr; + MachineInstr *NewMI = nullptr; - // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When - // we have better subtarget support, enable the 16-bit LEA generation here. - // 16-bit LEA is also slow on Core2. - bool DisableLEA16 = true; - bool is64Bit = Subtarget.is64Bit(); + bool Is64Bit = Subtarget.is64Bit(); unsigned MIOpc = MI.getOpcode(); switch (MIOpc) { @@ -961,14 +976,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; - unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; // LEA can't handle ESP. - bool isKill, isUndef; + bool isKill; unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp, LV)) + SrcReg, isKill, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = @@ -976,7 +991,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, .add(Dest) .addReg(0) .addImm(1ULL << ShAmt) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addReg(SrcReg, getKillRegState(isKill)) .addImm(0) .addReg(0); if (ImplicitOp.getReg() != 0) @@ -988,37 +1003,26 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SHL16ri: { assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; - - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) - : nullptr; - NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)) - .add(Dest) - .addReg(0) - .addImm(1ULL << ShAmt) - .add(Src) - .addImm(0) - .addReg(0); - break; + if (!isTruncatedShiftCountForLEA(ShAmt)) + return nullptr; + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); } case X86::INC64r: case X86::INC32r: { assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!"); - unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r - : (is64Bit ? X86::LEA64_32r : X86::LEA32r); - bool isKill, isUndef; + unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r : + (Is64Bit ? X86::LEA64_32r : X86::LEA32r); + bool isKill; unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp, LV)) + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, + ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef)); + .addReg(SrcReg, getKillRegState(isKill)); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); @@ -1026,30 +1030,23 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } case X86::INC16r: - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) - : nullptr; - assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!"); - NewMI = addOffset( - BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), 1); - break; + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); case X86::DEC64r: case X86::DEC32r: { assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r - : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); - bool isKill, isUndef; + bool isKill; unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp, LV)) + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, + ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) - .addReg(SrcReg, getUndefRegState(isUndef) | - getKillRegState(isKill)); + .addReg(SrcReg, getKillRegState(isKill)); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); @@ -1058,13 +1055,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } case X86::DEC16r: - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) - : nullptr; - assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); - NewMI = addOffset( - BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), -1); - break; + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); case X86::ADD64rr: case X86::ADD64rr_DB: case X86::ADD32rr: @@ -1074,21 +1065,21 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) Opc = X86::LEA64r; else - Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; - bool isKill, isUndef; + bool isKill; unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp, LV)) + SrcReg, isKill, ImplicitOp, LV)) return nullptr; const MachineOperand &Src2 = MI.getOperand(2); - bool isKill2, isUndef2; + bool isKill2; unsigned SrcReg2; MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, - SrcReg2, isKill2, isUndef2, ImplicitOp2, LV)) + SrcReg2, isKill2, ImplicitOp2, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest); @@ -1098,36 +1089,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MIB.add(ImplicitOp2); NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); - - // Preserve undefness of the operands. - NewMI->getOperand(1).setIsUndef(isUndef); - NewMI->getOperand(3).setIsUndef(isUndef2); - if (LV && Src2.isKill()) LV->replaceKillInstruction(SrcReg2, MI, *NewMI); break; } + case X86::ADD8rr: case X86::ADD16rr: - case X86::ADD16rr_DB: { - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) - : nullptr; - assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); - unsigned Src2 = MI.getOperand(2).getReg(); - bool isKill2 = MI.getOperand(2).isKill(); - NewMI = addRegReg(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest), - Src.getReg(), Src.isKill(), Src2, isKill2); - - // Preserve undefness of the operands. - bool isUndef = MI.getOperand(1).isUndef(); - bool isUndef2 = MI.getOperand(2).isUndef(); - NewMI->getOperand(1).setIsUndef(isUndef); - NewMI->getOperand(3).setIsUndef(isUndef2); - - if (LV && isKill2) - LV->replaceKillInstruction(Src2, MI, *NewMI); - break; - } + case X86::ADD16rr_DB: + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD64ri32_DB: @@ -1142,38 +1111,30 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD32ri_DB: case X86::ADD32ri8_DB: { assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); - unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; - bool isKill, isUndef; + bool isKill; unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp, LV)) + SrcReg, isKill, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) - .addReg(SrcReg, getUndefRegState(isUndef) | - getKillRegState(isKill)); + .addReg(SrcReg, getKillRegState(isKill)); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); NewMI = addOffset(MIB, MI.getOperand(2)); break; } + case X86::ADD8ri: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri_DB: case X86::ADD16ri8_DB: - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) - : nullptr; - assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addOffset( - BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), - MI.getOperand(2)); - break; - + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); case X86::VMOVDQU8Z128rmk: case X86::VMOVDQU8Z256rmk: case X86::VMOVDQU8Zrmk: @@ -3257,9 +3218,9 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, } } -bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg, - int64_t &Offset, - const TargetRegisterInfo *TRI) const { +bool X86InstrInfo::getMemOperandWithOffset( + MachineInstr &MemOp, MachineOperand *&BaseOp, int64_t &Offset, + const TargetRegisterInfo *TRI) const { const MCInstrDesc &Desc = MemOp.getDesc(); int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); if (MemRefBegin < 0) @@ -3267,11 +3228,10 @@ bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg, MemRefBegin += X86II::getOperandBias(Desc); - MachineOperand &BaseMO = MemOp.getOperand(MemRefBegin + X86::AddrBaseReg); - if (!BaseMO.isReg()) // Can be an MO_FrameIndex + BaseOp = &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg); + if (!BaseOp->isReg()) // Can be an MO_FrameIndex return false; - BaseReg = BaseMO.getReg(); if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1) return false; @@ -3287,6 +3247,8 @@ bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg, Offset = DispMO.getImm(); + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } @@ -3459,9 +3421,10 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, /// This function can be extended later on. /// SrcReg, SrcRegs: register operands for FlagI. /// ImmValue: immediate for FlagI if it takes an immediate. -inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg, - unsigned SrcReg2, int ImmMask, - int ImmValue, MachineInstr &OI) { +inline static bool isRedundantFlagInstr(const MachineInstr &FlagI, + unsigned SrcReg, unsigned SrcReg2, + int ImmMask, int ImmValue, + const MachineInstr &OI) { if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) || (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) || (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) || @@ -3492,7 +3455,7 @@ inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg, /// Check whether the definition can be converted /// to remove a comparison against zero. -inline static bool isDefConvertible(MachineInstr &MI) { +inline static bool isDefConvertible(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return false; @@ -3592,12 +3555,16 @@ inline static bool isDefConvertible(MachineInstr &MI) { case X86::BLSFILL64rr: case X86::BLSFILL64rm: case X86::BLSIC32rr: case X86::BLSIC32rm: case X86::BLSIC64rr: case X86::BLSIC64rm: + case X86::T1MSKC32rr: case X86::T1MSKC32rm: + case X86::T1MSKC64rr: case X86::T1MSKC64rm: + case X86::TZMSK32rr: case X86::TZMSK32rm: + case X86::TZMSK64rr: case X86::TZMSK64rm: return true; } } /// Check whether the use can be converted to remove a comparison against zero. -static X86::CondCode isUseDefConvertible(MachineInstr &MI) { +static X86::CondCode isUseDefConvertible(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return X86::COND_INVALID; case X86::LZCNT16rr: case X86::LZCNT16rm: @@ -3612,12 +3579,12 @@ static X86::CondCode isUseDefConvertible(MachineInstr &MI) { case X86::TZCNT32rr: case X86::TZCNT32rm: case X86::TZCNT64rr: case X86::TZCNT64rm: return X86::COND_B; - case X86::BSF16rr: - case X86::BSF16rm: - case X86::BSF32rr: - case X86::BSF32rm: - case X86::BSF64rr: - case X86::BSF64rm: + case X86::BSF16rr: case X86::BSF16rm: + case X86::BSF32rr: case X86::BSF32rm: + case X86::BSF64rr: case X86::BSF64rm: + case X86::BSR16rr: case X86::BSR16rm: + case X86::BSR32rr: case X86::BSR32rm: + case X86::BSR64rr: case X86::BSR64rm: return X86::COND_E; } } @@ -7830,5 +7797,5 @@ X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB, return It; } -#define GET_TII_HELPERS +#define GET_INSTRINFO_HELPERS #include "X86GenInstrInfo.inc" diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index f3965db4fe7c5c114b110650d0a4fbdd95f49c46..159cb50afc5c017fcb10ffb52f85f0528d88a1a1 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -258,7 +258,7 @@ public: /// operand to the LEA instruction. bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc, - bool &isKill, bool &isUndef, MachineOperand &ImplicitOp, + bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV) const; /// convertToThreeAddress - This method must be implemented by targets that @@ -327,9 +327,9 @@ public: SmallVectorImpl &Cond, bool AllowModify) const override; - bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, - int64_t &Offset, - const TargetRegisterInfo *TRI) const override; + bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp, + int64_t &Offset, + const TargetRegisterInfo *TRI) const override; bool analyzeBranchPredicate(MachineBasicBlock &MBB, TargetInstrInfo::MachineBranchPredicate &MBP, bool AllowModify = false) const override; @@ -558,7 +558,7 @@ public: MachineBasicBlock::iterator &It, MachineFunction &MF, const outliner::Candidate &C) const override; -#define GET_TII_HELPER_DECLS +#define GET_INSTRINFO_HELPER_DECLS #include "X86GenInstrInfo.inc" protected: @@ -584,6 +584,9 @@ protected: const MachineOperand *&Destination) const override; private: + /// This is a helper for convertToThreeAddress for 8 and 16-bit instructions. + /// We use 32-bit LEA to form 3-address code by promoting to a 32-bit + /// super-register and then truncating back down to a 8/16-bit sub-register. MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 992e9543b33dac2ccea6bf90d6b48afb7f98726f..fe9b530e8ea1a33e17dc054c5ec8bc6a3c0d74ec 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -17,10 +17,6 @@ // X86 specific DAG Nodes. // -def SDTIntShiftDOp: SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, - SDTCisInt<0>, SDTCisInt<3>]>; - def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; @@ -1493,7 +1489,7 @@ def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", []>; } -let isReMaterializable = 1, isAsCheapAsAMove = 1 in { +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", [(set GR8:$dst, imm:$src)]>; @@ -1507,7 +1503,7 @@ def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, i64immSExt32:$src)]>; } -let isReMaterializable = 1 in { +let isReMaterializable = 1, isMoveImm = 1 in { def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), "movabs{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, relocImm:$src)]>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 85e4fd3856339c3eb9e89d083353872e2fa96478..94cd5a611f2ca78cb3c8da3c2c682bd1365d714e 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3623,13 +3623,13 @@ defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, SchedWriteVecALU, 1, NoVLX>; defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, SchedWriteVecALU, 1, NoVLX>; -defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8, +defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16, +defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8, +defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16, +defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; @@ -3645,13 +3645,13 @@ defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, SchedWriteVecALU, 0, NoVLX>; defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, SchedWriteVecALU, 0, NoVLX>; -defm PSUBSB : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8, +defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8, SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; -defm PSUBSW : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16, +defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16, SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; -defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, +defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8, SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; -defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, +defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16, SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; @@ -5198,34 +5198,72 @@ defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>; defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>; -// AVX2 Patterns -multiclass SS41I_pmovx_avx2_patterns { +// Patterns that we also need for any_extend. +// Any_extend_vector_inreg is currently legalized to zero_extend_vector_inreg. +multiclass SS41I_pmovx_avx2_patterns_base { // Register-Register patterns - let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { - def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), - (!cast(OpcPrefix#BWYrr) VR128:$src)>; + let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), + (!cast(OpcPrefix#BWYrr) VR128:$src)>; } - let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), + + let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), + (!cast(OpcPrefix#WDYrr) VR128:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), + (!cast(OpcPrefix#DQYrr) VR128:$src)>; + } + + // AVX2 Register-Memory patterns + let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), + (!cast(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWYrm) addr:$src)>; + } + + let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), + (!cast(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), + (!cast(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQYrm) addr:$src)>; + } +} + +// AVX2 Patterns +multiclass SS41I_pmovx_avx2_patterns : + SS41I_pmovx_avx2_patterns_base { + + // Register-Register patterns + let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))), (!cast(OpcPrefix#BDYrr) VR128:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), + def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))), (!cast(OpcPrefix#BQYrr) VR128:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), - (!cast(OpcPrefix#WDYrr) VR128:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), + def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))), (!cast(OpcPrefix#WQYrr) VR128:$src)>; - - def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), - (!cast(OpcPrefix#DQYrr) VR128:$src)>; } // Simple Register-Memory patterns - let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i16 (!cast(ExtTy#"extloadvi8") addr:$src)), (!cast(OpcPrefix#BWYrm) addr:$src)>; } - let Predicates = [HasAVX, NoVLX] in { + let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v8i32 (!cast(ExtTy#"extloadvi8") addr:$src)), (!cast(OpcPrefix#BDYrm) addr:$src)>; def : Pat<(v4i64 (!cast(ExtTy#"extloadvi8") addr:$src)), @@ -5241,60 +5279,39 @@ multiclass SS41I_pmovx_avx2_patterns(OpcPrefix#BWYrm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWYrm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWYrm) addr:$src)>; - } - let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))), + def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))), + def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), - (!cast(OpcPrefix#WDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDYrm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))), + def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WQYrm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), - (!cast(OpcPrefix#DQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQYrm) addr:$src)>; } } -defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>; -defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>; +defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>; +defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>; +defm : SS41I_pmovx_avx2_patterns_base<"VPMOVZX", anyext>; // SSE4.1/AVX patterns. multiclass SS41I_pmovx_patterns; def UopsIssuedPfmCounter : PfmCounter<"uops_issued:any">; +// No default counters on X86. +def DefaultPfmCounters : ProcPfmCounters {} +def : PfmCountersDefaultBinding; + +// Intel X86 Counters. +def PentiumPfmCounters : ProcPfmCounters { + let CycleCounter = PfmCounter<"cpu_clk_unhalted">; + let UopsCounter = PfmCounter<"uops_retired">; +} +def : PfmCountersBinding<"pentiumpro", PentiumPfmCounters>; +def : PfmCountersBinding<"pentium2", PentiumPfmCounters>; +def : PfmCountersBinding<"pentium3", PentiumPfmCounters>; +def : PfmCountersBinding<"pentium3m", PentiumPfmCounters>; +def : PfmCountersBinding<"pentium-m", PentiumPfmCounters>; + +def CorePfmCounters : ProcPfmCounters { + let CycleCounter = UnhaltedCoreCyclesPfmCounter; + let UopsCounter = PfmCounter<"uops_retired:any">; +} +def : PfmCountersBinding<"yonah", CorePfmCounters>; +def : PfmCountersBinding<"prescott", CorePfmCounters>; +def : PfmCountersBinding<"core2", CorePfmCounters>; +def : PfmCountersBinding<"penryn", CorePfmCounters>; +def : PfmCountersBinding<"nehalem", CorePfmCounters>; +def : PfmCountersBinding<"corei7", CorePfmCounters>; +def : PfmCountersBinding<"westmere", CorePfmCounters>; + +def AtomPfmCounters : ProcPfmCounters { + let CycleCounter = UnhaltedCoreCyclesPfmCounter; + let UopsCounter = PfmCounter<"uops_retired:any">; +} +def : PfmCountersBinding<"bonnell", AtomPfmCounters>; +def : PfmCountersBinding<"atom", AtomPfmCounters>; + +def SLMPfmCounters : ProcPfmCounters { + let CycleCounter = UnhaltedCoreCyclesPfmCounter; + let UopsCounter = PfmCounter<"uops_retired:any">; +} +def : PfmCountersBinding<"silvermont", SLMPfmCounters>; +def : PfmCountersBinding<"goldmont", SLMPfmCounters>; +def : PfmCountersBinding<"goldmont-plus", SLMPfmCounters>; +def : PfmCountersBinding<"tremont", SLMPfmCounters>; + +def KnightPfmCounters : ProcPfmCounters { + let CycleCounter = UnhaltedCoreCyclesPfmCounter; + let UopsCounter = PfmCounter<"uops_retired:all">; +} +def : PfmCountersBinding<"knl", KnightPfmCounters>; +def : PfmCountersBinding<"knm", KnightPfmCounters>; + def SandyBridgePfmCounters : ProcPfmCounters { let CycleCounter = UnhaltedCoreCyclesPfmCounter; let UopsCounter = UopsIssuedPfmCounter; @@ -26,6 +76,7 @@ def SandyBridgePfmCounters : ProcPfmCounters { ]; } def : PfmCountersBinding<"sandybridge", SandyBridgePfmCounters>; +def : PfmCountersBinding<"ivybridge", SandyBridgePfmCounters>; def HaswellPfmCounters : ProcPfmCounters { let CycleCounter = UnhaltedCoreCyclesPfmCounter; @@ -90,6 +141,31 @@ def SkylakeServerPfmCounters : ProcPfmCounters { ]; } def : PfmCountersBinding<"skylake-avx512", SkylakeServerPfmCounters>; +def : PfmCountersBinding<"cascadelake", SkylakeServerPfmCounters>; +def : PfmCountersBinding<"cannonlake", SkylakeServerPfmCounters>; +def : PfmCountersBinding<"icelake-client", SkylakeServerPfmCounters>; +def : PfmCountersBinding<"icelake-server", SkylakeServerPfmCounters>; + +// AMD X86 Counters. +// Set basic counters for AMD cpus that we know libpfm4 supports. +def DefaultAMDPfmCounters : ProcPfmCounters { + let CycleCounter = PfmCounter<"cpu_clk_unhalted">; + let UopsCounter = PfmCounter<"retired_uops">; +} +def : PfmCountersBinding<"athlon", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"athlon-tbird", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"athlon-4", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"athlon-xp", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"athlon-mp", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"k8", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"opteron", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"athlon64", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"athlon-fx", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"k8-sse3", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"opteron-sse3", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"athlon64-sse3", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"amdfam10", DefaultAMDPfmCounters>; +def : PfmCountersBinding<"barcelona", DefaultAMDPfmCounters>; def BdVer2PfmCounters : ProcPfmCounters { let CycleCounter = PfmCounter<"cpu_clk_unhalted">; @@ -101,8 +177,31 @@ def BdVer2PfmCounters : ProcPfmCounters { PfmIssueCounter<"PdFPU3", "dispatched_fpu_ops:ops_pipe3 + dispatched_fpu_ops:ops_dual_pipe3"> ]; } +def : PfmCountersBinding<"bdver1", BdVer2PfmCounters>; def : PfmCountersBinding<"bdver2", BdVer2PfmCounters>; +def BdVer3PfmCounters : ProcPfmCounters { + let CycleCounter = PfmCounter<"cpu_clk_unhalted">; + let UopsCounter = PfmCounter<"retired_uops">; + let IssueCounters = [ + PfmIssueCounter<"SrFPU0", "dispatched_fpu_ops:ops_pipe0 + dispatched_fpu_ops:ops_dual_pipe0">, + PfmIssueCounter<"SrFPU1", "dispatched_fpu_ops:ops_pipe1 + dispatched_fpu_ops:ops_dual_pipe1">, + PfmIssueCounter<"SrFPU2", "dispatched_fpu_ops:ops_pipe2 + dispatched_fpu_ops:ops_dual_pipe2"> + ]; +} +def : PfmCountersBinding<"bdver3", BdVer3PfmCounters>; +def : PfmCountersBinding<"bdver4", BdVer3PfmCounters>; + +def BtVer1PfmCounters : ProcPfmCounters { + let CycleCounter = PfmCounter<"cpu_clk_unhalted">; + let UopsCounter = PfmCounter<"retired_uops">; + let IssueCounters = [ + PfmIssueCounter<"BtFPU0", "dispatched_fpu:pipe0">, + PfmIssueCounter<"BtFPU1", "dispatched_fpu:pipe1"> + ]; +} +def : PfmCountersBinding<"btver1", BtVer1PfmCounters>; + def BtVer2PfmCounters : ProcPfmCounters { let CycleCounter = PfmCounter<"cpu_clk_unhalted">; let UopsCounter = PfmCounter<"retired_uops">; @@ -112,3 +211,16 @@ def BtVer2PfmCounters : ProcPfmCounters { ]; } def : PfmCountersBinding<"btver2", BtVer2PfmCounters>; + +def ZnVer1PfmCounters : ProcPfmCounters { + let CycleCounter = PfmCounter<"cycles_not_in_halt">; + let UopsCounter = PfmCounter<"retired_uops">; + let IssueCounters = [ + PfmIssueCounter<"ZnFPU0", "fpu_pipe_assignment:total0">, + PfmIssueCounter<"ZnFPU1", "fpu_pipe_assignment:total1">, + PfmIssueCounter<"ZnFPU2", "fpu_pipe_assignment:total2">, + PfmIssueCounter<"ZnFPU3", "fpu_pipe_assignment:total3">, + PfmIssueCounter<"ZnDivider", "div_op_count"> + ]; +} +def : PfmCountersBinding<"znver1", ZnVer1PfmCounters>; diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td index 028729057fe6ef7ba97c2d5ec83d152eb76e3e48..971a50196e45a477d2b30d9fd45daf4e1a09be3c 100644 --- a/lib/Target/X86/X86SchedBroadwell.td +++ b/lib/Target/X86/X86SchedBroadwell.td @@ -878,10 +878,10 @@ def BWWriteResGroup45 : SchedWriteRes<[BWPort0156]> { } def: InstRW<[BWWriteResGroup45], (instrs FNCLEX)>; -def BWWriteResGroup46 : SchedWriteRes<[BWPort015,BWPort0156]> { - let Latency = 4; +def BWWriteResGroup46 : SchedWriteRes<[]> { + let Latency = 0; let NumMicroOps = 4; - let ResourceCycles = [1,3]; + let ResourceCycles = []; } def: InstRW<[BWWriteResGroup46], (instrs VZEROUPPER)>; @@ -1229,7 +1229,7 @@ def BWWriteResGroup112 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> { let NumMicroOps = 5; let ResourceCycles = [1,1,3]; } -def: InstRW<[BWWriteResGroup112], (instregex "RDRAND(16|32|64)r")>; +def: InstRW<[BWWriteResGroup112], (instrs RDRAND16r, RDRAND32r, RDRAND64r)>; def BWWriteResGroup113 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> { let Latency = 9; diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 799ba1efdde3a7e12fde57d6ed9625f99e129040..06a32fb0b1cdfb490ee7d8cce6872465bfaf739d 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -720,7 +720,7 @@ def HWWriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> { let NumMicroOps = 17; let ResourceCycles = [1, 16]; } -def : InstRW<[HWWriteRDRAND], (instregex "RDRAND(16|32|64)r")>; +def : InstRW<[HWWriteRDRAND], (instrs RDRAND16r, RDRAND32r, RDRAND64r)>; //=== Floating Point x87 Instructions ===// //-- Move instructions --// @@ -1408,10 +1408,10 @@ def HWWriteResGroup81 : SchedWriteRes<[HWPort0156]> { } def: InstRW<[HWWriteResGroup81], (instrs FNCLEX)>; -def HWWriteResGroup82 : SchedWriteRes<[HWPort015,HWPort0156]> { - let Latency = 4; +def HWWriteResGroup82 : SchedWriteRes<[]> { + let Latency = 0; let NumMicroOps = 4; - let ResourceCycles = [1,3]; + let ResourceCycles = []; } def: InstRW<[HWWriteResGroup82], (instrs VZEROUPPER)>; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index 569ae366eaafdfa1d0614d820253211aab45d73e..9dbf0976989f83595ca0d715bd12bd3bd95b544c 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -1112,6 +1112,13 @@ def SBWriteResGroupVzeroall : SchedWriteRes<[SBPort5]> { } def: InstRW<[SBWriteResGroupVzeroall], (instrs VZEROALL)>; +def SBWriteResGroupVzeroupper : SchedWriteRes<[]> { + let Latency = 1; + let NumMicroOps = 4; + let ResourceCycles = []; +} +def: InstRW<[SBWriteResGroupVzeroupper], (instrs VZEROUPPER)>; + def: InstRW<[WriteZero], (instrs CLC)>; // Intruction variants handled by the renamer. These might not need execution diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td index d4a3eb07b982771643611326db277f43b25c7c49..2c9eb7516085c1d6e63da3a91a142e8c2fc314be 100644 --- a/lib/Target/X86/X86SchedSkylakeClient.td +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -897,10 +897,10 @@ def SKLWriteResGroup55 : SchedWriteRes<[SKLPort6,SKLPort0156]> { } def: InstRW<[SKLWriteResGroup55], (instrs PAUSE)>; -def SKLWriteResGroup56 : SchedWriteRes<[SKLPort015,SKLPort0156]> { - let Latency = 4; +def SKLWriteResGroup56 : SchedWriteRes<[]> { + let Latency = 0; let NumMicroOps = 4; - let ResourceCycles = [1,3]; + let ResourceCycles = []; } def: InstRW<[SKLWriteResGroup56], (instrs VZEROUPPER)>; diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td index cbcb6a6e58bb2b0aadb2bc276f414035b39312ee..ec8e4db02d8aa59474082ea9cc3a6e7d99cd49f2 100644 --- a/lib/Target/X86/X86SchedSkylakeServer.td +++ b/lib/Target/X86/X86SchedSkylakeServer.td @@ -1009,10 +1009,10 @@ def SKXWriteResGroup55 : SchedWriteRes<[SKXPort0156]> { } def: InstRW<[SKXWriteResGroup55], (instrs FNCLEX)>; -def SKXWriteResGroup56 : SchedWriteRes<[SKXPort015,SKXPort0156]> { - let Latency = 4; +def SKXWriteResGroup56 : SchedWriteRes<[]> { + let Latency = 0; let NumMicroOps = 4; - let ResourceCycles = [1,3]; + let ResourceCycles = []; } def: InstRW<[SKXWriteResGroup56], (instrs VZEROUPPER)>; diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td index bc5d112c2f4f23895d007f7a10869eeba1ab5a4f..5798e1b2671b9581558c5e7e764485466821246c 100644 --- a/lib/Target/X86/X86ScheduleBdVer2.td +++ b/lib/Target/X86/X86ScheduleBdVer2.td @@ -130,18 +130,22 @@ def PdFPU : ProcResGroup<[PdFPU0, PdFPU1, PdFPU2, PdFPU3]> { // Load-Store Units // -// FIXME: does this even make sense? - -def PdLoad : ProcResGroup<[PdAGLU01]> { +let Super = PdAGLU01 in +def PdLoad : ProcResource<2> { // For Piledriver, the load queue is 40 entries deep. let BufferSize = 40; } -def PdStore : ProcResGroup<[PdAGLU01]> { +def PdLoadQueue : LoadQueue; + +let Super = PdAGLU01 in +def PdStore : ProcResource<1> { // For Piledriver, the store queue is 24 entries deep. let BufferSize = 24; } +def PdStoreQueue : StoreQueue; + //===----------------------------------------------------------------------===// // Integer Execution Units // diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index 23113e0f0df94cdffbda4b4c1994350586687bc4..a866f843106b641439136fbf30324a3d3a2b3fd9 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -790,7 +790,7 @@ def : InstRW<[WriteMicrocoded], (instregex "RDTSC")>; def : InstRW<[WriteMicrocoded], (instrs RDPMC)>; // RDRAND. -def : InstRW<[WriteMicrocoded], (instregex "RDRAND(16|32|64)r")>; +def : InstRW<[WriteMicrocoded], (instrs RDRAND16r, RDRAND32r, RDRAND64r)>; // XGETBV. def : InstRW<[WriteMicrocoded], (instrs XGETBV)>; diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp index b8cb11fb862eb5402037e7780f7410f3e8c54adc..a729161a1beb5c0fe90210ba3a368fe9f74cb157 100644 --- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -1141,7 +1141,9 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( unsigned TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB); // Insert a comparison of the incoming target register with this block's - // address. + // address. This also requires us to mark the block as having its address + // taken explicitly. + MBB.setHasAddressTaken(); auto InsertPt = MBB.SkipPHIsLabelsAndDebug(MBB.begin()); if (MF.getTarget().getCodeModel() == CodeModel::Small && !Subtarget->isPositionIndependent()) { diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 6426e5b076cc8c2cc0b2793cccac324fbafdb801..afcb49dc22632f8c475f1dac861c2822c16601c1 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -57,7 +57,7 @@ static cl::opt EnableMachineCombinerPass("x86-machine-combiner", static cl::opt EnableCondBrFoldingPass("x86-condbr-folding", cl::desc("Enable the conditional branch " "folding pass"), - cl::init(true), cl::Hidden); + cl::init(false), cl::Hidden); extern "C" void LLVMInitializeX86Target() { // Register the target. @@ -78,6 +78,7 @@ extern "C" void LLVMInitializeX86Target() { initializeX86AvoidSFBPassPass(PR); initializeX86SpeculativeLoadHardeningPassPass(PR); initializeX86FlagsCopyLoweringPassPass(PR); + initializeX86CondBrFoldingPassPass(PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -189,10 +190,13 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional CM, - bool JIT, bool Is64Bit) { - if (CM) +static CodeModel::Model getEffectiveX86CodeModel(Optional CM, + bool JIT, bool Is64Bit) { + if (CM) { + if (*CM == CodeModel::Tiny) + report_fatal_error("Target does not support the tiny CodeModel"); return *CM; + } if (JIT) return Is64Bit ? CodeModel::Large : CodeModel::Small; return CodeModel::Small; @@ -209,7 +213,8 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine( T, computeDataLayout(TT), TT, CPU, FS, Options, getEffectiveRelocModel(TT, JIT, RM), - getEffectiveCodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL), + getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64), + OL), TLOF(createTLOF(getTargetTriple())) { // Windows stack unwinder gets confused when execution flow "falls through" // after a call to 'noreturn' function. @@ -497,6 +502,8 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86FixupLEAs()); addPass(createX86EvexToVexInsts()); } + addPass(createX86DiscriminateMemOpsPass()); + addPass(createX86InsertPrefetchPass()); } void X86PassConfig::addPreEmitPass2() { diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index ebb8aca5fb146a17f41d1f29c3f103650fc26937..7889322159262d3962050614431e93a9798f4473 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -612,9 +612,18 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for XOP lowering tricks. - if (ST->hasXOP()) - if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second)) + if (ST->hasXOP()) { + // If the right shift is constant then we'll fold the negation so + // it's as cheap as a left shift. + int ShiftISD = ISD; + if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) + ShiftISD = ISD::SHL; + if (const auto *Entry = + CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second)) return LT.first * Entry->Cost; + } static const CostTblEntry SSE2UniformShiftCostTable[] = { // Uniform splats are cheaper for the following instructions. @@ -872,6 +881,20 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, if (Kind == TTI::SK_Broadcast) LT.first = 1; + // Subvector extractions are free if they start at the beginning of a + // vector and cheap if the subvectors are aligned. + if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { + int NumElts = LT.second.getVectorNumElements(); + if ((Index % NumElts) == 0) + return 0; + std::pair SubLT = TLI->getTypeLegalizationCost(DL, SubTp); + if (SubLT.second.isVector()) { + int NumSubElts = SubLT.second.getVectorNumElements(); + if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) + return SubLT.first; + } + } + // We are going to permute multiple sources and the result will be in multiple // destinations. Providing an accurate cost only for splits where the element // type remains the same. @@ -909,15 +932,15 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, } static const CostTblEntry AVX512VBMIShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb - { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb + {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb + {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb - { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb - { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb + {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb + {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb - { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b - { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b - { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b + {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b + {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b + {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b }; if (ST->hasVBMI()) @@ -926,25 +949,25 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first * Entry->Cost; static const CostTblEntry AVX512BWShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw - { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb + {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb - { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2 + {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw + {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw + {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 - { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw - { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw - { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw - { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16 - { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc + {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 + {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc - { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w - { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w - { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w - { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc - { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1 - { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc + {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w + {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w + {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w + {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc + {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 + {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc }; if (ST->hasBWI()) @@ -953,42 +976,42 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first * Entry->Cost; static const CostTblEntry AVX512ShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd - { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps - { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq - { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd - - { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd - { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps - { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq - { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd - - { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd - { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd - { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd - { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps - { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps - { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps - { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq - { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq - { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq - { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd - { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd - { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd - { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb - - { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd - { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps - { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q - { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d - { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd - { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps - { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q - { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d - { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd - { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps - { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q - { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d + {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd + {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps + {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq + {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd + + {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd + {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps + {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq + {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd + + {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd + {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd + {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd + {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps + {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps + {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps + {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq + {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq + {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq + {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd + {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd + {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd + {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb + + {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd + {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps + {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q + {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d + {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd + {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps + {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q + {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d + {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd + {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps + {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q + {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d }; if (ST->hasAVX512()) @@ -996,40 +1019,40 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first * Entry->Cost; static const CostTblEntry AVX2ShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd - { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps - { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq - { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd - { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw - { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb - - { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd - { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps - { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq - { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd - { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb - { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb - - { TTI::SK_Select, MVT::v16i16, 1 }, // vpblendvb - { TTI::SK_Select, MVT::v32i8, 1 }, // vpblendvb - - { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd - { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps - { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq - { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd - { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb + {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd + {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps + {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq + {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd + {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb + + {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd + {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps + {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq + {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd + {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb + {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb + + {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb + {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb + + {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd + {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps + {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq + {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd + {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb // + vpblendvb - { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb + {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb // + vpblendvb - { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd - { TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps - { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd - { TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd - { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb - // + vpblendvb - { TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2*vperm2i128 + 4*vpshufb - // + vpblendvb + {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd + {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps + {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd + {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd + {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb + // + vpblendvb + {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb + // + vpblendvb }; if (ST->hasAVX2()) @@ -1037,21 +1060,21 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first * Entry->Cost; static const CostTblEntry XOPShuffleTbl[] = { - { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd - { TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps - { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd - { TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps - { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm - // + vinsertf128 - { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm - // + vinsertf128 - - { TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm - // + vinsertf128 - { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm - { TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2*vextractf128 + 6*vpperm - // + vinsertf128 - { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm + {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd + {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps + {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd + {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps + {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm + // + vinsertf128 + {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm + // + vinsertf128 + + {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm + // + vinsertf128 + {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm + {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm + // + vinsertf128 + {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm }; if (ST->hasXOP()) @@ -1059,46 +1082,46 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first * Entry->Cost; static const CostTblEntry AVX1ShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd - { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps - { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd - { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps - { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128 - { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128 - - { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd - { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps - { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd - { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps - { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb - // + vinsertf128 - { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb - // + vinsertf128 - - { TTI::SK_Select, MVT::v4i64, 1 }, // vblendpd - { TTI::SK_Select, MVT::v4f64, 1 }, // vblendpd - { TTI::SK_Select, MVT::v8i32, 1 }, // vblendps - { TTI::SK_Select, MVT::v8f32, 1 }, // vblendps - { TTI::SK_Select, MVT::v16i16, 3 }, // vpand + vpandn + vpor - { TTI::SK_Select, MVT::v32i8, 3 }, // vpand + vpandn + vpor - - { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vshufpd - { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vshufpd - { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps - { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps - { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb + {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd + {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps + {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd + {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps + {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 + {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 + + {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd + {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps + {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd + {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps + {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb + // + vinsertf128 + {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb + // + vinsertf128 + + {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd + {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd + {TTI::SK_Select, MVT::v8i32, 1}, // vblendps + {TTI::SK_Select, MVT::v8f32, 1}, // vblendps + {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor + {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor + + {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd + {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd + {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps + {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps + {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb // + 2*por + vinsertf128 - { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb + {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb // + 2*por + vinsertf128 - { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd - { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd - { TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps - { TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps - { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb - // + 4*por + vinsertf128 - { TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2*vextractf128 + 8*pshufb - // + 4*por + vinsertf128 + {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd + {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd + {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps + {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps + {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb + // + 4*por + vinsertf128 + {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb + // + 4*por + vinsertf128 }; if (ST->hasAVX()) @@ -1106,12 +1129,12 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first * Entry->Cost; static const CostTblEntry SSE41ShuffleTbl[] = { - { TTI::SK_Select, MVT::v2i64, 1 }, // pblendw - { TTI::SK_Select, MVT::v2f64, 1 }, // movsd - { TTI::SK_Select, MVT::v4i32, 1 }, // pblendw - { TTI::SK_Select, MVT::v4f32, 1 }, // blendps - { TTI::SK_Select, MVT::v8i16, 1 }, // pblendw - { TTI::SK_Select, MVT::v16i8, 1 } // pblendvb + {TTI::SK_Select, MVT::v2i64, 1}, // pblendw + {TTI::SK_Select, MVT::v2f64, 1}, // movsd + {TTI::SK_Select, MVT::v4i32, 1}, // pblendw + {TTI::SK_Select, MVT::v4f32, 1}, // blendps + {TTI::SK_Select, MVT::v8i16, 1}, // pblendw + {TTI::SK_Select, MVT::v16i8, 1} // pblendvb }; if (ST->hasSSE41()) @@ -1119,20 +1142,20 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first * Entry->Cost; static const CostTblEntry SSSE3ShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb - { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb + {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb + {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb - { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb - { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb + {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb + {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb - { TTI::SK_Select, MVT::v8i16, 3 }, // 2*pshufb + por - { TTI::SK_Select, MVT::v16i8, 3 }, // 2*pshufb + por + {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por + {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por - { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb - { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb + {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb + {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb - { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por - { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por + {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por + {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por }; if (ST->hasSSSE3()) @@ -1140,29 +1163,29 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first * Entry->Cost; static const CostTblEntry SSE2ShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd - { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd - { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd - { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd - { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd - - { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd - { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd - { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd - { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd - { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw - // + 2*pshufd + 2*unpck + packus - - { TTI::SK_Select, MVT::v2i64, 1 }, // movsd - { TTI::SK_Select, MVT::v2f64, 1 }, // movsd - { TTI::SK_Select, MVT::v4i32, 2 }, // 2*shufps - { TTI::SK_Select, MVT::v8i16, 3 }, // pand + pandn + por - { TTI::SK_Select, MVT::v16i8, 3 }, // pand + pandn + por - - { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd - { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd - { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd - { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw + {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd + {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd + {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd + {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd + {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd + + {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd + {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd + {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd + {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd + {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw + // + 2*pshufd + 2*unpck + packus + + {TTI::SK_Select, MVT::v2i64, 1}, // movsd + {TTI::SK_Select, MVT::v2f64, 1}, // movsd + {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps + {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por + {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por + + {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd + {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd + {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd + {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw // + pshufd/unpck { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw // + 2*pshufd + 2*unpck + 2*packus @@ -1201,6 +1224,27 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, // FIXME: Need a better design of the cost table to handle non-simple types of // potential massive combinations (elem_num x src_type x dst_type). + static const TypeConversionCostTblEntry AVX512BWConversionTbl[] { + { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, + + // Mask sign extend has an instruction. + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, + + // Mask zero extend is a load + broadcast. + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, + }; + static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, @@ -1526,43 +1570,51 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); - if (ST->hasDQI()) - if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) - return Entry->Cost; + MVT SimpleSrcTy = SrcTy.getSimpleVT(); + MVT SimpleDstTy = DstTy.getSimpleVT(); - if (ST->hasAVX512()) - if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) - return Entry->Cost; + // Make sure that neither type is going to be split before using the + // AVX512 tables. This handles -mprefer-vector-width=256 + // with -min-legal-vector-width<=256 + if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector && + TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) { + if (ST->hasBWI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD, + SimpleDstTy, SimpleSrcTy)) + return Entry->Cost; + + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, + SimpleDstTy, SimpleSrcTy)) + return Entry->Cost; + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, + SimpleDstTy, SimpleSrcTy)) + return Entry->Cost; + } if (ST->hasAVX2()) { if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) + SimpleDstTy, SimpleSrcTy)) return Entry->Cost; } if (ST->hasAVX()) { if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) + SimpleDstTy, SimpleSrcTy)) return Entry->Cost; } if (ST->hasSSE41()) { if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) + SimpleDstTy, SimpleSrcTy)) return Entry->Cost; } if (ST->hasSSE2()) { if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, - DstTy.getSimpleVT(), - SrcTy.getSimpleVT())) + SimpleDstTy, SimpleSrcTy)) return Entry->Cost; } @@ -1834,7 +1886,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ }; static const CostTblEntry X64CostTbl[] = { // 64-bit targets - { ISD::BITREVERSE, MVT::i64, 14 } + { ISD::BITREVERSE, MVT::i64, 14 } }; static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets { ISD::BITREVERSE, MVT::i32, 14 }, @@ -1866,71 +1918,163 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, break; } - // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, RetTy); - MVT MTy = LT.second; + if (ISD != ISD::DELETED_NODE) { + // Legalize the type. + std::pair LT = TLI->getTypeLegalizationCost(DL, RetTy); + MVT MTy = LT.second; - // Attempt to lookup cost. - if (ST->isGLM()) - if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + // Attempt to lookup cost. + if (ST->isGLM()) + if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasCDI()) - if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasCDI()) + if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasBWI()) - if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasAVX512()) - if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasXOP()) - if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasXOP()) + if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasAVX2()) - if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) - if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSSE3()) - if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE1()) - if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->is64Bit()) - if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->is64Bit()) + if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + } return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed); } int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef Args, FastMathFlags FMF, unsigned VF) { + ArrayRef Args, FastMathFlags FMF, + unsigned VF) { + static const CostTblEntry AVX512CostTbl[] = { + { ISD::ROTL, MVT::v8i64, 1 }, + { ISD::ROTL, MVT::v4i64, 1 }, + { ISD::ROTL, MVT::v2i64, 1 }, + { ISD::ROTL, MVT::v16i32, 1 }, + { ISD::ROTL, MVT::v8i32, 1 }, + { ISD::ROTL, MVT::v4i32, 1 }, + { ISD::ROTR, MVT::v8i64, 1 }, + { ISD::ROTR, MVT::v4i64, 1 }, + { ISD::ROTR, MVT::v2i64, 1 }, + { ISD::ROTR, MVT::v16i32, 1 }, + { ISD::ROTR, MVT::v8i32, 1 }, + { ISD::ROTR, MVT::v4i32, 1 } + }; + // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) + static const CostTblEntry XOPCostTbl[] = { + { ISD::ROTL, MVT::v4i64, 4 }, + { ISD::ROTL, MVT::v8i32, 4 }, + { ISD::ROTL, MVT::v16i16, 4 }, + { ISD::ROTL, MVT::v32i8, 4 }, + { ISD::ROTL, MVT::v2i64, 1 }, + { ISD::ROTL, MVT::v4i32, 1 }, + { ISD::ROTL, MVT::v8i16, 1 }, + { ISD::ROTL, MVT::v16i8, 1 }, + { ISD::ROTR, MVT::v4i64, 6 }, + { ISD::ROTR, MVT::v8i32, 6 }, + { ISD::ROTR, MVT::v16i16, 6 }, + { ISD::ROTR, MVT::v32i8, 6 }, + { ISD::ROTR, MVT::v2i64, 2 }, + { ISD::ROTR, MVT::v4i32, 2 }, + { ISD::ROTR, MVT::v8i16, 2 }, + { ISD::ROTR, MVT::v16i8, 2 } + }; + static const CostTblEntry X64CostTbl[] = { // 64-bit targets + { ISD::ROTL, MVT::i64, 1 }, + { ISD::ROTR, MVT::i64, 1 }, + { ISD::FSHL, MVT::i64, 4 } + }; + static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets + { ISD::ROTL, MVT::i32, 1 }, + { ISD::ROTL, MVT::i16, 1 }, + { ISD::ROTL, MVT::i8, 1 }, + { ISD::ROTR, MVT::i32, 1 }, + { ISD::ROTR, MVT::i16, 1 }, + { ISD::ROTR, MVT::i8, 1 }, + { ISD::FSHL, MVT::i32, 4 }, + { ISD::FSHL, MVT::i16, 4 }, + { ISD::FSHL, MVT::i8, 4 } + }; + + unsigned ISD = ISD::DELETED_NODE; + switch (IID) { + default: + break; + case Intrinsic::fshl: + ISD = ISD::FSHL; + if (Args[0] == Args[1]) + ISD = ISD::ROTL; + break; + case Intrinsic::fshr: + // FSHR has same costs so don't duplicate. + ISD = ISD::FSHL; + if (Args[0] == Args[1]) + ISD = ISD::ROTR; + break; + } + + if (ISD != ISD::DELETED_NODE) { + // Legalize the type. + std::pair LT = TLI->getTypeLegalizationCost(DL, RetTy); + MVT MTy = LT.second; + + // Attempt to lookup cost. + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasXOP()) + if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->is64Bit()) + if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + } + return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF); } diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index 38925bfd51b0273a3be194aa8830d53112843c5b..2aa9932e2465a51e1a15eed7ea68c55404b02088 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -31,7 +31,8 @@ static Reloc::Model getEffectiveRelocModel(Optional RM) { return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional CM) { +static CodeModel::Model +getEffectiveXCoreCodeModel(Optional CM) { if (CM) { if (*CM != CodeModel::Small && *CM != CodeModel::Large) report_fatal_error("Target only supports CodeModel Small or Large"); @@ -51,7 +52,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine( T, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32", TT, CPU, FS, Options, getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM), OL), + getEffectiveXCoreCodeModel(CM), OL), TLOF(llvm::make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); diff --git a/lib/TextAPI/CMakeLists.txt b/lib/TextAPI/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7912e91fbf1c69c5441767f11545b8e59ddfa313 --- /dev/null +++ b/lib/TextAPI/CMakeLists.txt @@ -0,0 +1,7 @@ +add_llvm_library(LLVMTextAPI + ELF/ELFStub.cpp + ELF/TBEHandler.cpp + + ADDITIONAL_HEADER_DIRS + "${LLVM_MAIN_INCLUDE_DIR}/llvm/TextAPI" +) diff --git a/lib/TextAPI/ELF/ELFStub.cpp b/lib/TextAPI/ELF/ELFStub.cpp new file mode 100644 index 0000000000000000000000000000000000000000..248a078a2404dbb1b0082421f315d1456f708ccc --- /dev/null +++ b/lib/TextAPI/ELF/ELFStub.cpp @@ -0,0 +1,29 @@ +//===- ELFStub.cpp --------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===-----------------------------------------------------------------------===/ + +#include "llvm/TextAPI/ELF/ELFStub.h" + +using namespace llvm; +using namespace llvm::elfabi; + +ELFStub::ELFStub(ELFStub const &Stub) { + TbeVersion = Stub.TbeVersion; + Arch = Stub.Arch; + SoName = Stub.SoName; + NeededLibs = Stub.NeededLibs; + Symbols = Stub.Symbols; +} + +ELFStub::ELFStub(ELFStub &&Stub) { + TbeVersion = std::move(Stub.TbeVersion); + Arch = std::move(Stub.Arch); + SoName = std::move(Stub.SoName); + NeededLibs = std::move(Stub.NeededLibs); + Symbols = std::move(Stub.Symbols); +} diff --git a/lib/TextAPI/ELF/TBEHandler.cpp b/lib/TextAPI/ELF/TBEHandler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dde980292a167623ee52fd15fbee0370b74648cf --- /dev/null +++ b/lib/TextAPI/ELF/TBEHandler.cpp @@ -0,0 +1,160 @@ +//===- TBEHandler.cpp -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===-----------------------------------------------------------------------===/ + +#include "llvm/TextAPI/ELF/TBEHandler.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/TextAPI/ELF/ELFStub.h" + +using namespace llvm; +using namespace llvm::elfabi; + +LLVM_YAML_STRONG_TYPEDEF(ELFArch, ELFArchMapper) + +namespace llvm { +namespace yaml { + +/// YAML traits for ELFSymbolType. +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &IO, ELFSymbolType &SymbolType) { + IO.enumCase(SymbolType, "NoType", ELFSymbolType::NoType); + IO.enumCase(SymbolType, "Func", ELFSymbolType::Func); + IO.enumCase(SymbolType, "Object", ELFSymbolType::Object); + IO.enumCase(SymbolType, "TLS", ELFSymbolType::TLS); + IO.enumCase(SymbolType, "Unknown", ELFSymbolType::Unknown); + // Treat other symbol types as noise, and map to Unknown. + if (!IO.outputting() && IO.matchEnumFallback()) + SymbolType = ELFSymbolType::Unknown; + } +}; + +/// YAML traits for ELFArch. +template <> struct ScalarTraits { + static void output(const ELFArchMapper &Value, void *, + llvm::raw_ostream &Out) { + // Map from integer to architecture string. + switch (Value) { + case (ELFArch)ELF::EM_X86_64: + Out << "x86_64"; + break; + case (ELFArch)ELF::EM_AARCH64: + Out << "AArch64"; + break; + case (ELFArch)ELF::EM_NONE: + default: + Out << "Unknown"; + } + } + + static StringRef input(StringRef Scalar, void *, ELFArchMapper &Value) { + // Map from architecture string to integer. + Value = StringSwitch(Scalar) + .Case("x86_64", ELF::EM_X86_64) + .Case("AArch64", ELF::EM_AARCH64) + .Case("Unknown", ELF::EM_NONE) + .Default(ELF::EM_NONE); + + // Returning empty StringRef indicates successful parse. + return StringRef(); + } + + // Don't place quotation marks around architecture value. + static QuotingType mustQuote(StringRef) { return QuotingType::None; } +}; + +/// YAML traits for TbeVersion. +template <> struct ScalarTraits { + static void output(const VersionTuple &Value, void *, + llvm::raw_ostream &Out) { + Out << Value.getAsString(); + } + + static StringRef input(StringRef Scalar, void *, VersionTuple &Value) { + if (Value.tryParse(Scalar)) + return StringRef("Can't parse version: invalid version format."); + + if (Value > TBEVersionCurrent) + return StringRef("Unsupported TBE version."); + + // Returning empty StringRef indicates successful parse. + return StringRef(); + } + + // Don't place quotation marks around version value. + static QuotingType mustQuote(StringRef) { return QuotingType::None; } +}; + +/// YAML traits for ELFSymbol. +template <> struct MappingTraits { + static void mapping(IO &IO, ELFSymbol &Symbol) { + IO.mapRequired("Type", Symbol.Type); + // The need for symbol size depends on the symbol type. + if (Symbol.Type == ELFSymbolType::NoType) { + IO.mapOptional("Size", Symbol.Size, (uint64_t)0); + } else if (Symbol.Type == ELFSymbolType::Func) { + Symbol.Size = 0; + } else { + IO.mapRequired("Size", Symbol.Size); + } + IO.mapOptional("Undefined", Symbol.Undefined, false); + IO.mapOptional("Warning", Symbol.Warning); + } + + // Compacts symbol information into a single line. + static const bool flow = true; +}; + +/// YAML traits for set of ELFSymbols. +template <> struct CustomMappingTraits> { + static void inputOne(IO &IO, StringRef Key, std::set &Set) { + ELFSymbol Sym(Key.str()); + IO.mapRequired(Key.str().c_str(), Sym); + Set.insert(Sym); + } + + static void output(IO &IO, std::set &Set) { + for (auto &Sym : Set) + IO.mapRequired(Sym.Name.c_str(), const_cast(Sym)); + } +}; + +/// YAML traits for ELFStub objects. +template <> struct MappingTraits { + static void mapping(IO &IO, ELFStub &Stub) { + if (!IO.mapTag("!tapi-tbe", true)) + IO.setError("Not a .tbe YAML file."); + IO.mapRequired("TbeVersion", Stub.TbeVersion); + IO.mapOptional("SoName", Stub.SoName); + IO.mapRequired("Arch", (ELFArchMapper &)Stub.Arch); + IO.mapOptional("NeededLibs", Stub.NeededLibs); + IO.mapRequired("Symbols", Stub.Symbols); + } +}; + +} // end namespace yaml +} // end namespace llvm + +Expected> elfabi::readTBEFromBuffer(StringRef Buf) { + yaml::Input YamlIn(Buf); + std::unique_ptr Stub(new ELFStub()); + YamlIn >> *Stub; + if (std::error_code Err = YamlIn.error()) + return createStringError(Err, "YAML failed reading as TBE"); + + return std::move(Stub); +} + +Error elfabi::writeTBEToOutputStream(raw_ostream &OS, const ELFStub &Stub) { + yaml::Output YamlOut(OS, NULL, /*WrapColumn =*/0); + + YamlOut << const_cast(Stub); + return Error::success(); +} diff --git a/lib/TextAPI/LLVMBuild.txt b/lib/TextAPI/LLVMBuild.txt new file mode 100644 index 0000000000000000000000000000000000000000..83733a72f506b93729748ddca4696be291deb881 --- /dev/null +++ b/lib/TextAPI/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/TextAPI/LLVMBuild.txt ------------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = TextAPI +parent = Libraries +required_libraries = Support BinaryFormat diff --git a/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 7560060b7604de4e8d04f6431783b4feb96d7417..a9442255e34c1432818bbbc2b16bd932431f8faa 100644 --- a/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -59,6 +59,99 @@ public: }; } // namespace +/// Match a pattern for a bitwise rotate operation that partially guards +/// against undefined behavior by branching around the rotation when the shift +/// amount is 0. +static bool foldGuardedRotateToFunnelShift(Instruction &I) { + if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2) + return false; + + // As with the one-use checks below, this is not strictly necessary, but we + // are being cautious to avoid potential perf regressions on targets that + // do not actually have a rotate instruction (where the funnel shift would be + // expanded back into math/shift/logic ops). + if (!isPowerOf2_32(I.getType()->getScalarSizeInBits())) + return false; + + // Match V to funnel shift left/right and capture the source operand and + // shift amount in X and Y. + auto matchRotate = [](Value *V, Value *&X, Value *&Y) { + Value *L0, *L1, *R0, *R1; + unsigned Width = V->getType()->getScalarSizeInBits(); + auto Sub = m_Sub(m_SpecificInt(Width), m_Value(R1)); + + // rotate_left(X, Y) == (X << Y) | (X >> (Width - Y)) + auto RotL = m_OneUse(m_c_Or(m_Shl(m_Value(L0), m_Value(L1)), + m_LShr(m_Value(R0), Sub))); + if (RotL.match(V) && L0 == R0 && L1 == R1) { + X = L0; + Y = L1; + return Intrinsic::fshl; + } + + // rotate_right(X, Y) == (X >> Y) | (X << (Width - Y)) + auto RotR = m_OneUse(m_c_Or(m_LShr(m_Value(L0), m_Value(L1)), + m_Shl(m_Value(R0), Sub))); + if (RotR.match(V) && L0 == R0 && L1 == R1) { + X = L0; + Y = L1; + return Intrinsic::fshr; + } + + return Intrinsic::not_intrinsic; + }; + + // One phi operand must be a rotate operation, and the other phi operand must + // be the source value of that rotate operation: + // phi [ rotate(RotSrc, RotAmt), RotBB ], [ RotSrc, GuardBB ] + PHINode &Phi = cast(I); + Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1); + Value *RotSrc, *RotAmt; + Intrinsic::ID IID = matchRotate(P0, RotSrc, RotAmt); + if (IID == Intrinsic::not_intrinsic || RotSrc != P1) { + IID = matchRotate(P1, RotSrc, RotAmt); + if (IID == Intrinsic::not_intrinsic || RotSrc != P0) + return false; + assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) && + "Pattern must match funnel shift left or right"); + } + + // The incoming block with our source operand must be the "guard" block. + // That must contain a cmp+branch to avoid the rotate when the shift amount + // is equal to 0. The other incoming block is the block with the rotate. + BasicBlock *GuardBB = Phi.getIncomingBlock(RotSrc == P1); + BasicBlock *RotBB = Phi.getIncomingBlock(RotSrc != P1); + Instruction *TermI = GuardBB->getTerminator(); + BasicBlock *TrueBB, *FalseBB; + ICmpInst::Predicate Pred; + if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(RotAmt), m_ZeroInt()), + TrueBB, FalseBB))) + return false; + + BasicBlock *PhiBB = Phi.getParent(); + if (Pred != CmpInst::ICMP_EQ || TrueBB != PhiBB || FalseBB != RotBB) + return false; + + // We matched a variation of this IR pattern: + // GuardBB: + // %cmp = icmp eq i32 %RotAmt, 0 + // br i1 %cmp, label %PhiBB, label %RotBB + // RotBB: + // %sub = sub i32 32, %RotAmt + // %shr = lshr i32 %X, %sub + // %shl = shl i32 %X, %RotAmt + // %rot = or i32 %shr, %shl + // br label %PhiBB + // PhiBB: + // %cond = phi i32 [ %rot, %RotBB ], [ %X, %GuardBB ] + // --> + // llvm.fshl.i32(i32 %X, i32 %RotAmt) + IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt()); + Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType()); + Phi.replaceAllUsesWith(Builder.CreateCall(F, {RotSrc, RotSrc, RotAmt})); + return true; +} + /// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and /// the bit indexes (Mask) needed by a masked compare. If we're matching a chain /// of 'and' ops, then we also need to capture the fact that we saw an @@ -174,8 +267,10 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT) { // Also, we want to avoid matching partial patterns. // TODO: It would be more efficient if we removed dead instructions // iteratively in this loop rather than waiting until the end. - for (Instruction &I : make_range(BB.rbegin(), BB.rend())) + for (Instruction &I : make_range(BB.rbegin(), BB.rend())) { MadeChange |= foldAnyOrAllBitsSet(I); + MadeChange |= foldGuardedRotateToFunnelShift(I); + } } // We're done with transforms, so remove dead instructions. diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp index 2462ae4abd3d914c7800992b9be43491c2a10c04..9eeceb217ba8bc03e353b8f9971eb24e9681e41d 100644 --- a/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/lib/Transforms/Coroutines/CoroSplit.cpp @@ -538,43 +538,92 @@ static void handleNoSuspendCoroutine(CoroBeginInst *CoroBegin, Type *FrameTy) { CoroBegin->eraseFromParent(); } -// look for a very simple pattern -// coro.save -// no other calls -// resume or destroy call -// coro.suspend -// -// If there are other calls between coro.save and coro.suspend, they can -// potentially resume or destroy the coroutine, so it is unsafe to eliminate a -// suspend point. -static bool simplifySuspendPoint(CoroSuspendInst *Suspend, - CoroBeginInst *CoroBegin) { - auto *Save = Suspend->getCoroSave(); - auto *BB = Suspend->getParent(); - if (BB != Save->getParent()) - return false; +// SimplifySuspendPoint needs to check that there is no calls between +// coro_save and coro_suspend, since any of the calls may potentially resume +// the coroutine and if that is the case we cannot eliminate the suspend point. +static bool hasCallsInBlockBetween(Instruction *From, Instruction *To) { + for (Instruction *I = From; I != To; I = I->getNextNode()) { + // Assume that no intrinsic can resume the coroutine. + if (isa(I)) + continue; - CallSite SingleCallSite; + if (CallSite(I)) + return true; + } + return false; +} - // Check that we have only one CallSite. - for (Instruction *I = Save->getNextNode(); I != Suspend; - I = I->getNextNode()) { - if (isa(I)) - continue; - if (isa(I)) - continue; - if (CallSite CS = CallSite(I)) { - if (SingleCallSite) - return false; - else - SingleCallSite = CS; - } +static bool hasCallsInBlocksBetween(BasicBlock *SaveBB, BasicBlock *ResDesBB) { + SmallPtrSet Set; + SmallVector Worklist; + + Set.insert(SaveBB); + Worklist.push_back(ResDesBB); + + // Accumulate all blocks between SaveBB and ResDesBB. Because CoroSaveIntr + // returns a token consumed by suspend instruction, all blocks in between + // will have to eventually hit SaveBB when going backwards from ResDesBB. + while (!Worklist.empty()) { + auto *BB = Worklist.pop_back_val(); + Set.insert(BB); + for (auto *Pred : predecessors(BB)) + if (Set.count(Pred) == 0) + Worklist.push_back(Pred); } - auto *CallInstr = SingleCallSite.getInstruction(); - if (!CallInstr) + + // SaveBB and ResDesBB are checked separately in hasCallsBetween. + Set.erase(SaveBB); + Set.erase(ResDesBB); + + for (auto *BB : Set) + if (hasCallsInBlockBetween(BB->getFirstNonPHI(), nullptr)) + return true; + + return false; +} + +static bool hasCallsBetween(Instruction *Save, Instruction *ResumeOrDestroy) { + auto *SaveBB = Save->getParent(); + auto *ResumeOrDestroyBB = ResumeOrDestroy->getParent(); + + if (SaveBB == ResumeOrDestroyBB) + return hasCallsInBlockBetween(Save->getNextNode(), ResumeOrDestroy); + + // Any calls from Save to the end of the block? + if (hasCallsInBlockBetween(Save->getNextNode(), nullptr)) + return true; + + // Any calls from begging of the block up to ResumeOrDestroy? + if (hasCallsInBlockBetween(ResumeOrDestroyBB->getFirstNonPHI(), + ResumeOrDestroy)) + return true; + + // Any calls in all of the blocks between SaveBB and ResumeOrDestroyBB? + if (hasCallsInBlocksBetween(SaveBB, ResumeOrDestroyBB)) + return true; + + return false; +} + +// If a SuspendIntrin is preceded by Resume or Destroy, we can eliminate the +// suspend point and replace it with nornal control flow. +static bool simplifySuspendPoint(CoroSuspendInst *Suspend, + CoroBeginInst *CoroBegin) { + Instruction *Prev = Suspend->getPrevNode(); + if (!Prev) { + auto *Pred = Suspend->getParent()->getSinglePredecessor(); + if (!Pred) + return false; + Prev = Pred->getTerminator(); + } + + CallSite CS{Prev}; + if (!CS) return false; - auto *Callee = SingleCallSite.getCalledValue()->stripPointerCasts(); + auto *CallInstr = CS.getInstruction(); + + auto *Callee = CS.getCalledValue()->stripPointerCasts(); // See if the callsite is for resumption or destruction of the coroutine. auto *SubFn = dyn_cast(Callee); @@ -585,6 +634,13 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend, if (SubFn->getFrame() != CoroBegin) return false; + // See if the transformation is safe. Specifically, see if there are any + // calls in between Save and CallInstr. They can potenitally resume the + // coroutine rendering this optimization unsafe. + auto *Save = Suspend->getCoroSave(); + if (hasCallsBetween(Save, CallInstr)) + return false; + // Replace llvm.coro.suspend with the value that results in resumption over // the resume or cleanup path. Suspend->replaceAllUsesWith(SubFn->getRawIndex()); @@ -592,8 +648,20 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend, Save->eraseFromParent(); // No longer need a call to coro.resume or coro.destroy. + if (auto *Invoke = dyn_cast(CallInstr)) { + BranchInst::Create(Invoke->getNormalDest(), Invoke); + } + + // Grab the CalledValue from CS before erasing the CallInstr. + auto *CalledValue = CS.getCalledValue(); CallInstr->eraseFromParent(); + // If no more users remove it. Usually it is a bitcast of SubFn. + if (CalledValue != SubFn && CalledValue->user_empty()) + if (auto *I = dyn_cast(CalledValue)) + I->eraseFromParent(); + + // Now we are good to remove SubFn. if (SubFn->user_empty()) SubFn->eraseFromParent(); diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index f2c2b55b1c5b82c528a0780c14eda7197427eeea..517a9c082a4cf6f4fe46c79a7be73ca77d4a0a75 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -213,7 +213,8 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg()); // Create the new function body and insert it into the module. - Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); + Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(), + F->getName()); NF->copyAttributesFrom(F); // Patch the pointer to LLVM function in debug info descriptor. diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 0cf238b5d6065943eb7b76ea5943cccdb86934d3..cb30e8f46a54dfc92f38f955caccaabf288c8abf 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -24,7 +24,6 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" -#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -165,7 +164,7 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { unsigned NumArgs = Params.size(); // Create the new function body and insert it into the module... - Function *NF = Function::Create(NFTy, Fn.getLinkage()); + Function *NF = Function::Create(NFTy, Fn.getLinkage(), Fn.getAddressSpace()); NF->copyAttributesFrom(&Fn); NF->setComdat(Fn.getComdat()); Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF); @@ -864,7 +863,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { return false; // Create the new function body and insert it into the module... - Function *NF = Function::Create(NFTy, F->getLinkage()); + Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace()); NF->copyAttributesFrom(F); NF->setComdat(F->getComdat()); NF->setAttributes(NewPAL); @@ -954,16 +953,16 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { ArgAttrVec.clear(); Instruction *New = NewCS.getInstruction(); - if (!Call->use_empty()) { + if (!Call->use_empty() || Call->isUsedByMetadata()) { if (New->getType() == Call->getType()) { // Return type not changed? Just replace users then. Call->replaceAllUsesWith(New); New->takeName(Call); } else if (New->getType()->isVoidTy()) { - // Our return value has uses, but they will get removed later on. - // Replace by null for now. + // If the return value is dead, replace any uses of it with undef + // (any non-debug value uses will get removed later on). if (!Call->getType()->isX86_MMXTy()) - Call->replaceAllUsesWith(Constant::getNullValue(Call->getType())); + Call->replaceAllUsesWith(UndefValue::get(Call->getType())); } else { assert((RetTy->isStructTy() || RetTy->isArrayTy()) && "Return type changed, but not into a void. The old return type" @@ -1023,10 +1022,10 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { I2->takeName(&*I); ++I2; } else { - // If this argument is dead, replace any uses of it with null constants - // (these are guaranteed to become unused later on). + // If this argument is dead, replace any uses of it with undef + // (any non-debug value uses will get removed later on). if (!I->getType()->isX86_MMXTy()) - I->replaceAllUsesWith(Constant::getNullValue(I->getType())); + I->replaceAllUsesWith(UndefValue::get(I->getType())); } // If we change the return value of the function we must rewrite any return diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index d45a8832391095f3b5f9584fdf428cd2830a6a57..a744d7f2d2d9597202c32c98183d8b80ed845827 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -135,6 +135,7 @@ namespace { llvm::Value *Declaration; if (FunctionType *FTy = dyn_cast(Ty)) { Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage, + CurI->getAddressSpace(), CurI->getName(), &M); } else { diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index f01c6a4e99bf10be8c9d7eaa63be70a384d73fc4..3a04f7a00d413b65b8ca8af08a6e246cdbb719da 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -41,6 +41,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" @@ -1307,13 +1308,14 @@ static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) { // If all of the calls in F are identifiable and are to norecurse functions, F // is norecurse. This check also detects self-recursion as F is not currently // marked norecurse, so any called from F to F will not be marked norecurse. - for (Instruction &I : instructions(*F)) - if (auto CS = CallSite(&I)) { - Function *Callee = CS.getCalledFunction(); - if (!Callee || Callee == F || !Callee->doesNotRecurse()) - // Function calls a potentially recursive function. - return false; - } + for (auto &BB : *F) + for (auto &I : BB.instructionsWithoutDebug()) + if (auto CS = CallSite(&I)) { + Function *Callee = CS.getCalledFunction(); + if (!Callee || Callee == F || !Callee->doesNotRecurse()) + // Function calls a potentially recursive function. + return false; + } // Every call was to a non-recursive function other than this function, and // we have no indirect recursion as the SCC size is one. This function cannot diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index 31531beea5e78d76a95d856b0119d123d301a8d8..531a7c19e360075466d83a3dbd7d030ff432cb46 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -293,11 +293,21 @@ static void computeImportForReferencedGlobals( LLVM_DEBUG(dbgs() << " ref -> " << VI << "\n"); + // If this is a local variable, make sure we import the copy + // in the caller's module. The only time a local variable can + // share an entry in the index is if there is a local with the same name + // in another module that had the same source file name (in a different + // directory), where each was compiled in their own directory so there + // was not distinguishing path. + auto LocalNotInModule = [&](const GlobalValueSummary *RefSummary) -> bool { + return GlobalValue::isLocalLinkage(RefSummary->linkage()) && + RefSummary->modulePath() != Summary.modulePath(); + }; + for (auto &RefSummary : VI.getSummaryList()) - if (RefSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind && - !RefSummary->notEligibleToImport() && - !GlobalValue::isInterposableLinkage(RefSummary->linkage()) && - RefSummary->refs().empty()) { + if (isa(RefSummary.get()) && + canImportGlobalVar(RefSummary.get()) && + !LocalNotInModule(RefSummary.get())) { auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID()); // Only update stat if we haven't already imported this variable. if (ILI.second) @@ -824,6 +834,25 @@ void llvm::computeDeadSymbols( NumLiveSymbols += LiveSymbols; } +// Compute dead symbols and propagate constants in combined index. +void llvm::computeDeadSymbolsWithConstProp( + ModuleSummaryIndex &Index, + const DenseSet &GUIDPreservedSymbols, + function_ref isPrevailing, + bool ImportEnabled) { + computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing); + if (ImportEnabled) { + Index.propagateConstants(GUIDPreservedSymbols); + } else { + // If import is disabled we should drop read-only attribute + // from all summaries to prevent internalization. + for (auto &P : Index) + for (auto &S : P.second.SummaryList) + if (auto *GVS = dyn_cast(S.get())) + GVS->setReadOnly(false); + } +} + /// Compute the set of summaries needed for a ThinLTO backend compilation of /// \p ModulePath. void llvm::gatherImportedSummariesForModule( @@ -882,7 +911,8 @@ bool llvm::convertToDeclaration(GlobalValue &GV) { if (GV.getValueType()->isFunctionTy()) NewGV = Function::Create(cast(GV.getValueType()), - GlobalValue::ExternalLinkage, "", GV.getParent()); + GlobalValue::ExternalLinkage, GV.getAddressSpace(), + "", GV.getParent()); else NewGV = new GlobalVariable(*GV.getParent(), GV.getValueType(), @@ -897,8 +927,8 @@ bool llvm::convertToDeclaration(GlobalValue &GV) { return true; } -/// Fixup WeakForLinker linkages in \p TheModule based on summary analysis. -void llvm::thinLTOResolveWeakForLinkerModule( +/// Fixup prevailing symbol linkages in \p TheModule based on summary analysis. +void llvm::thinLTOResolvePrevailingInModule( Module &TheModule, const GVSummaryMapTy &DefinedGlobals) { auto updateLinkage = [&](GlobalValue &GV) { // See if the global summary analysis computed a new resolved linkage. @@ -915,13 +945,15 @@ void llvm::thinLTOResolveWeakForLinkerModule( // as we need access to the resolution vectors for each input file in // order to find which symbols have been redefined. // We may consider reorganizing this code and moving the linkage recording - // somewhere else, e.g. in thinLTOResolveWeakForLinkerInIndex. + // somewhere else, e.g. in thinLTOResolvePrevailingInIndex. if (NewLinkage == GlobalValue::WeakAnyLinkage) { GV.setLinkage(NewLinkage); return; } - if (!GlobalValue::isWeakForLinker(GV.getLinkage())) + if (GlobalValue::isLocalLinkage(GV.getLinkage()) || + // In case it was dead and already converted to declaration. + GV.isDeclaration()) return; // Check for a non-prevailing def that has interposable linkage // (e.g. non-odr weak or linkonce). In that case we can't simply @@ -932,7 +964,7 @@ void llvm::thinLTOResolveWeakForLinkerModule( GlobalValue::isInterposableLinkage(GV.getLinkage())) { if (!convertToDeclaration(GV)) // FIXME: Change this to collect replaced GVs and later erase - // them from the parent module once thinLTOResolveWeakForLinkerGUID is + // them from the parent module once thinLTOResolvePrevailingGUID is // changed to enable this for aliases. llvm_unreachable("Expected GV to be converted"); } else { @@ -1018,6 +1050,18 @@ static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) { return NewFn; } +// Internalize values that we marked with specific attribute +// in processGlobalForThinLTO. +static void internalizeImmutableGVs(Module &M) { + for (auto &GV : M.globals()) + // Skip GVs which have been converted to declarations + // by dropDeadSymbols. + if (!GV.isDeclaration() && GV.hasAttribute("thinlto-internalize")) { + GV.setLinkage(GlobalValue::InternalLinkage); + GV.setVisibility(GlobalValue::DefaultVisibility); + } +} + // Automatically import functions in Module \p DestModule based on the summaries // index. Expected FunctionImporter::importFunctions( @@ -1141,6 +1185,8 @@ Expected FunctionImporter::importFunctions( NumImportedModules++; } + internalizeImmutableGVs(DestModule); + NumImportedFunctions += (ImportedCount - ImportedGVCount); NumImportedGlobalVars += ImportedGVCount; diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index c77ea04102c7a69ac5e96a4c78deece37040d45c..34de87433367c8c576dde9e853efebc4ca1b1b77 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" @@ -75,13 +76,17 @@ ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCELegacyPass(); } -/// Returns true if F contains only a single "ret" instruction. +/// Returns true if F is effectively empty. static bool isEmptyFunction(Function *F) { BasicBlock &Entry = F->getEntryBlock(); - if (Entry.size() != 1 || !isa(Entry.front())) - return false; - ReturnInst &RI = cast(Entry.front()); - return RI.getReturnValue() == nullptr; + for (auto &I : Entry) { + if (isa(I)) + continue; + if (auto *RI = dyn_cast(&I)) + return !RI->getReturnValue(); + break; + } + return false; } /// Compute the set of GlobalValue that depends from V. diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp index 621ac7dc8ab80ed1524e2d433ace813bc4c8bbd9..5d989a40f65f2058485aac8d98b10ed3ab34cc8f 100644 --- a/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/lib/Transforms/IPO/HotColdSplitting.cpp @@ -13,6 +13,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -25,6 +26,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" @@ -97,49 +99,43 @@ bool blockEndsInUnreachable(const BasicBlock &BB) { return !(isa(I) || isa(I)); } -static bool exceptionHandlingFunctions(const CallInst *CI) { - auto F = CI->getCalledFunction(); - if (!F) - return false; - auto FName = F->getName(); - return FName == "__cxa_begin_catch" || - FName == "__cxa_free_exception" || - FName == "__cxa_allocate_exception" || - FName == "__cxa_begin_catch" || - FName == "__cxa_end_catch"; -} - -static bool unlikelyExecuted(const BasicBlock &BB) { - if (blockEndsInUnreachable(BB)) - return true; +bool unlikelyExecuted(BasicBlock &BB) { // Exception handling blocks are unlikely executed. if (BB.isEHPad()) return true; - for (const Instruction &I : BB) - if (const CallInst *CI = dyn_cast(&I)) { - // The block is cold if it calls functions tagged as cold or noreturn. - if (CI->hasFnAttr(Attribute::Cold) || - CI->hasFnAttr(Attribute::NoReturn) || - exceptionHandlingFunctions(CI)) + + // The block is cold if it calls/invokes a cold function. + for (Instruction &I : BB) + if (auto CS = CallSite(&I)) + if (CS.hasFnAttr(Attribute::Cold)) return true; - // Assume that inline assembly is hot code. - if (isa(CI->getCalledValue())) + // The block is cold if it has an unreachable terminator, unless it's + // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp). + if (blockEndsInUnreachable(BB)) { + if (auto *CI = + dyn_cast_or_null(BB.getTerminator()->getPrevNode())) + if (CI->hasFnAttr(Attribute::NoReturn)) return false; - } + return true; + } + return false; } /// Check whether it's safe to outline \p BB. static bool mayExtractBlock(const BasicBlock &BB) { - return !BB.hasAddressTaken(); + return !BB.hasAddressTaken() && !BB.isEHPad(); } -/// Check whether \p BB is profitable to outline (i.e. its code size cost meets -/// the threshold set in \p MinOutliningThreshold). -static bool isProfitableToOutline(const BasicBlock &BB, +/// Check whether \p Region is profitable to outline. +static bool isProfitableToOutline(const BlockSequence &Region, TargetTransformInfo &TTI) { + if (Region.size() > 1) + return true; + int Cost = 0; + const BasicBlock &BB = *Region[0]; for (const Instruction &I : BB) { if (isa(&I) || &I == BB.getTerminator()) continue; @@ -152,151 +148,16 @@ static bool isProfitableToOutline(const BasicBlock &BB, return false; } -/// Identify the maximal region of cold blocks which includes \p SinkBB. -/// -/// Include all blocks post-dominated by \p SinkBB, \p SinkBB itself, and all -/// blocks dominated by \p SinkBB. Exclude all other blocks, and blocks which -/// cannot be outlined. -/// -/// Return an empty sequence if the cold region is too small to outline, or if -/// the cold region has no warm predecessors. -static BlockSequence findMaximalColdRegion(BasicBlock &SinkBB, - TargetTransformInfo &TTI, - DominatorTree &DT, - PostDomTree &PDT) { - // The maximal cold region. - BlockSequence ColdRegion = {}; - - // The ancestor farthest-away from SinkBB, and also post-dominated by it. - BasicBlock *MaxAncestor = &SinkBB; - unsigned MaxAncestorHeight = 0; - - // Visit SinkBB's ancestors using inverse DFS. - auto PredIt = ++idf_begin(&SinkBB); - auto PredEnd = idf_end(&SinkBB); - while (PredIt != PredEnd) { - BasicBlock &PredBB = **PredIt; - bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB); - - // If SinkBB does not post-dominate a predecessor, do not mark the - // predecessor (or any of its predecessors) cold. - if (!SinkPostDom || !mayExtractBlock(PredBB)) { - PredIt.skipChildren(); - continue; - } - - // Keep track of the post-dominated ancestor farthest away from the sink. - unsigned AncestorHeight = PredIt.getPathLength(); - if (AncestorHeight > MaxAncestorHeight) { - MaxAncestor = &PredBB; - MaxAncestorHeight = AncestorHeight; - } - - ColdRegion.push_back(&PredBB); - ++PredIt; - } - - // CodeExtractor requires that all blocks to be extracted must be dominated - // by the first block to be extracted. - // - // To avoid spurious or repeated outlining, require that the max ancestor - // has a predecessor. By construction this predecessor is not in the cold - // region, i.e. its existence implies we don't outline the whole function. - // - // TODO: If MaxAncestor has no predecessors, we may be able to outline the - // second largest cold region that has a predecessor. - if (pred_empty(MaxAncestor) || - MaxAncestor->getSinglePredecessor() == MaxAncestor) - return {}; - - // Filter out predecessors not dominated by the max ancestor. - // - // TODO: Blocks not dominated by the max ancestor could be extracted as - // other cold regions. Marking outlined calls as noreturn when appropriate - // and outlining more than once per function could achieve most of the win. - auto EraseIt = remove_if(ColdRegion, [&](BasicBlock *PredBB) { - return PredBB != MaxAncestor && !DT.dominates(MaxAncestor, PredBB); - }); - ColdRegion.erase(EraseIt, ColdRegion.end()); - - // Add SinkBB to the cold region. - ColdRegion.push_back(&SinkBB); - - // Ensure that the first extracted block is the max ancestor. - if (ColdRegion[0] != MaxAncestor) { - auto AncestorIt = find(ColdRegion, MaxAncestor); - *AncestorIt = ColdRegion[0]; - ColdRegion[0] = MaxAncestor; - } - - // Find all successors of SinkBB dominated by SinkBB using DFS. - auto SuccIt = ++df_begin(&SinkBB); - auto SuccEnd = df_end(&SinkBB); - while (SuccIt != SuccEnd) { - BasicBlock &SuccBB = **SuccIt; - bool SinkDom = DT.dominates(&SinkBB, &SuccBB); - - // If SinkBB does not dominate a successor, do not mark the successor (or - // any of its successors) cold. - if (!SinkDom || !mayExtractBlock(SuccBB)) { - SuccIt.skipChildren(); - continue; - } - - ColdRegion.push_back(&SuccBB); - ++SuccIt; - } - - if (ColdRegion.size() == 1 && !isProfitableToOutline(*ColdRegion[0], TTI)) - return {}; - - return ColdRegion; -} - -/// Get the largest cold region in \p F. -static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI, - BlockFrequencyInfo *BFI, - TargetTransformInfo &TTI, - DominatorTree &DT, PostDomTree &PDT) { - // Keep track of the largest cold region. - BlockSequence LargestColdRegion = {}; - - for (BasicBlock &BB : F) { - // Identify cold blocks. - if (!mayExtractBlock(BB)) - continue; - bool Cold = - PSI.isColdBB(&BB, BFI) || (EnableStaticAnalyis && unlikelyExecuted(BB)); - if (!Cold) - continue; - - LLVM_DEBUG({ - dbgs() << "Found cold block:\n"; - BB.dump(); - }); - - // Find a maximal cold region we can outline. - BlockSequence ColdRegion = findMaximalColdRegion(BB, TTI, DT, PDT); - if (ColdRegion.empty()) { - LLVM_DEBUG(dbgs() << " Skipping (block not profitable to extract)\n"); - continue; - } - - ++NumColdRegionsFound; - - LLVM_DEBUG({ - llvm::dbgs() << "Identified cold region with " << ColdRegion.size() - << " blocks:\n"; - for (BasicBlock *BB : ColdRegion) - BB->dump(); - }); - - // TODO: Outline more than one region. - if (ColdRegion.size() > LargestColdRegion.size()) - LargestColdRegion = std::move(ColdRegion); +/// Mark \p F cold. Return true if it's changed. +static bool markEntireFunctionCold(Function &F) { + assert(!F.hasFnAttribute(Attribute::OptimizeNone) && "Can't mark this cold"); + bool Changed = false; + if (!F.hasFnAttribute(Attribute::MinSize)) { + F.addFnAttr(Attribute::MinSize); + Changed = true; } - - return LargestColdRegion; + // TODO: Move this function into a cold section. + return Changed; } class HotColdSplitting { @@ -310,6 +171,10 @@ public: private: bool shouldOutlineFrom(const Function &F) const; + bool outlineColdRegions(Function &F, ProfileSummaryInfo &PSI, + BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, + DominatorTree &DT, PostDomTree &PDT, + OptimizationRemarkEmitter &ORE); Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, OptimizationRemarkEmitter &ORE, unsigned Count); @@ -375,8 +240,6 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, OptimizationRemarkEmitter &ORE, unsigned Count) { assert(!Region.empty()); - LLVM_DEBUG(for (auto *BB : Region) - llvm::dbgs() << "\nExtracting: " << *BB;); // TODO: Pass BFI and BPI to update profile information. CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr, @@ -408,9 +271,7 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, // Try to make the outlined code as small as possible on the assumption // that it's cold. - assert(!OutF->hasFnAttribute(Attribute::OptimizeNone) && - "An outlined function should never be marked optnone"); - OutF->addFnAttr(Attribute::MinSize); + markEntireFunctionCold(*OutF); LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF); ORE.emit([&]() { @@ -431,32 +292,285 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, return nullptr; } +/// A pair of (basic block, score). +using BlockTy = std::pair; + +/// A maximal outlining region. This contains all blocks post-dominated by a +/// sink block, the sink block itself, and all blocks dominated by the sink. +class OutliningRegion { + /// A list of (block, score) pairs. A block's score is non-zero iff it's a + /// viable sub-region entry point. Blocks with higher scores are better entry + /// points (i.e. they are more distant ancestors of the sink block). + SmallVector Blocks = {}; + + /// The suggested entry point into the region. If the region has multiple + /// entry points, all blocks within the region may not be reachable from this + /// entry point. + BasicBlock *SuggestedEntryPoint = nullptr; + + /// Whether the entire function is cold. + bool EntireFunctionCold = false; + + /// Whether or not \p BB could be the entry point of an extracted region. + static bool isViableEntryPoint(BasicBlock &BB) { return !BB.isEHPad(); } + + /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise. + static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) { + return isViableEntryPoint(BB) ? Score : 0; + } + + /// These scores should be lower than the score for predecessor blocks, + /// because regions starting at predecessor blocks are typically larger. + static constexpr unsigned ScoreForSuccBlock = 1; + static constexpr unsigned ScoreForSinkBlock = 1; + + OutliningRegion(const OutliningRegion &) = delete; + OutliningRegion &operator=(const OutliningRegion &) = delete; + +public: + OutliningRegion() = default; + OutliningRegion(OutliningRegion &&) = default; + OutliningRegion &operator=(OutliningRegion &&) = default; + + static OutliningRegion create(BasicBlock &SinkBB, const DominatorTree &DT, + const PostDomTree &PDT) { + OutliningRegion ColdRegion; + + SmallPtrSet RegionBlocks; + + auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) { + RegionBlocks.insert(BB); + ColdRegion.Blocks.emplace_back(BB, Score); + assert(RegionBlocks.size() == ColdRegion.Blocks.size() && "Duplicate BB"); + }; + + // The ancestor farthest-away from SinkBB, and also post-dominated by it. + unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock); + ColdRegion.SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr; + unsigned BestScore = SinkScore; + + // Visit SinkBB's ancestors using inverse DFS. + auto PredIt = ++idf_begin(&SinkBB); + auto PredEnd = idf_end(&SinkBB); + while (PredIt != PredEnd) { + BasicBlock &PredBB = **PredIt; + bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB); + + // If the predecessor is cold and has no predecessors, the entire + // function must be cold. + if (SinkPostDom && pred_empty(&PredBB)) { + ColdRegion.EntireFunctionCold = true; + return ColdRegion; + } + + // If SinkBB does not post-dominate a predecessor, do not mark the + // predecessor (or any of its predecessors) cold. + if (!SinkPostDom || !mayExtractBlock(PredBB)) { + PredIt.skipChildren(); + continue; + } + + // Keep track of the post-dominated ancestor farthest away from the sink. + // The path length is always >= 2, ensuring that predecessor blocks are + // considered as entry points before the sink block. + unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength()); + if (PredScore > BestScore) { + ColdRegion.SuggestedEntryPoint = &PredBB; + BestScore = PredScore; + } + + addBlockToRegion(&PredBB, PredScore); + ++PredIt; + } + + // Add SinkBB to the cold region. It's considered as an entry point before + // any sink-successor blocks. + addBlockToRegion(&SinkBB, SinkScore); + + // Find all successors of SinkBB dominated by SinkBB using DFS. + auto SuccIt = ++df_begin(&SinkBB); + auto SuccEnd = df_end(&SinkBB); + while (SuccIt != SuccEnd) { + BasicBlock &SuccBB = **SuccIt; + bool SinkDom = DT.dominates(&SinkBB, &SuccBB); + + // Don't allow the backwards & forwards DFSes to mark the same block. + bool DuplicateBlock = RegionBlocks.count(&SuccBB); + + // If SinkBB does not dominate a successor, do not mark the successor (or + // any of its successors) cold. + if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) { + SuccIt.skipChildren(); + continue; + } + + unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock); + if (SuccScore > BestScore) { + ColdRegion.SuggestedEntryPoint = &SuccBB; + BestScore = SuccScore; + } + + addBlockToRegion(&SuccBB, SuccScore); + ++SuccIt; + } + + return ColdRegion; + } + + /// Whether this region has nothing to extract. + bool empty() const { return !SuggestedEntryPoint; } + + /// The blocks in this region. + ArrayRef> blocks() const { return Blocks; } + + /// Whether the entire function containing this region is cold. + bool isEntireFunctionCold() const { return EntireFunctionCold; } + + /// Remove a sub-region from this region and return it as a block sequence. + BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) { + assert(!empty() && !isEntireFunctionCold() && "Nothing to extract"); + + // Remove blocks dominated by the suggested entry point from this region. + // During the removal, identify the next best entry point into the region. + // Ensure that the first extracted block is the suggested entry point. + BlockSequence SubRegion = {SuggestedEntryPoint}; + BasicBlock *NextEntryPoint = nullptr; + unsigned NextScore = 0; + auto RegionEndIt = Blocks.end(); + auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) { + BasicBlock *BB = Block.first; + unsigned Score = Block.second; + bool InSubRegion = + BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB); + if (!InSubRegion && Score > NextScore) { + NextEntryPoint = BB; + NextScore = Score; + } + if (InSubRegion && BB != SuggestedEntryPoint) + SubRegion.push_back(BB); + return InSubRegion; + }); + Blocks.erase(RegionStartIt, RegionEndIt); + + // Update the suggested entry point. + SuggestedEntryPoint = NextEntryPoint; + + return SubRegion; + } +}; + +bool HotColdSplitting::outlineColdRegions(Function &F, ProfileSummaryInfo &PSI, + BlockFrequencyInfo *BFI, + TargetTransformInfo &TTI, + DominatorTree &DT, PostDomTree &PDT, + OptimizationRemarkEmitter &ORE) { + bool Changed = false; + + // The set of cold blocks. + SmallPtrSet ColdBlocks; + + // The worklist of non-intersecting regions left to outline. + SmallVector OutliningWorklist; + + // Set up an RPO traversal. Experimentally, this performs better (outlines + // more) than a PO traversal, because we prevent region overlap by keeping + // the first region to contain a block. + ReversePostOrderTraversal RPOT(&F); + + // Find all cold regions. + for (BasicBlock *BB : RPOT) { + // Skip blocks which can't be outlined. + if (!mayExtractBlock(*BB)) + continue; + + // This block is already part of some outlining region. + if (ColdBlocks.count(BB)) + continue; + + bool Cold = PSI.isColdBlock(BB, BFI) || + (EnableStaticAnalyis && unlikelyExecuted(*BB)); + if (!Cold) + continue; + + LLVM_DEBUG({ + dbgs() << "Found a cold block:\n"; + BB->dump(); + }); + + auto Region = OutliningRegion::create(*BB, DT, PDT); + if (Region.empty()) + continue; + + if (Region.isEntireFunctionCold()) { + LLVM_DEBUG(dbgs() << "Entire function is cold\n"); + return markEntireFunctionCold(F); + } + + // If this outlining region intersects with another, drop the new region. + // + // TODO: It's theoretically possible to outline more by only keeping the + // largest region which contains a block, but the extra bookkeeping to do + // this is tricky/expensive. + bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) { + return !ColdBlocks.insert(Block.first).second; + }); + if (RegionsOverlap) + continue; + + OutliningWorklist.emplace_back(std::move(Region)); + ++NumColdRegionsFound; + } + + // Outline single-entry cold regions, splitting up larger regions as needed. + unsigned OutlinedFunctionID = 1; + while (!OutliningWorklist.empty()) { + OutliningRegion Region = OutliningWorklist.pop_back_val(); + assert(!Region.empty() && "Empty outlining region in worklist"); + do { + BlockSequence SubRegion = Region.takeSingleEntrySubRegion(DT); + if (!isProfitableToOutline(SubRegion, TTI)) { + LLVM_DEBUG({ + dbgs() << "Skipping outlining; not profitable to outline\n"; + SubRegion[0]->dump(); + }); + continue; + } + + LLVM_DEBUG({ + dbgs() << "Hot/cold splitting attempting to outline these blocks:\n"; + for (BasicBlock *BB : SubRegion) + BB->dump(); + }); + + Function *Outlined = + extractColdRegion(SubRegion, DT, BFI, TTI, ORE, OutlinedFunctionID); + if (Outlined) { + ++OutlinedFunctionID; + OutlinedFunctions.insert(Outlined); + Changed = true; + } + } while (!Region.empty()); + } + + return Changed; +} + bool HotColdSplitting::run(Module &M) { bool Changed = false; + OutlinedFunctions.clear(); for (auto &F : M) { if (!shouldOutlineFrom(F)) { - LLVM_DEBUG(llvm::dbgs() << "Not outlining in " << F.getName() << "\n"); + LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n"); continue; } - LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n"); DominatorTree DT(F); PostDomTree PDT(F); PDT.recalculate(F); BlockFrequencyInfo *BFI = GetBFI(F); TargetTransformInfo &TTI = GetTTI(F); - - BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, TTI, DT, PDT); - if (ColdRegion.empty()) - continue; - OptimizationRemarkEmitter &ORE = (*GetORE)(F); - Function *Outlined = - extractColdRegion(ColdRegion, DT, BFI, TTI, ORE, /*Count=*/1); - if (Outlined) { - OutlinedFunctions.insert(Outlined); - Changed = true; - } + Changed |= outlineColdRegions(F, *PSI, BFI, TTI, DT, PDT, ORE); } return Changed; } @@ -465,7 +579,7 @@ bool HotColdSplittingLegacyPass::runOnModule(Module &M) { if (skipModule(M)) return false; ProfileSummaryInfo *PSI = - getAnalysis().getPSI(); + &getAnalysis().getPSI(); auto GTTI = [this](Function &F) -> TargetTransformInfo & { return this->getAnalysis().getTTI(F); }; diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 66aea45323fea9a817be7946e0928dcfbe8d8c01..44001b5779cc354dbc89bfc4eea60d9bc5c666a5 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -746,7 +746,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis().getCallGraph(); ACT = &getAnalysis(); - PSI = getAnalysis().getPSI(); + PSI = &getAnalysis().getPSI(); auto &TLI = getAnalysis().getTLI(); auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp index 2d29e93b1ca1de6f15ed015434f7fe2ae3feadb6..e4dcd4d4dd7758cf690d3ca182a214db483f8b6a 100644 --- a/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/lib/Transforms/IPO/LowerTypeTests.cpp @@ -989,6 +989,7 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { if (F->isDSOLocal()) { Function *RealF = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, + F->getAddressSpace(), Name + ".cfi", &M); RealF->setVisibility(GlobalVariable::HiddenVisibility); replaceDirectCalls(F, RealF); @@ -1000,13 +1001,13 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { if (F->isDeclarationForLinker() && !isDefinition) { // Declaration of an external function. FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, - Name + ".cfi_jt", &M); + F->getAddressSpace(), Name + ".cfi_jt", &M); FDecl->setVisibility(GlobalValue::HiddenVisibility); } else if (isDefinition) { F->setName(Name + ".cfi"); F->setLinkage(GlobalValue::ExternalLinkage); FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, - Name, &M); + F->getAddressSpace(), Name, &M); FDecl->setVisibility(Visibility); Visibility = GlobalValue::HiddenVisibility; @@ -1016,7 +1017,8 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { for (auto &U : F->uses()) { if (auto *A = dyn_cast(U.getUser())) { Function *AliasDecl = Function::Create( - F->getFunctionType(), GlobalValue::ExternalLinkage, "", &M); + F->getFunctionType(), GlobalValue::ExternalLinkage, + F->getAddressSpace(), "", &M); AliasDecl->takeName(A); A->replaceAllUsesWith(AliasDecl); ToErase.push_back(A); @@ -1191,7 +1193,9 @@ void LowerTypeTestsModule::moveInitializerToModuleConstructor( WeakInitializerFn = Function::Create( FunctionType::get(Type::getVoidTy(M.getContext()), /* IsVarArg */ false), - GlobalValue::InternalLinkage, "__cfi_global_var_init", &M); + GlobalValue::InternalLinkage, + M.getDataLayout().getProgramAddressSpace(), + "__cfi_global_var_init", &M); BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", WeakInitializerFn); ReturnInst::Create(M.getContext(), BB); @@ -1234,7 +1238,8 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( // placeholder first. Function *PlaceholderFn = Function::Create(cast(F->getValueType()), - GlobalValue::ExternalWeakLinkage, "", &M); + GlobalValue::ExternalWeakLinkage, + F->getAddressSpace(), "", &M); replaceCfiUses(F, PlaceholderFn, IsDefinition); Constant *Target = ConstantExpr::getSelect( @@ -1424,7 +1429,9 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( Function *JumpTableFn = Function::Create(FunctionType::get(Type::getVoidTy(M.getContext()), /* IsVarArg */ false), - GlobalValue::PrivateLinkage, ".cfi.jumptable", &M); + GlobalValue::PrivateLinkage, + M.getDataLayout().getProgramAddressSpace(), + ".cfi.jumptable", &M); ArrayType *JumpTableType = ArrayType::get(getJumpTableEntryType(), Functions.size()); auto JumpTable = @@ -1813,7 +1820,8 @@ bool LowerTypeTestsModule::lower() { if (!F) F = Function::Create( FunctionType::get(Type::getVoidTy(M.getContext()), false), - GlobalVariable::ExternalLinkage, FunctionName, &M); + GlobalVariable::ExternalLinkage, + M.getDataLayout().getProgramAddressSpace(), FunctionName, &M); // If the function is available_externally, remove its definition so // that it is handled the same way as a declaration. Later we will try diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 4c51cd131a1015d654e1de9f8b5232b390d097af..550750d17438cfb408f0c5d1ccb7b123ab9bfb89 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -136,6 +136,7 @@ using namespace llvm; STATISTIC(NumFunctionsMerged, "Number of functions merged"); STATISTIC(NumThunksWritten, "Number of thunks generated"); +STATISTIC(NumAliasesWritten, "Number of aliases generated"); STATISTIC(NumDoubleWeak, "Number of new functions created"); static cl::opt NumFunctionsForSanityCheck( @@ -165,6 +166,11 @@ static cl::opt cl::desc("Preserve debug info in thunk when mergefunc " "transformations are made.")); +static cl::opt + MergeFunctionsAliases("mergefunc-use-aliases", cl::Hidden, + cl::init(false), + cl::desc("Allow mergefunc to create aliases")); + namespace { class FunctionNode { @@ -272,6 +278,13 @@ private: /// delete G. void writeThunk(Function *F, Function *G); + // Replace G with an alias to F (deleting function G) + void writeAlias(Function *F, Function *G); + + // Replace G with an alias to F if possible, or a thunk to F if + // profitable. Returns false if neither is the case. + bool writeThunkOrAlias(Function *F, Function *G); + /// Replace function F with function G in the function tree. void replaceFunctionInTree(const FunctionNode &FN, Function *G); @@ -680,8 +693,8 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { GEntryBlock->getTerminator()->eraseFromParent(); BB = GEntryBlock; } else { - NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "", - G->getParent()); + NewG = Function::Create(G->getFunctionType(), G->getLinkage(), + G->getAddressSpace(), "", G->getParent()); BB = BasicBlock::Create(F->getContext(), "", NewG); } @@ -735,27 +748,76 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { ++NumThunksWritten; } +// Whether this function may be replaced by an alias +static bool canCreateAliasFor(Function *F) { + if (!MergeFunctionsAliases || !F->hasGlobalUnnamedAddr()) + return false; + + // We should only see linkages supported by aliases here + assert(F->hasLocalLinkage() || F->hasExternalLinkage() + || F->hasWeakLinkage() || F->hasLinkOnceLinkage()); + return true; +} + +// Replace G with an alias to F (deleting function G) +void MergeFunctions::writeAlias(Function *F, Function *G) { + Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType()); + PointerType *PtrType = G->getType(); + auto *GA = GlobalAlias::create( + PtrType->getElementType(), PtrType->getAddressSpace(), + G->getLinkage(), "", BitcastF, G->getParent()); + + F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); + GA->takeName(G); + GA->setVisibility(G->getVisibility()); + GA->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + removeUsers(G); + G->replaceAllUsesWith(GA); + G->eraseFromParent(); + + LLVM_DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n'); + ++NumAliasesWritten; +} + +// Replace G with an alias to F if possible, or a thunk to F if +// profitable. Returns false if neither is the case. +bool MergeFunctions::writeThunkOrAlias(Function *F, Function *G) { + if (canCreateAliasFor(G)) { + writeAlias(F, G); + return true; + } + if (isThunkProfitable(F)) { + writeThunk(F, G); + return true; + } + return false; +} + // Merge two equivalent functions. Upon completion, Function G is deleted. void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { if (F->isInterposable()) { assert(G->isInterposable()); - if (!isThunkProfitable(F)) { + // Both writeThunkOrAlias() calls below must succeed, either because we can + // create aliases for G and NewF, or because a thunk for F is profitable. + // F here has the same signature as NewF below, so that's what we check. + if (!isThunkProfitable(F) && (!canCreateAliasFor(F) || !canCreateAliasFor(G))) { return; } // Make them both thunks to the same internal function. - Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "", - F->getParent()); - H->copyAttributesFrom(F); - H->takeName(F); + Function *NewF = Function::Create(F->getFunctionType(), F->getLinkage(), + F->getAddressSpace(), "", F->getParent()); + NewF->copyAttributesFrom(F); + NewF->takeName(F); removeUsers(F); - F->replaceAllUsesWith(H); + F->replaceAllUsesWith(NewF); - unsigned MaxAlignment = std::max(G->getAlignment(), H->getAlignment()); + unsigned MaxAlignment = std::max(G->getAlignment(), NewF->getAlignment()); - writeThunk(F, G); - writeThunk(F, H); + writeThunkOrAlias(F, G); + writeThunkOrAlias(F, NewF); F->setAlignment(MaxAlignment); F->setLinkage(GlobalValue::PrivateLinkage); @@ -789,12 +851,9 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { return; } - if (!isThunkProfitable(F)) { - return; + if (writeThunkOrAlias(F, G)) { + ++NumFunctionsMerged; } - - writeThunk(F, G); - ++NumFunctionsMerged; } } diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index bcb19af85b29b9bd501be3b53a3c5f98a61add49..917582a706b36c5bc294a7e7951dbd490b34e615 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -359,7 +359,7 @@ struct PartialInlinerLegacyPass : public ModulePass { TargetTransformInfoWrapperPass *TTIWP = &getAnalysis(); ProfileSummaryInfo *PSI = - getAnalysis().getPSI(); + &getAnalysis().getPSI(); std::function GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & { @@ -403,7 +403,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F, auto IsSingleEntry = [](SmallVectorImpl &BlockList) { BasicBlock *Dom = BlockList.front(); - return BlockList.size() > 1 && pred_size(Dom) == 1; + return BlockList.size() > 1 && Dom->hasNPredecessors(1); }; auto IsSingleExit = @@ -468,7 +468,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F, // Only consider regions with predecessor blocks that are considered // not-cold (default: part of the top 99.99% of all block counters) // AND greater than our minimum block execution count (default: 100). - if (PSI->isColdBB(thisBB, BFI) || + if (PSI->isColdBlock(thisBB, BFI) || BBProfileCount(thisBB) < MinBlockCounterExecution) continue; for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) { @@ -834,42 +834,41 @@ bool PartialInlinerImpl::shouldPartialInline( int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { int InlineCost = 0; const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - if (isa(I)) - continue; - - switch (I->getOpcode()) { + for (Instruction &I : BB->instructionsWithoutDebug()) { + // Skip free instructions. + switch (I.getOpcode()) { case Instruction::BitCast: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::Alloca: + case Instruction::PHI: continue; case Instruction::GetElementPtr: - if (cast(I)->hasAllZeroIndices()) + if (cast(&I)->hasAllZeroIndices()) continue; break; default: break; } - IntrinsicInst *IntrInst = dyn_cast(I); + IntrinsicInst *IntrInst = dyn_cast(&I); if (IntrInst) { if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start || IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) continue; } - if (CallInst *CI = dyn_cast(I)) { + if (CallInst *CI = dyn_cast(&I)) { InlineCost += getCallsiteCost(CallSite(CI), DL); continue; } - if (InvokeInst *II = dyn_cast(I)) { + if (InvokeInst *II = dyn_cast(&I)) { InlineCost += getCallsiteCost(CallSite(II), DL); continue; } - if (SwitchInst *SI = dyn_cast(I)) { + if (SwitchInst *SI = dyn_cast(&I)) { InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; continue; } diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 19ff2a21cd2186cb0532822b1e0e405c9dc0e256..60bd6def6cefea8923acc78d57988cbe5773b0f8 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -378,8 +378,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( if (EnableLoopInterchange) MPM.add(createLoopInterchangePass()); // Interchange loops - if (!DisableUnrollLoops) - MPM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops + MPM.add(createSimpleLoopUnrollPass(OptLevel, + DisableUnrollLoops)); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); // This ends the loop pass pipelines. @@ -637,7 +637,7 @@ void PassManagerBuilder::populateModulePassManager( // llvm.loop.distribute=true or when -enable-loop-distribute is specified. MPM.add(createLoopDistributePass()); - MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); + MPM.add(createLoopVectorizePass(DisableUnrollLoops, !LoopVectorize)); // Eliminate loads by forwarding stores from the previous iteration to loads // of the current iteration. @@ -682,16 +682,17 @@ void PassManagerBuilder::populateModulePassManager( addExtensionsToPM(EP_Peephole, MPM); addInstructionCombiningPass(MPM); - if (!DisableUnrollLoops) { - if (EnableUnrollAndJam) { - // Unroll and Jam. We do this before unroll but need to be in a separate - // loop pass manager in order for the outer loop to be processed by - // unroll and jam before the inner loop is unrolled. - MPM.add(createLoopUnrollAndJamPass(OptLevel)); - } + if (EnableUnrollAndJam && !DisableUnrollLoops) { + // Unroll and Jam. We do this before unroll but need to be in a separate + // loop pass manager in order for the outer loop to be processed by + // unroll and jam before the inner loop is unrolled. + MPM.add(createLoopUnrollAndJamPass(OptLevel)); + } - MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops + MPM.add(createLoopUnrollPass(OptLevel, + DisableUnrollLoops)); // Unroll small loops + if (!DisableUnrollLoops) { // LoopUnroll may generate some redundency to cleanup. addInstructionCombiningPass(MPM); @@ -700,7 +701,9 @@ void PassManagerBuilder::populateModulePassManager( // outer loop. LICM pass can help to promote the runtime check out if the // checked value is loop invariant. MPM.add(createLICMPass()); - } + } + + MPM.add(createWarnMissedTransformationsPass()); // After vectorization and unrolling, assume intrinsics may tell us more // about pointer alignments. @@ -747,6 +750,12 @@ void PassManagerBuilder::populateModulePassManager( } void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { + // Load sample profile before running the LTO optimization pipeline. + if (!PGOSampleUse.empty()) { + PM.add(createPruneEHPass()); + PM.add(createSampleProfileLoaderPass(PGOSampleUse)); + } + // Remove unused virtual tables to improve the quality of code generated by // whole-program devirtualization and bitset lowering. PM.add(createGlobalDCEPass()); @@ -864,12 +873,13 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { if (EnableLoopInterchange) PM.add(createLoopInterchangePass()); - if (!DisableUnrollLoops) - PM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops - PM.add(createLoopVectorizePass(true, LoopVectorize)); + PM.add(createSimpleLoopUnrollPass(OptLevel, + DisableUnrollLoops)); // Unroll small loops + PM.add(createLoopVectorizePass(true, !LoopVectorize)); // The vectorizer may have significantly shortened a loop body; unroll again. - if (!DisableUnrollLoops) - PM.add(createLoopUnrollPass(OptLevel)); + PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops)); + + PM.add(createWarnMissedTransformationsPass()); // Now that we've optimized loops (in particular loop induction variables), // we may have exposed more scalar opportunities. Run parts of the scalar diff --git a/lib/Transforms/IPO/SCCP.cpp b/lib/Transforms/IPO/SCCP.cpp index e2bb6f185c3bf5b76fb4e58ce3b14684a6881147..d2c34abfc13270f25dba1203155a375cc470832f 100644 --- a/lib/Transforms/IPO/SCCP.cpp +++ b/lib/Transforms/IPO/SCCP.cpp @@ -1,5 +1,6 @@ #include "llvm/Transforms/IPO/SCCP.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar/SCCP.h" @@ -10,16 +11,21 @@ PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) { const DataLayout &DL = M.getDataLayout(); auto &TLI = AM.getResult(M); auto &FAM = AM.getResult(M).getManager(); - auto getPredicateInfo = - [&FAM](Function &F) -> std::unique_ptr { - return make_unique(F, - FAM.getResult(F), - FAM.getResult(F)); + auto getAnalysis = [&FAM](Function &F) -> AnalysisResultsForFn { + DominatorTree &DT = FAM.getResult(F); + return { + make_unique(F, DT, FAM.getResult(F)), + &DT, FAM.getCachedResult(F)}; }; - if (!runIPSCCP(M, DL, &TLI, getPredicateInfo)) + if (!runIPSCCP(M, DL, &TLI, getAnalysis)) return PreservedAnalyses::all(); - return PreservedAnalyses::none(); + + PreservedAnalyses PA; + PA.preserve(); + PA.preserve(); + PA.preserve(); + return PA; } namespace { @@ -44,14 +50,19 @@ public: const TargetLibraryInfo *TLI = &getAnalysis().getTLI(); - auto getPredicateInfo = - [this](Function &F) -> std::unique_ptr { - return make_unique( - F, this->getAnalysis(F).getDomTree(), - this->getAnalysis().getAssumptionCache(F)); + auto getAnalysis = [this](Function &F) -> AnalysisResultsForFn { + DominatorTree &DT = + this->getAnalysis(F).getDomTree(); + return { + make_unique( + F, DT, + this->getAnalysis().getAssumptionCache( + F)), + nullptr, // We cannot preserve the DT or PDT with the legacy pass + nullptr}; // manager, so set them to nullptr. }; - return runIPSCCP(M, DL, TLI, getPredicateInfo); + return runIPSCCP(M, DL, TLI, getAnalysis); } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index a78e0d459c891b909d7f9766667f0f8c7cecb7d4..06a1ce89827f65265ecce14a60021fce6504415f 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -123,6 +123,12 @@ static cl::opt NoWarnSampleUnused( cl::desc("Use this option to turn off/on warnings about function with " "samples but without debug information to use those samples. ")); +static cl::opt ProfileSampleAccurate( + "profile-sample-accurate", cl::Hidden, cl::init(false), + cl::desc("If the sample profile is accurate, we will mark all un-sampled " + "callsite and function as having 0 samples. Otherwise, treat " + "un-sampled callsites and functions conservatively as unknown. ")); + namespace { using BlockWeightMap = DenseMap; @@ -1599,15 +1605,23 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { ACT = &getAnalysis(); TTIWP = &getAnalysis(); ProfileSummaryInfo *PSI = - getAnalysis().getPSI(); + &getAnalysis().getPSI(); return SampleLoader.runOnModule(M, nullptr, PSI); } bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { - // Initialize the entry count to -1, which will be treated conservatively - // by getEntryCount as the same as unknown (None). If we have samples this - // will be overwritten in emitAnnotations. - F.setEntryCount(ProfileCount(-1, Function::PCT_Real)); + // By default the entry count is initialized to -1, which will be treated + // conservatively by getEntryCount as the same as unknown (None). This is + // to avoid newly added code to be treated as cold. If we have samples + // this will be overwritten in emitAnnotations. + // If ProfileSampleAccurate is true or F has profile-sample-accurate + // attribute, initialize the entry count to 0 so callsites or functions + // unsampled will be treated as cold. + uint64_t initialEntryCount = + (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) + ? 0 + : -1; + F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real)); std::unique_ptr OwnedORE; if (AM) { auto &FAM = diff --git a/lib/Transforms/IPO/SyntheticCountsPropagation.cpp b/lib/Transforms/IPO/SyntheticCountsPropagation.cpp index 3c5ad37bced1997800b4e72a0070acd1984ba387..64837d4f5d6196a60c2a2625f35c15e9c7c10c26 100644 --- a/lib/Transforms/IPO/SyntheticCountsPropagation.cpp +++ b/lib/Transforms/IPO/SyntheticCountsPropagation.cpp @@ -46,7 +46,7 @@ using ProfileCount = Function::ProfileCount; #define DEBUG_TYPE "synthetic-counts-propagation" /// Initial synthetic count assigned to functions. -static cl::opt +cl::opt InitialSyntheticCount("initial-synthetic-count", cl::Hidden, cl::init(10), cl::ZeroOrMore, cl::desc("Initial value of synthetic entry count.")); diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index bfab96a1ddb9536280e2e456837c7925c6598e54..a5382c4638d8baf7d4c71f8a8b3699d0adbc60b1 100644 --- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -154,7 +154,8 @@ void simplifyExternals(Module &M) { continue; Function *NewF = - Function::Create(EmptyFT, GlobalValue::ExternalLinkage, "", &M); + Function::Create(EmptyFT, GlobalValue::ExternalLinkage, + F.getAddressSpace(), "", &M); NewF->setVisibility(F.getVisibility()); NewF->takeName(&F); F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType())); diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp index b8f68d4d15d4055c1fb607735fa4704be2ab6274..37905daee4cbd792f83b47ff93f1fb3946b8874a 100644 --- a/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -864,10 +864,13 @@ void DevirtModule::tryICallBranchFunnel( Function *JT; if (isa(Slot.TypeID)) { JT = Function::Create(FT, Function::ExternalLinkage, + M.getDataLayout().getProgramAddressSpace(), getGlobalName(Slot, {}, "branch_funnel"), &M); JT->setVisibility(GlobalValue::HiddenVisibility); } else { - JT = Function::Create(FT, Function::InternalLinkage, "branch_funnel", &M); + JT = Function::Create(FT, Function::InternalLinkage, + M.getDataLayout().getProgramAddressSpace(), + "branch_funnel", &M); } JT->addAttribute(1, Attribute::Nest); diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 7e7a515bfc8de4869966455018cfdecc070db285..24a82ba9ae40049de5fe15b691ddaa2e9494ac7c 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -53,11 +53,11 @@ static unsigned getFCmpCode(FCmpInst::Predicate CC) { /// operands into either a constant true or false, or a brand new ICmp /// instruction. The sign is passed in to determine which kind of predicate to /// use in the new icmp instruction. -static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, +static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS, InstCombiner::BuilderTy &Builder) { ICmpInst::Predicate NewPred; - if (Value *NewConstant = getICmpValue(Sign, Code, LHS, RHS, NewPred)) - return NewConstant; + if (Constant *TorF = getPredForICmpCode(Code, Sign, LHS->getType(), NewPred)) + return TorF; return Builder.CreateICmp(NewPred, LHS, RHS); } @@ -1033,7 +1033,7 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B) - if (PredicatesFoldable(PredL, PredR)) { + if (predicatesFoldable(PredL, PredR)) { if (LHS->getOperand(0) == RHS->getOperand(1) && LHS->getOperand(1) == RHS->getOperand(0)) LHS->swapOperands(); @@ -1041,8 +1041,8 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, LHS->getOperand(1) == RHS->getOperand(1)) { Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); unsigned Code = getICmpCode(LHS) & getICmpCode(RHS); - bool isSigned = LHS->isSigned() || RHS->isSigned(); - return getNewICmpValue(isSigned, Code, Op0, Op1, Builder); + bool IsSigned = LHS->isSigned() || RHS->isSigned(); + return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder); } } @@ -1131,7 +1131,7 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, return nullptr; // We can't fold (ugt x, C) & (sgt x, C2). - if (!PredicatesFoldable(PredL, PredR)) + if (!predicatesFoldable(PredL, PredR)) return nullptr; // Ensure that the larger constant is on the RHS. @@ -1762,10 +1762,9 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return nullptr; } -/// Given an OR instruction, check to see if this is a bswap idiom. If so, -/// insert the new intrinsic and return it. -Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { - Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); +Instruction *InstCombiner::matchBSwap(BinaryOperator &Or) { + assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'"); + Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1); // Look through zero extends. if (Instruction *Ext = dyn_cast(Op0)) @@ -1801,7 +1800,7 @@ Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { return nullptr; SmallVector Insts; - if (!recognizeBSwapOrBitReverseIdiom(&I, true, false, Insts)) + if (!recognizeBSwapOrBitReverseIdiom(&Or, true, false, Insts)) return nullptr; Instruction *LastInst = Insts.pop_back_val(); LastInst->removeFromParent(); @@ -1977,7 +1976,7 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, } // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B) - if (PredicatesFoldable(PredL, PredR)) { + if (predicatesFoldable(PredL, PredR)) { if (LHS->getOperand(0) == RHS->getOperand(1) && LHS->getOperand(1) == RHS->getOperand(0)) LHS->swapOperands(); @@ -1985,8 +1984,8 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, LHS->getOperand(1) == RHS->getOperand(1)) { Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); unsigned Code = getICmpCode(LHS) | getICmpCode(RHS); - bool isSigned = LHS->isSigned() || RHS->isSigned(); - return getNewICmpValue(isSigned, Code, Op0, Op1, Builder); + bool IsSigned = LHS->isSigned() || RHS->isSigned(); + return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder); } } @@ -2067,7 +2066,7 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, return nullptr; // We can't fold (ugt x, C) | (sgt x, C2). - if (!PredicatesFoldable(PredL, PredR)) + if (!predicatesFoldable(PredL, PredR)) return nullptr; // Ensure that the larger constant is on the RHS. @@ -2168,7 +2167,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I)) return FoldedLogic; - if (Instruction *BSwap = MatchBSwap(I)) + if (Instruction *BSwap = matchBSwap(I)) return BSwap; Value *X, *Y; @@ -2463,7 +2462,7 @@ static Instruction *foldXorToXor(BinaryOperator &I, } Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) { - if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) { + if (predicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) { if (LHS->getOperand(0) == RHS->getOperand(1) && LHS->getOperand(1) == RHS->getOperand(0)) LHS->swapOperands(); @@ -2472,8 +2471,8 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B) Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS); - bool isSigned = LHS->isSigned() || RHS->isSigned(); - return getNewICmpValue(isSigned, Code, Op0, Op1, Builder); + bool IsSigned = LHS->isSigned() || RHS->isSigned(); + return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder); } } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index bdd9a43be2721a9e441456060a1e127805877230..ae158aeabdff6c9a206f305e0ccad9bd2c7ff473 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -736,7 +736,8 @@ static Value *simplifyX86round(IntrinsicInst &II, return Builder.CreateInsertElement(Dst, Res, (uint64_t)0); } -static Value *simplifyX86movmsk(const IntrinsicInst &II) { +static Value *simplifyX86movmsk(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { Value *Arg = II.getArgOperand(0); Type *ResTy = II.getType(); Type *ArgTy = Arg->getType(); @@ -749,29 +750,46 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II) { if (!ArgTy->isVectorTy()) return nullptr; - auto *C = dyn_cast(Arg); - if (!C) - return nullptr; + if (auto *C = dyn_cast(Arg)) { + // Extract signbits of the vector input and pack into integer result. + APInt Result(ResTy->getPrimitiveSizeInBits(), 0); + for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) { + auto *COp = C->getAggregateElement(I); + if (!COp) + return nullptr; + if (isa(COp)) + continue; - // Extract signbits of the vector input and pack into integer result. - APInt Result(ResTy->getPrimitiveSizeInBits(), 0); - for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) { - auto *COp = C->getAggregateElement(I); - if (!COp) - return nullptr; - if (isa(COp)) - continue; + auto *CInt = dyn_cast(COp); + auto *CFp = dyn_cast(COp); + if (!CInt && !CFp) + return nullptr; - auto *CInt = dyn_cast(COp); - auto *CFp = dyn_cast(COp); - if (!CInt && !CFp) - return nullptr; + if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative())) + Result.setBit(I); + } + return Constant::getIntegerValue(ResTy, Result); + } - if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative())) - Result.setBit(I); + // Look for a sign-extended boolean source vector as the argument to this + // movmsk. If the argument is bitcast, look through that, but make sure the + // source of that bitcast is still a vector with the same number of elements. + // TODO: We can also convert a bitcast with wider elements, but that requires + // duplicating the bool source sign bits to match the number of elements + // expected by the movmsk call. + Arg = peekThroughBitcast(Arg); + Value *X; + if (Arg->getType()->isVectorTy() && + Arg->getType()->getVectorNumElements() == ArgTy->getVectorNumElements() && + match(Arg, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { + // call iM movmsk(sext X) --> zext (bitcast X to iN) to iM + unsigned NumElts = X->getType()->getVectorNumElements(); + Type *ScalarTy = Type::getIntNTy(Arg->getContext(), NumElts); + Value *BC = Builder.CreateBitCast(X, ScalarTy); + return Builder.CreateZExtOrTrunc(BC, ResTy); } - return Constant::getIntegerValue(ResTy, Result); + return nullptr; } static Value *simplifyX86insertps(const IntrinsicInst &II, @@ -1837,6 +1855,17 @@ Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) { return nullptr; } +static Instruction *canonicalizeConstantArg0ToArg1(CallInst &Call) { + assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap"); + Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1); + if (isa(Arg0) && !isa(Arg1)) { + Call.setArgOperand(0, Arg1); + Call.setArgOperand(1, Arg0); + return &Call; + } + return nullptr; +} + /// CallInst simplification. This mostly only handles folding of intrinsic /// instructions. For normal calls, it allows visitCallSite to do the heavy /// lifting. @@ -1990,18 +2019,49 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return I; break; + case Intrinsic::fshl: + case Intrinsic::fshr: { + const APInt *SA; + if (match(II->getArgOperand(2), m_APInt(SA))) { + Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1); + unsigned BitWidth = SA->getBitWidth(); + uint64_t ShiftAmt = SA->urem(BitWidth); + assert(ShiftAmt != 0 && "SimplifyCall should have handled zero shift"); + // Normalize to funnel shift left. + if (II->getIntrinsicID() == Intrinsic::fshr) + ShiftAmt = BitWidth - ShiftAmt; + + // fshl(X, 0, C) -> shl X, C + // fshl(X, undef, C) -> shl X, C + if (match(Op1, m_Zero()) || match(Op1, m_Undef())) + return BinaryOperator::CreateShl( + Op0, ConstantInt::get(II->getType(), ShiftAmt)); + + // fshl(0, X, C) -> lshr X, (BW-C) + // fshl(undef, X, C) -> lshr X, (BW-C) + if (match(Op0, m_Zero()) || match(Op0, m_Undef())) + return BinaryOperator::CreateLShr( + Op1, ConstantInt::get(II->getType(), BitWidth - ShiftAmt)); + } + + // The shift amount (operand 2) of a funnel shift is modulo the bitwidth, + // so only the low bits of the shift amount are demanded if the bitwidth is + // a power-of-2. + unsigned BitWidth = II->getType()->getScalarSizeInBits(); + if (!isPowerOf2_32(BitWidth)) + break; + APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth)); + KnownBits Op2Known(BitWidth); + if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known)) + return &CI; + break; + } case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: - if (isa(II->getArgOperand(0)) && - !isa(II->getArgOperand(1))) { - // Canonicalize constants into the RHS. - Value *LHS = II->getArgOperand(0); - II->setArgOperand(0, II->getArgOperand(1)); - II->setArgOperand(1, LHS); - return II; - } + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; LLVM_FALLTHROUGH; case Intrinsic::usub_with_overflow: @@ -2019,19 +2079,102 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::uadd_sat: + case Intrinsic::sadd_sat: + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; + LLVM_FALLTHROUGH; + case Intrinsic::usub_sat: + case Intrinsic::ssub_sat: { + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + Intrinsic::ID IID = II->getIntrinsicID(); + + // Make use of known overflow information. + OverflowResult OR; + switch (IID) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::uadd_sat: + OR = computeOverflowForUnsignedAdd(Arg0, Arg1, II); + if (OR == OverflowResult::NeverOverflows) + return BinaryOperator::CreateNUWAdd(Arg0, Arg1); + if (OR == OverflowResult::AlwaysOverflows) + return replaceInstUsesWith(*II, + ConstantInt::getAllOnesValue(II->getType())); + break; + case Intrinsic::usub_sat: + OR = computeOverflowForUnsignedSub(Arg0, Arg1, II); + if (OR == OverflowResult::NeverOverflows) + return BinaryOperator::CreateNUWSub(Arg0, Arg1); + if (OR == OverflowResult::AlwaysOverflows) + return replaceInstUsesWith(*II, + ConstantInt::getNullValue(II->getType())); + break; + case Intrinsic::sadd_sat: + if (willNotOverflowSignedAdd(Arg0, Arg1, *II)) + return BinaryOperator::CreateNSWAdd(Arg0, Arg1); + break; + case Intrinsic::ssub_sat: + if (willNotOverflowSignedSub(Arg0, Arg1, *II)) + return BinaryOperator::CreateNSWSub(Arg0, Arg1); + break; + } + + // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN + Constant *C; + if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) && + C->isNotMinSignedValue()) { + Value *NegVal = ConstantExpr::getNeg(C); + return replaceInstUsesWith( + *II, Builder.CreateBinaryIntrinsic( + Intrinsic::sadd_sat, Arg0, NegVal)); + } + + // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2)) + // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2)) + // if Val and Val2 have the same sign + if (auto *Other = dyn_cast(Arg0)) { + Value *X; + const APInt *Val, *Val2; + APInt NewVal; + bool IsUnsigned = + IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat; + if (Other->getIntrinsicID() == II->getIntrinsicID() && + match(Arg1, m_APInt(Val)) && + match(Other->getArgOperand(0), m_Value(X)) && + match(Other->getArgOperand(1), m_APInt(Val2))) { + if (IsUnsigned) + NewVal = Val->uadd_sat(*Val2); + else if (Val->isNonNegative() == Val2->isNonNegative()) { + bool Overflow; + NewVal = Val->sadd_ov(*Val2, Overflow); + if (Overflow) { + // Both adds together may add more than SignedMaxValue + // without saturating the final result. + break; + } + } else { + // Cannot fold saturated addition with different signs. + break; + } + + return replaceInstUsesWith( + *II, Builder.CreateBinaryIntrinsic( + IID, X, ConstantInt::get(II->getType(), NewVal))); + } + } + break; + } + case Intrinsic::minnum: case Intrinsic::maxnum: case Intrinsic::minimum: case Intrinsic::maximum: { + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; Value *Arg0 = II->getArgOperand(0); Value *Arg1 = II->getArgOperand(1); - // Canonicalize constants to the RHS. - if (isa(Arg0) && !isa(Arg1)) { - II->setArgOperand(0, Arg1); - II->setArgOperand(1, Arg0); - return II; - } - Intrinsic::ID IID = II->getIntrinsicID(); Value *X, *Y; if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) && @@ -2111,17 +2254,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { LLVM_FALLTHROUGH; } case Intrinsic::fma: { - Value *Src0 = II->getArgOperand(0); - Value *Src1 = II->getArgOperand(1); - - // Canonicalize constant multiply operand to Src1. - if (isa(Src0) && !isa(Src1)) { - II->setArgOperand(0, Src1); - II->setArgOperand(1, Src0); - std::swap(Src0, Src1); - } + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; // fma fneg(x), fneg(y), z -> fma x, y, z + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); Value *X, *Y; if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) { II->setArgOperand(0, X); @@ -2423,7 +2561,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_avx_movmsk_pd_256: case Intrinsic::x86_avx_movmsk_ps_256: case Intrinsic::x86_avx2_pmovmskb: - if (Value *V = simplifyX86movmsk(*II)) + if (Value *V = simplifyX86movmsk(*II, Builder)) return replaceInstUsesWith(*II, V); break; @@ -3479,22 +3617,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe; - // TODO: Also emit sub if only width is constant. - if (!CWidth && COffset && Offset == 0) { - Constant *KSize = ConstantInt::get(COffset->getType(), IntSize); - Value *ShiftVal = Builder.CreateSub(KSize, II->getArgOperand(2)); - ShiftVal = Builder.CreateZExt(ShiftVal, II->getType()); - - Value *Shl = Builder.CreateShl(Src, ShiftVal); - Value *RightShift = Signed ? Builder.CreateAShr(Shl, ShiftVal) - : Builder.CreateLShr(Shl, ShiftVal); - RightShift->takeName(II); - return replaceInstUsesWith(*II, RightShift); - } - if (!CWidth || !COffset) break; + // The case of Width == 0 is handled above, which makes this tranformation + // safe. If Width == 0, then the ashr and lshr instructions become poison + // value since the shift amount would be equal to the bit size. + assert(Width != 0); + // TODO: This allows folding to undef when the hardware has specific // behavior? if (Offset + Width < IntSize) { @@ -3910,8 +4040,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, ConstantPointerNull::get(PT)); // isKnownNonNull -> nonnull attribute - if (isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) + if (!II->hasRetAttr(Attribute::NonNull) && + isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) { II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + return II; + } } // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 9fa27d89911dbea920ab88f843f0c08b3151b2c6..7a8c762d494fb481862277ba4d68150cf62ef723 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -498,6 +498,13 @@ Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) { shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) && "Don't narrow to an illegal scalar type"); + // Bail out on strange types. It is possible to handle some of these patterns + // even with non-power-of-2 sizes, but it is not a likely scenario. + Type *DestTy = Trunc.getType(); + unsigned NarrowWidth = DestTy->getScalarSizeInBits(); + if (!isPowerOf2_32(NarrowWidth)) + return nullptr; + // First, find an or'd pair of opposite shifts with the same shifted operand: // trunc (or (lshr ShVal, ShAmt0), (shl ShVal, ShAmt1)) Value *Or0, *Or1; @@ -514,22 +521,38 @@ Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) { if (ShiftOpcode0 == ShiftOpcode1) return nullptr; - // The shift amounts must add up to the narrow bit width. - Value *ShAmt; - bool SubIsOnLHS; - Type *DestTy = Trunc.getType(); - unsigned NarrowWidth = DestTy->getScalarSizeInBits(); - if (match(ShAmt0, - m_OneUse(m_Sub(m_SpecificInt(NarrowWidth), m_Specific(ShAmt1))))) { - ShAmt = ShAmt1; - SubIsOnLHS = true; - } else if (match(ShAmt1, m_OneUse(m_Sub(m_SpecificInt(NarrowWidth), - m_Specific(ShAmt0))))) { - ShAmt = ShAmt0; - SubIsOnLHS = false; - } else { + // Match the shift amount operands for a rotate pattern. This always matches + // a subtraction on the R operand. + auto matchShiftAmount = [](Value *L, Value *R, unsigned Width) -> Value * { + // The shift amounts may add up to the narrow bit width: + // (shl ShVal, L) | (lshr ShVal, Width - L) + if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L))))) + return L; + + // The shift amount may be masked with negation: + // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1))) + Value *X; + unsigned Mask = Width - 1; + if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) && + match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))) + return X; + + // Same as above, but the shift amount may be extended after masking: + if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) && + match(R, m_ZExt(m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))) + return X; + return nullptr; + }; + + Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, NarrowWidth); + bool SubIsOnLHS = false; + if (!ShAmt) { + ShAmt = matchShiftAmount(ShAmt1, ShAmt0, NarrowWidth); + SubIsOnLHS = true; } + if (!ShAmt) + return nullptr; // The shifted value must have high zeros in the wide type. Typically, this // will be a zext, but it could also be the result of an 'and' or 'shift'. @@ -1084,12 +1107,9 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { Value *Src = CI.getOperand(0); Type *SrcTy = Src->getType(), *DestTy = CI.getType(); - // Attempt to extend the entire input expression tree to the destination - // type. Only do this if the dest type is a simple type, don't convert the - // expression tree to something weird like i93 unless the source is also - // strange. + // Try to extend the entire expression tree to the wide destination type. unsigned BitsToClear; - if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) && + if (shouldChangeType(SrcTy, DestTy) && canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { assert(BitsToClear <= SrcTy->getScalarSizeInBits() && "Can't clear more bits than in SrcTy"); @@ -1366,12 +1386,8 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { return replaceInstUsesWith(CI, ZExt); } - // Attempt to extend the entire input expression tree to the destination - // type. Only do this if the dest type is a simple type, don't convert the - // expression tree to something weird like i93 unless the source is also - // strange. - if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) && - canEvaluateSExtd(Src, DestTy)) { + // Try to extend the entire expression tree to the wide destination type. + if (shouldChangeType(SrcTy, DestTy) && canEvaluateSExtd(Src, DestTy)) { // Okay, we can transform this! Insert the new expression now. LLVM_DEBUG( dbgs() << "ICE: EvaluateInDifferentType converting expression type" diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 2ba1174517ffe662b7b8258ffa5421963b45c597..b5bbb09935e2731e378164a89565f1ea8346522e 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -522,11 +522,9 @@ static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC, } // Otherwise, there is an index. The computation we will do will be modulo - // the pointer size, so get it. - uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth); - - Offset &= PtrSizeMask; - VariableScale &= PtrSizeMask; + // the pointer size. + Offset = SignExtend64(Offset, IntPtrWidth); + VariableScale = SignExtend64(VariableScale, IntPtrWidth); // To do this transformation, any constant index must be a multiple of the // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i", @@ -1335,17 +1333,12 @@ Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) { return nullptr; } -// Fold icmp Pred X, C. +/// Fold icmp Pred X, C. +/// TODO: This code structure does not make sense. The saturating add fold +/// should be moved to some other helper and extended as noted below (it is also +/// possible that code has been made unnecessary - do we canonicalize IR to +/// overflow/saturating intrinsics or not?). Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) { - CmpInst::Predicate Pred = Cmp.getPredicate(); - Value *X = Cmp.getOperand(0); - - const APInt *C; - if (!match(Cmp.getOperand(1), m_APInt(C))) - return nullptr; - - Value *A = nullptr, *B = nullptr; - // Match the following pattern, which is a common idiom when writing // overflow-safe integer arithmetic functions. The source performs an addition // in wider type and explicitly checks for overflow using comparisons against @@ -1357,37 +1350,62 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) { // // sum = a + b // if (sum+128 >u 255) ... -> llvm.sadd.with.overflow.i8 - { - ConstantInt *CI2; // I = icmp ugt (add (add A, B), CI2), CI - if (Pred == ICmpInst::ICMP_UGT && - match(X, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2)))) - if (Instruction *Res = processUGT_ADDCST_ADD( - Cmp, A, B, CI2, cast(Cmp.getOperand(1)), *this)) - return Res; - } + CmpInst::Predicate Pred = Cmp.getPredicate(); + Value *Op0 = Cmp.getOperand(0), *Op1 = Cmp.getOperand(1); + Value *A, *B; + ConstantInt *CI, *CI2; // I = icmp ugt (add (add A, B), CI2), CI + if (Pred == ICmpInst::ICMP_UGT && match(Op1, m_ConstantInt(CI)) && + match(Op0, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2)))) + if (Instruction *Res = processUGT_ADDCST_ADD(Cmp, A, B, CI2, CI, *this)) + return Res; - // FIXME: Use m_APInt to allow folds for splat constants. - ConstantInt *CI = dyn_cast(Cmp.getOperand(1)); - if (!CI) + return nullptr; +} + +/// Canonicalize icmp instructions based on dominating conditions. +Instruction *InstCombiner::foldICmpWithDominatingICmp(ICmpInst &Cmp) { + // This is a cheap/incomplete check for dominance - just match a single + // predecessor with a conditional branch. + BasicBlock *CmpBB = Cmp.getParent(); + BasicBlock *DomBB = CmpBB->getSinglePredecessor(); + if (!DomBB) return nullptr; - // Canonicalize icmp instructions based on dominating conditions. - BasicBlock *Parent = Cmp.getParent(); - BasicBlock *Dom = Parent->getSinglePredecessor(); - auto *BI = Dom ? dyn_cast(Dom->getTerminator()) : nullptr; - ICmpInst::Predicate Pred2; + Value *DomCond; BasicBlock *TrueBB, *FalseBB; - ConstantInt *CI2; - if (BI && match(BI, m_Br(m_ICmp(Pred2, m_Specific(X), m_ConstantInt(CI2)), - TrueBB, FalseBB)) && - TrueBB != FalseBB) { - ConstantRange CR = - ConstantRange::makeAllowedICmpRegion(Pred, CI->getValue()); + if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB))) + return nullptr; + + assert((TrueBB == CmpBB || FalseBB == CmpBB) && + "Predecessor block does not point to successor?"); + + // The branch should get simplified. Don't bother simplifying this condition. + if (TrueBB == FalseBB) + return nullptr; + + // Try to simplify this compare to T/F based on the dominating condition. + Optional Imp = isImpliedCondition(DomCond, &Cmp, DL, TrueBB == CmpBB); + if (Imp) + return replaceInstUsesWith(Cmp, ConstantInt::get(Cmp.getType(), *Imp)); + + CmpInst::Predicate Pred = Cmp.getPredicate(); + Value *X = Cmp.getOperand(0), *Y = Cmp.getOperand(1); + ICmpInst::Predicate DomPred; + const APInt *C, *DomC; + if (match(DomCond, m_ICmp(DomPred, m_Specific(X), m_APInt(DomC))) && + match(Y, m_APInt(C))) { + // We have 2 compares of a variable with constants. Calculate the constant + // ranges of those compares to see if we can transform the 2nd compare: + // DomBB: + // DomCond = icmp DomPred X, DomC + // br DomCond, CmpBB, FalseBB + // CmpBB: + // Cmp = icmp Pred X, C + ConstantRange CR = ConstantRange::makeAllowedICmpRegion(Pred, *C); ConstantRange DominatingCR = - (Parent == TrueBB) - ? ConstantRange::makeExactICmpRegion(Pred2, CI2->getValue()) - : ConstantRange::makeExactICmpRegion( - CmpInst::getInversePredicate(Pred2), CI2->getValue()); + (CmpBB == TrueBB) ? ConstantRange::makeExactICmpRegion(DomPred, *DomC) + : ConstantRange::makeExactICmpRegion( + CmpInst::getInversePredicate(DomPred), *DomC); ConstantRange Intersection = DominatingCR.intersectWith(CR); ConstantRange Difference = DominatingCR.difference(CR); if (Intersection.isEmptySet()) @@ -1395,23 +1413,20 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) { if (Difference.isEmptySet()) return replaceInstUsesWith(Cmp, Builder.getTrue()); - // If this is a normal comparison, it demands all bits. If it is a sign - // bit comparison, it only demands the sign bit. - bool UnusedBit; - bool IsSignBit = isSignBitCheck(Pred, CI->getValue(), UnusedBit); - // Canonicalizing a sign bit comparison that gets used in a branch, // pessimizes codegen by generating branch on zero instruction instead // of a test and branch. So we avoid canonicalizing in such situations // because test and branch instruction has better branch displacement // than compare and branch instruction. + bool UnusedBit; + bool IsSignBit = isSignBitCheck(Pred, *C, UnusedBit); if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp))) return nullptr; - if (auto *AI = Intersection.getSingleElement()) - return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*AI)); - if (auto *AD = Difference.getSingleElement()) - return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*AD)); + if (const APInt *EqC = Intersection.getSingleElement()) + return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*EqC)); + if (const APInt *NeC = Difference.getSingleElement()) + return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*NeC)); } return nullptr; @@ -2750,6 +2765,7 @@ Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp, // Handle icmp {eq|ne} , Constant. Type *Ty = II->getType(); + unsigned BitWidth = C.getBitWidth(); switch (II->getIntrinsicID()) { case Intrinsic::bswap: Worklist.Add(II); @@ -2758,21 +2774,39 @@ Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp, return &Cmp; case Intrinsic::ctlz: - case Intrinsic::cttz: + case Intrinsic::cttz: { // ctz(A) == bitwidth(A) -> A == 0 and likewise for != - if (C == C.getBitWidth()) { + if (C == BitWidth) { Worklist.Add(II); Cmp.setOperand(0, II->getArgOperand(0)); Cmp.setOperand(1, ConstantInt::getNullValue(Ty)); return &Cmp; } + + // ctz(A) == C -> A & Mask1 == Mask2, where Mask2 only has bit C set + // and Mask1 has bits 0..C+1 set. Similar for ctl, but for high bits. + // Limit to one use to ensure we don't increase instruction count. + unsigned Num = C.getLimitedValue(BitWidth); + if (Num != BitWidth && II->hasOneUse()) { + bool IsTrailing = II->getIntrinsicID() == Intrinsic::cttz; + APInt Mask1 = IsTrailing ? APInt::getLowBitsSet(BitWidth, Num + 1) + : APInt::getHighBitsSet(BitWidth, Num + 1); + APInt Mask2 = IsTrailing + ? APInt::getOneBitSet(BitWidth, Num) + : APInt::getOneBitSet(BitWidth, BitWidth - Num - 1); + Cmp.setOperand(0, Builder.CreateAnd(II->getArgOperand(0), Mask1)); + Cmp.setOperand(1, ConstantInt::get(Ty, Mask2)); + Worklist.Add(II); + return &Cmp; + } break; + } case Intrinsic::ctpop: { // popcount(A) == 0 -> A == 0 and likewise for != // popcount(A) == bitwidth(A) -> A == -1 and likewise for != bool IsZero = C.isNullValue(); - if (IsZero || C == C.getBitWidth()) { + if (IsZero || C == BitWidth) { Worklist.Add(II); Cmp.setOperand(0, II->getArgOperand(0)); auto *NewOp = @@ -2955,12 +2989,20 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I, // x & (-1 >> y) s>= x -> x s<= (-1 >> y) if (X != I.getOperand(1)) // X must be on RHS of comparison! return nullptr; // Ignore the other case. + if (!match(M, m_Constant())) // Can not do this fold with non-constant. + return nullptr; + if (!match(M, m_NonNegative())) // Must not have any -1 vector elements. + return nullptr; DstPred = ICmpInst::Predicate::ICMP_SLE; break; case ICmpInst::Predicate::ICMP_SLT: // x & (-1 >> y) s< x -> x s> (-1 >> y) if (X != I.getOperand(1)) // X must be on RHS of comparison! return nullptr; // Ignore the other case. + if (!match(M, m_Constant())) // Can not do this fold with non-constant. + return nullptr; + if (!match(M, m_NonNegative())) // Must not have any -1 vector elements. + return nullptr; DstPred = ICmpInst::Predicate::ICMP_SGT; break; case ICmpInst::Predicate::ICMP_SLE: @@ -4765,6 +4807,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpWithConstant(I)) return Res; + if (Instruction *Res = foldICmpWithDominatingICmp(I)) + return Res; + if (Instruction *Res = foldICmpUsingKnownBits(I)) return Res; diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 431856c9e00609f45623383138556585d8f8b0fe..e507630f9ea0b2722bb356913c02c7ab868b611d 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -857,6 +857,7 @@ private: Instruction *foldICmpWithCastAndCast(ICmpInst &ICI); Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp); + Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp); Instruction *foldICmpWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp); @@ -919,8 +920,11 @@ private: Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi, bool isSigned, bool Inside); Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI); - Instruction *MatchBSwap(BinaryOperator &I); - bool SimplifyStoreAtEndOfBlock(StoreInst &SI); + bool mergeStoreIntoSuccessor(StoreInst &SI); + + /// Given an 'or' instruction, check to see if it is part of a bswap idiom. + /// If so, return the equivalent bswap intrinsic. + Instruction *matchBSwap(BinaryOperator &Or); Instruction *SimplifyAnyMemTransfer(AnyMemTransferInst *MI); Instruction *SimplifyAnyMemSet(AnyMemSetInst *MI); diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 1ff7037c8d2d2b67ff8163ba0d35321bc9f14846..02ebb5ef294681473a8a4592ccba602c4b195944 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -19,6 +19,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" @@ -1498,64 +1499,45 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (isa(Val)) return eraseInstFromFunction(SI); - // If this store is the last instruction in the basic block (possibly - // excepting debug info instructions), and if the block ends with an - // unconditional branch, try to move it to the successor block. + // If this store is the second-to-last instruction in the basic block + // (excluding debug info and bitcasts of pointers) and if the block ends with + // an unconditional branch, try to move the store to the successor block. BBI = SI.getIterator(); do { ++BBI; } while (isa(BBI) || (isa(BBI) && BBI->getType()->isPointerTy())); + if (BranchInst *BI = dyn_cast(BBI)) if (BI->isUnconditional()) - if (SimplifyStoreAtEndOfBlock(SI)) - return nullptr; // xform done! + mergeStoreIntoSuccessor(SI); return nullptr; } -/// SimplifyStoreAtEndOfBlock - Turn things like: +/// Try to transform: /// if () { *P = v1; } else { *P = v2 } -/// into a phi node with a store in the successor. -/// -/// Simplify things like: +/// or: /// *P = v1; if () { *P = v2; } /// into a phi node with a store in the successor. -/// -bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { +bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) { assert(SI.isUnordered() && - "this code has not been auditted for volatile or ordered store case"); + "This code has not been audited for volatile or ordered store case."); + // Check if the successor block has exactly 2 incoming edges. BasicBlock *StoreBB = SI.getParent(); - - // Check to see if the successor block has exactly two incoming edges. If - // so, see if the other predecessor contains a store to the same location. - // if so, insert a PHI node (if needed) and move the stores down. BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0); - - // Determine whether Dest has exactly two predecessors and, if so, compute - // the other predecessor. - pred_iterator PI = pred_begin(DestBB); - BasicBlock *P = *PI; - BasicBlock *OtherBB = nullptr; - - if (P != StoreBB) - OtherBB = P; - - if (++PI == pred_end(DestBB)) + if (!DestBB->hasNPredecessors(2)) return false; - P = *PI; - if (P != StoreBB) { - if (OtherBB) - return false; - OtherBB = P; - } - if (++PI != pred_end(DestBB)) - return false; + // Capture the other block (the block that doesn't contain our store). + pred_iterator PredIter = pred_begin(DestBB); + if (*PredIter == StoreBB) + ++PredIter; + BasicBlock *OtherBB = *PredIter; - // Bail out if all the relevant blocks aren't distinct (this can happen, - // for example, if SI is in an infinite loop) + // Bail out if all of the relevant blocks aren't distinct. This can happen, + // for example, if SI is in an infinite loop. if (StoreBB == DestBB || OtherBB == DestBB) return false; @@ -1566,7 +1548,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { return false; // If the other block ends in an unconditional branch, check for the 'if then - // else' case. there is an instruction before the branch. + // else' case. There is an instruction before the branch. StoreInst *OtherStore = nullptr; if (OtherBr->isUnconditional()) { --BBI; @@ -1591,7 +1573,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { return false; // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an - // if/then triangle. See if there is a store to the same ptr as SI that + // if/then triangle. See if there is a store to the same ptr as SI that // lives in OtherBB. for (;; --BBI) { // Check to see if we find the matching store. @@ -1602,15 +1584,14 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { break; } // If we find something that may be using or overwriting the stored - // value, or if we run out of instructions, we can't do the xform. + // value, or if we run out of instructions, we can't do the transform. if (BBI->mayReadFromMemory() || BBI->mayThrow() || BBI->mayWriteToMemory() || BBI == OtherBB->begin()) return false; } - // In order to eliminate the store in OtherBr, we have to - // make sure nothing reads or overwrites the stored value in - // StoreBB. + // In order to eliminate the store in OtherBr, we have to make sure nothing + // reads or overwrites the stored value in StoreBB. for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) { // FIXME: This should really be AA driven. if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory()) @@ -1620,24 +1601,24 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { // Insert a PHI node now if we need it. Value *MergedVal = OtherStore->getOperand(0); + // The debug locations of the original instructions might differ. Merge them. + DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(), + OtherStore->getDebugLoc()); if (MergedVal != SI.getOperand(0)) { PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge"); PN->addIncoming(SI.getOperand(0), SI.getParent()); PN->addIncoming(OtherStore->getOperand(0), OtherBB); MergedVal = InsertNewInstBefore(PN, DestBB->front()); + PN->setDebugLoc(MergedLoc); } - // Advance to a place where it is safe to insert the new store and - // insert it. + // Advance to a place where it is safe to insert the new store and insert it. BBI = DestBB->getFirstInsertionPt(); StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1), - SI.isVolatile(), - SI.getAlignment(), - SI.getOrdering(), - SI.getSyncScopeID()); + SI.isVolatile(), SI.getAlignment(), + SI.getOrdering(), SI.getSyncScopeID()); InsertNewInstBefore(NewSI, *BBI); - // The debug locations of the original instructions might differ; merge them. - NewSI->applyMergedLocation(SI.getDebugLoc(), OtherStore->getDebugLoc()); + NewSI->setDebugLoc(MergedLoc); // If the two stores had AA tags, merge them. AAMDNodes AATags; diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 88a72bb8eb57e1e87d3fe6ce4019d64fdb1947c4..19858ae149aba951db98330eb0823061342cc022 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1546,6 +1546,66 @@ static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS, return SelectInst::Create(CmpABC, MinMaxOp, ThirdOp); } +/// Try to reduce a rotate pattern that includes a compare and select into a +/// sequence of ALU ops only. Example: +/// rotl32(a, b) --> (b == 0 ? a : ((a >> (32 - b)) | (a << b))) +/// --> (a >> (-b & 31)) | (a << (b & 31)) +static Instruction *foldSelectRotate(SelectInst &Sel, + InstCombiner::BuilderTy &Builder) { + // The false value of the select must be a rotate of the true value. + Value *Or0, *Or1; + if (!match(Sel.getFalseValue(), m_OneUse(m_Or(m_Value(Or0), m_Value(Or1))))) + return nullptr; + + Value *TVal = Sel.getTrueValue(); + Value *SA0, *SA1; + if (!match(Or0, m_OneUse(m_LogicalShift(m_Specific(TVal), m_Value(SA0)))) || + !match(Or1, m_OneUse(m_LogicalShift(m_Specific(TVal), m_Value(SA1))))) + return nullptr; + + auto ShiftOpcode0 = cast(Or0)->getOpcode(); + auto ShiftOpcode1 = cast(Or1)->getOpcode(); + if (ShiftOpcode0 == ShiftOpcode1) + return nullptr; + + // We have one of these patterns so far: + // select ?, TVal, (or (lshr TVal, SA0), (shl TVal, SA1)) + // select ?, TVal, (or (shl TVal, SA0), (lshr TVal, SA1)) + // This must be a power-of-2 rotate for a bitmasking transform to be valid. + unsigned Width = Sel.getType()->getScalarSizeInBits(); + if (!isPowerOf2_32(Width)) + return nullptr; + + // Check the shift amounts to see if they are an opposite pair. + Value *ShAmt; + if (match(SA1, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA0))))) + ShAmt = SA0; + else if (match(SA0, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA1))))) + ShAmt = SA1; + else + return nullptr; + + // Finally, see if the select is filtering out a shift-by-zero. + Value *Cond = Sel.getCondition(); + ICmpInst::Predicate Pred; + if (!match(Cond, m_OneUse(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()))) || + Pred != ICmpInst::ICMP_EQ) + return nullptr; + + // This is a rotate that avoids shift-by-bitwidth UB in a suboptimal way. + // Convert to safely bitmasked shifts. + // TODO: When we can canonicalize to funnel shift intrinsics without risk of + // performance regressions, replace this sequence with that call. + Value *NegShAmt = Builder.CreateNeg(ShAmt); + Value *MaskedShAmt = Builder.CreateAnd(ShAmt, Width - 1); + Value *MaskedNegShAmt = Builder.CreateAnd(NegShAmt, Width - 1); + Value *NewSA0 = ShAmt == SA0 ? MaskedShAmt : MaskedNegShAmt; + Value *NewSA1 = ShAmt == SA1 ? MaskedShAmt : MaskedNegShAmt; + Value *NewSh0 = Builder.CreateBinOp(ShiftOpcode0, TVal, NewSA0); + Value *NewSh1 = Builder.CreateBinOp(ShiftOpcode1, TVal, NewSA1); + return BinaryOperator::CreateOr(NewSh0, NewSh1); +} + Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { Value *CondVal = SI.getCondition(); Value *TrueVal = SI.getTrueValue(); @@ -1806,8 +1866,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // MAX(~a, C) -> ~MIN(a, ~C) // MIN(~a, ~b) -> ~MAX(a, b) // MIN(~a, C) -> ~MAX(a, ~C) - auto moveNotAfterMinMax = [&](Value *X, Value *Y, - bool Swapped) -> Instruction * { + auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * { Value *A; if (match(X, m_Not(m_Value(A))) && !X->hasNUsesOrMore(3) && !IsFreeToInvert(A, A->hasOneUse()) && @@ -1820,14 +1879,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (MDNode *MD = SI.getMetadata(LLVMContext::MD_prof)) { cast(NewMinMax)->setMetadata(LLVMContext::MD_prof, MD); // Swap the metadata if the operands are swapped. - if (Swapped) { - assert(X == SI.getFalseValue() && Y == SI.getTrueValue() && - "Unexpected operands."); + if (X == SI.getFalseValue() && Y == SI.getTrueValue()) cast(NewMinMax)->swapProfMetadata(); - } else { - assert(X == SI.getTrueValue() && Y == SI.getFalseValue() && - "Unexpected operands."); - } } return BinaryOperator::CreateNot(NewMinMax); @@ -1836,9 +1889,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return nullptr; }; - if (Instruction *I = moveNotAfterMinMax(LHS, RHS, /*Swapped*/false)) + if (Instruction *I = moveNotAfterMinMax(LHS, RHS)) return I; - if (Instruction *I = moveNotAfterMinMax(RHS, LHS, /*Swapped*/true)) + if (Instruction *I = moveNotAfterMinMax(RHS, LHS)) return I; if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder)) @@ -1968,24 +2021,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } } - // See if we can determine the result of this select based on a dominating - // condition. - BasicBlock *Parent = SI.getParent(); - if (BasicBlock *Dom = Parent->getSinglePredecessor()) { - auto *PBI = dyn_cast_or_null(Dom->getTerminator()); - if (PBI && PBI->isConditional() && - PBI->getSuccessor(0) != PBI->getSuccessor(1) && - (PBI->getSuccessor(0) == Parent || PBI->getSuccessor(1) == Parent)) { - bool CondIsTrue = PBI->getSuccessor(0) == Parent; - Optional Implication = isImpliedCondition( - PBI->getCondition(), SI.getCondition(), DL, CondIsTrue); - if (Implication) { - Value *V = *Implication ? TrueVal : FalseVal; - return replaceInstUsesWith(SI, V); - } - } - } - // If we can compute the condition, there's no need for a select. // Like the above fold, we are attempting to reduce compile-time cost by // putting this fold here with limitations rather than in InstSimplify. @@ -2010,5 +2045,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *Select = foldSelectBinOpIdentity(SI, TLI)) return Select; + if (Instruction *Rot = foldSelectRotate(SI, Builder)) + return Rot; + return nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 45cacc73d63becb324ffabe1d4d5425a349e2873..a193dde1c39517e4c971aca666ac9e9709a67328 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -690,6 +690,30 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // TODO: Could compute known zero/one bits based on the input. break; } + case Intrinsic::fshr: + case Intrinsic::fshl: { + const APInt *SA; + if (!match(I->getOperand(2), m_APInt(SA))) + break; + + // Normalize to funnel shift left. APInt shifts of BitWidth are well- + // defined, so no need to special-case zero shifts here. + uint64_t ShiftAmt = SA->urem(BitWidth); + if (II->getIntrinsicID() == Intrinsic::fshr) + ShiftAmt = BitWidth - ShiftAmt; + + APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt)); + APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt)); + if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) || + SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1)) + return I; + + Known.Zero = LHSKnown.Zero.shl(ShiftAmt) | + RHSKnown.Zero.lshr(BitWidth - ShiftAmt); + Known.One = LHSKnown.One.shl(ShiftAmt) | + RHSKnown.One.lshr(BitWidth - ShiftAmt); + break; + } case Intrinsic::x86_mmx_pmovmskb: case Intrinsic::x86_sse_movmsk_ps: case Intrinsic::x86_sse2_movmsk_pd: diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 21dd7ed227af476fac845aeb974e4b62dd98164c..0ad1fc0e791f140a28c910ae44bf17570143652d 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -46,40 +46,34 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" /// Return true if the value is cheaper to scalarize than it is to leave as a -/// vector operation. isConstant indicates whether we're extracting one known -/// element. If false we're extracting a variable index. -static bool cheapToScalarize(Value *V, bool isConstant) { - if (Constant *C = dyn_cast(V)) { - if (isConstant) return true; +/// vector operation. IsConstantExtractIndex indicates whether we are extracting +/// one known element from a vector constant. +/// +/// FIXME: It's possible to create more instructions than previously existed. +static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) { + // If we can pick a scalar constant value out of a vector, that is free. + if (auto *C = dyn_cast(V)) + return IsConstantExtractIndex || C->getSplatValue(); + + // An insertelement to the same constant index as our extract will simplify + // to the scalar inserted element. An insertelement to a different constant + // index is irrelevant to our extract. + if (match(V, m_InsertElement(m_Value(), m_Value(), m_ConstantInt()))) + return IsConstantExtractIndex; + + if (match(V, m_OneUse(m_Load(m_Value())))) + return true; - // If all elts are the same, we can extract it and use any of the values. - if (Constant *Op0 = C->getAggregateElement(0U)) { - for (unsigned i = 1, e = V->getType()->getVectorNumElements(); i != e; - ++i) - if (C->getAggregateElement(i) != Op0) - return false; + Value *V0, *V1; + if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1))))) + if (cheapToScalarize(V0, IsConstantExtractIndex) || + cheapToScalarize(V1, IsConstantExtractIndex)) return true; - } - } - Instruction *I = dyn_cast(V); - if (!I) return false; - // Insert element gets simplified to the inserted element or is deleted if - // this is constant idx extract element and its a constant idx insertelt. - if (I->getOpcode() == Instruction::InsertElement && isConstant && - isa(I->getOperand(2))) - return true; - if (I->getOpcode() == Instruction::Load && I->hasOneUse()) - return true; - if (BinaryOperator *BO = dyn_cast(I)) - if (BO->hasOneUse() && - (cheapToScalarize(BO->getOperand(0), isConstant) || - cheapToScalarize(BO->getOperand(1), isConstant))) - return true; - if (CmpInst *CI = dyn_cast(I)) - if (CI->hasOneUse() && - (cheapToScalarize(CI->getOperand(0), isConstant) || - cheapToScalarize(CI->getOperand(1), isConstant))) + CmpInst::Predicate UnusedPred; + if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1))))) + if (cheapToScalarize(V0, IsConstantExtractIndex) || + cheapToScalarize(V1, IsConstantExtractIndex)) return true; return false; @@ -261,34 +255,30 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext, } Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { - if (Value *V = SimplifyExtractElementInst(EI.getVectorOperand(), - EI.getIndexOperand(), + Value *SrcVec = EI.getVectorOperand(); + Value *Index = EI.getIndexOperand(); + if (Value *V = SimplifyExtractElementInst(SrcVec, Index, SQ.getWithInstruction(&EI))) return replaceInstUsesWith(EI, V); - // If vector val is constant with all elements the same, replace EI with - // that element. We handle a known element # below. - if (Constant *C = dyn_cast(EI.getOperand(0))) - if (cheapToScalarize(C, false)) - return replaceInstUsesWith(EI, C->getAggregateElement(0U)); - // If extracting a specified index from the vector, see if we can recursively // find a previously computed scalar that was inserted into the vector. - if (ConstantInt *IdxC = dyn_cast(EI.getOperand(1))) { + auto *IndexC = dyn_cast(Index); + if (IndexC) { unsigned NumElts = EI.getVectorOperandType()->getNumElements(); // InstSimplify should handle cases where the index is invalid. - if (!IdxC->getValue().ule(NumElts)) + if (!IndexC->getValue().ule(NumElts)) return nullptr; // This instruction only demands the single element from the input vector. // If the input vector has a single use, simplify it based on this use // property. - if (EI.getOperand(0)->hasOneUse() && NumElts != 1) { + if (SrcVec->hasOneUse() && NumElts != 1) { APInt UndefElts(NumElts, 0); - APInt DemandedMask(NumElts, 0); - DemandedMask.setBit(IdxC->getZExtValue()); - if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0), DemandedMask, + APInt DemandedElts(NumElts, 0); + DemandedElts.setBit(IndexC->getZExtValue()); + if (Value *V = SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) { EI.setOperand(0, V); return &EI; @@ -300,43 +290,46 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // If there's a vector PHI feeding a scalar use through this extractelement // instruction, try to scalarize the PHI. - if (PHINode *PN = dyn_cast(EI.getOperand(0))) { - Instruction *scalarPHI = scalarizePHI(EI, PN); - if (scalarPHI) - return scalarPHI; - } + if (auto *Phi = dyn_cast(SrcVec)) + if (Instruction *ScalarPHI = scalarizePHI(EI, Phi)) + return ScalarPHI; } - if (Instruction *I = dyn_cast(EI.getOperand(0))) { - // Push extractelement into predecessor operation if legal and - // profitable to do so. - if (BinaryOperator *BO = dyn_cast(I)) { - if (I->hasOneUse() && - cheapToScalarize(BO, isa(EI.getOperand(1)))) { - Value *newEI0 = - Builder.CreateExtractElement(BO->getOperand(0), EI.getOperand(1), - EI.getName()+".lhs"); - Value *newEI1 = - Builder.CreateExtractElement(BO->getOperand(1), EI.getOperand(1), - EI.getName()+".rhs"); - return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), - newEI0, newEI1, BO); - } - } else if (InsertElementInst *IE = dyn_cast(I)) { + BinaryOperator *BO; + if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) { + // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index) + Value *X = BO->getOperand(0), *Y = BO->getOperand(1); + Value *E0 = Builder.CreateExtractElement(X, Index); + Value *E1 = Builder.CreateExtractElement(Y, Index); + return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO); + } + + Value *X, *Y; + CmpInst::Predicate Pred; + if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) && + cheapToScalarize(SrcVec, IndexC)) { + // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index) + Value *E0 = Builder.CreateExtractElement(X, Index); + Value *E1 = Builder.CreateExtractElement(Y, Index); + return CmpInst::Create(cast(SrcVec)->getOpcode(), Pred, E0, E1); + } + + if (auto *I = dyn_cast(SrcVec)) { + if (auto *IE = dyn_cast(I)) { // Extracting the inserted element? - if (IE->getOperand(2) == EI.getOperand(1)) + if (IE->getOperand(2) == Index) return replaceInstUsesWith(EI, IE->getOperand(1)); // If the inserted and extracted elements are constants, they must not // be the same value, extract from the pre-inserted value instead. - if (isa(IE->getOperand(2)) && isa(EI.getOperand(1))) { - Worklist.AddValue(EI.getOperand(0)); + if (isa(IE->getOperand(2)) && IndexC) { + Worklist.AddValue(SrcVec); EI.setOperand(0, IE->getOperand(0)); return &EI; } - } else if (ShuffleVectorInst *SVI = dyn_cast(I)) { + } else if (auto *SVI = dyn_cast(I)) { // If this is extracting an element from a shufflevector, figure out where // it came from and extract from the appropriate input element instead. - if (ConstantInt *Elt = dyn_cast(EI.getOperand(1))) { + if (auto *Elt = dyn_cast(Index)) { int SrcIdx = SVI->getMaskValue(Elt->getZExtValue()); Value *Src; unsigned LHSWidth = @@ -355,13 +348,12 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { ConstantInt::get(Int32Ty, SrcIdx, false)); } - } else if (CastInst *CI = dyn_cast(I)) { + } else if (auto *CI = dyn_cast(I)) { // Canonicalize extractelement(cast) -> cast(extractelement). // Bitcasts can change the number of vector elements, and they cost // nothing. if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) { - Value *EE = Builder.CreateExtractElement(CI->getOperand(0), - EI.getIndexOperand()); + Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index); Worklist.AddValue(EE); return CastInst::Create(CI->getOpcode(), EE, EI.getType()); } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index a3962a04b500cbfa0fbf169f1e9cf6aa074a3c19..be7d43bbcf2c32fbfe25920e40f20b24fdef4369 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -183,7 +183,10 @@ bool InstCombiner::shouldChangeType(unsigned FromWidth, /// a fundamental type in IR, and there are many specialized optimizations for /// i1 types. bool InstCombiner::shouldChangeType(Type *From, Type *To) const { - assert(From->isIntegerTy() && To->isIntegerTy()); + // TODO: This could be extended to allow vectors. Datalayout changes might be + // needed to properly support that. + if (!From->isIntegerTy() || !To->isIntegerTy()) + return false; unsigned FromWidth = From->getPrimitiveSizeInBits(); unsigned ToWidth = To->getPrimitiveSizeInBits(); @@ -1424,21 +1427,23 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) { V1->getType()->getVectorNumElements() <= NumElts) { assert(Inst.getType()->getScalarType() == V1->getType()->getScalarType() && "Shuffle should not change scalar type"); - unsigned V1Width = V1->getType()->getVectorNumElements(); + // Find constant NewC that has property: // shuffle(NewC, ShMask) = C // If such constant does not exist (example: ShMask=<0,0> and C=<1,2>) // reorder is not possible. A 1-to-1 mapping is not required. Example: // ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = + bool ConstOp1 = isa(RHS); SmallVector ShMask; ShuffleVectorInst::getShuffleMask(Mask, ShMask); - SmallVector - NewVecC(V1Width, UndefValue::get(C->getType()->getScalarType())); + unsigned SrcVecNumElts = V1->getType()->getVectorNumElements(); + UndefValue *UndefScalar = UndefValue::get(C->getType()->getScalarType()); + SmallVector NewVecC(SrcVecNumElts, UndefScalar); bool MayChange = true; for (unsigned I = 0; I < NumElts; ++I) { + Constant *CElt = C->getAggregateElement(I); if (ShMask[I] >= 0) { assert(ShMask[I] < (int)NumElts && "Not expecting narrowing shuffle"); - Constant *CElt = C->getAggregateElement(I); Constant *NewCElt = NewVecC[ShMask[I]]; // Bail out if: // 1. The constant vector contains a constant expression. @@ -1447,26 +1452,41 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) { // 3. This is a widening shuffle that copies elements of V1 into the // extended elements (extending with undef is allowed). if (!CElt || (!isa(NewCElt) && NewCElt != CElt) || - I >= V1Width) { + I >= SrcVecNumElts) { MayChange = false; break; } NewVecC[ShMask[I]] = CElt; } + // If this is a widening shuffle, we must be able to extend with undef + // elements. If the original binop does not produce an undef in the high + // lanes, then this transform is not safe. + // TODO: We could shuffle those non-undef constant values into the + // result by using a constant vector (rather than an undef vector) + // as operand 1 of the new binop, but that might be too aggressive + // for target-independent shuffle creation. + if (I >= SrcVecNumElts) { + Constant *MaybeUndef = + ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt) + : ConstantExpr::get(Opcode, CElt, UndefScalar); + if (!isa(MaybeUndef)) { + MayChange = false; + break; + } + } } if (MayChange) { Constant *NewC = ConstantVector::get(NewVecC); // It may not be safe to execute a binop on a vector with undef elements // because the entire instruction can be folded to undef or create poison // that did not exist in the original code. - bool ConstOp1 = isa(Inst.getOperand(1)); if (Inst.isIntDivRem() || (Inst.isShift() && ConstOp1)) NewC = getSafeVectorConstantForBinop(Opcode, NewC, ConstOp1); // Op(shuffle(V1, Mask), C) -> shuffle(Op(V1, NewC), Mask) // Op(C, shuffle(V1, Mask)) -> shuffle(Op(NewC, V1), Mask) - Value *NewLHS = isa(LHS) ? NewC : V1; - Value *NewRHS = isa(LHS) ? V1 : NewC; + Value *NewLHS = ConstOp1 ? V1 : NewC; + Value *NewRHS = ConstOp1 ? NewC : V1; return createBinOpShuffle(NewLHS, NewRHS, Mask); } } @@ -2244,9 +2264,16 @@ static bool isAllocSiteRemovable(Instruction *AI, } Instruction *InstCombiner::visitAllocSite(Instruction &MI) { - // If we have a malloc call which is only used in any amount of comparisons - // to null and free calls, delete the calls and replace the comparisons with - // true or false as appropriate. + // If we have a malloc call which is only used in any amount of comparisons to + // null and free calls, delete the calls and replace the comparisons with true + // or false as appropriate. + + // This is based on the principle that we can substitute our own allocation + // function (which will never return null) rather than knowledge of the + // specific function being called. In some sense this can change the permitted + // outputs of a program (when we convert a malloc to an alloca, the fact that + // the allocation is now on the stack is potentially visible, for example), + // but we believe in a permissible manner. SmallVector Users; // If we are removing an alloca with a dbg.declare, insert dbg.value calls diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 42b8179f80052704ef6f9e828625f2de34ce8f05..2ede46774b24ebff28080fefea5e8d0581710326 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -109,6 +109,7 @@ static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30; static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46; static const uint64_t kNetBSD_ShadowOffset32 = 1ULL << 30; static const uint64_t kNetBSD_ShadowOffset64 = 1ULL << 46; +static const uint64_t kNetBSDKasan_ShadowOffset64 = 0xdfff900000000000; static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40; static const uint64_t kWindowsShadowOffset32 = 3ULL << 28; @@ -344,10 +345,14 @@ static cl::opt ClForceExperiment( cl::init(0)); static cl::opt - ClUsePrivateAliasForGlobals("asan-use-private-alias", - cl::desc("Use private aliases for global" - " variables"), - cl::Hidden, cl::init(false)); + ClUsePrivateAlias("asan-use-private-alias", + cl::desc("Use private aliases for global variables"), + cl::Hidden, cl::init(false)); + +static cl::opt + ClUseOdrIndicator("asan-use-odr-indicator", + cl::desc("Use odr indicators to improve ODR reporting"), + cl::Hidden, cl::init(false)); static cl::opt ClUseGlobalsGC("asan-globals-live-support", @@ -436,8 +441,11 @@ public: for (auto MDN : Globals->operands()) { // Metadata node contains the global and the fields of "Entry". assert(MDN->getNumOperands() == 5); - auto *GV = mdconst::extract_or_null(MDN->getOperand(0)); + auto *V = mdconst::extract_or_null(MDN->getOperand(0)); // The optimizer may optimize away a global entirely. + if (!V) continue; + auto *StrippedV = V->stripPointerCasts(); + auto *GV = dyn_cast(StrippedV); if (!GV) continue; // We can already have an entry for GV if it was merged with another // global. @@ -540,9 +548,12 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, Mapping.Offset = kSystemZ_ShadowOffset64; else if (IsFreeBSD && !IsMIPS64) Mapping.Offset = kFreeBSD_ShadowOffset64; - else if (IsNetBSD) - Mapping.Offset = kNetBSD_ShadowOffset64; - else if (IsPS4CPU) + else if (IsNetBSD) { + if (IsKasan) + Mapping.Offset = kNetBSDKasan_ShadowOffset64; + else + Mapping.Offset = kNetBSD_ShadowOffset64; + } else if (IsPS4CPU) Mapping.Offset = kPS4CPU_ShadowOffset64; else if (IsLinux && IsX86_64) { if (IsKasan) @@ -731,9 +742,12 @@ public: explicit AddressSanitizerModule(bool CompileKernel = false, bool Recover = false, - bool UseGlobalsGC = true) - : ModulePass(ID), - UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), + bool UseGlobalsGC = true, + bool UseOdrIndicator = false) + : ModulePass(ID), UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), + // Enable aliases as they should have no downside with ODR indicators. + UsePrivateAlias(UseOdrIndicator || ClUsePrivateAlias), + UseOdrIndicator(UseOdrIndicator || ClUseOdrIndicator), // Not a typo: ClWithComdat is almost completely pointless without // ClUseGlobalsGC (because then it only works on modules without // globals, which are rare); it is a prerequisite for ClUseGlobalsGC; @@ -742,11 +756,10 @@ public: // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to // do globals-gc. UseCtorComdat(UseGlobalsGC && ClWithComdat) { - this->Recover = ClRecover.getNumOccurrences() > 0 ? - ClRecover : Recover; - this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ? - ClEnableKasan : CompileKernel; - } + this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover; + this->CompileKernel = + ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan : CompileKernel; + } bool runOnModule(Module &M) override; StringRef getPassName() const override { return "AddressSanitizerModule"; } @@ -790,6 +803,8 @@ private: bool CompileKernel; bool Recover; bool UseGlobalsGC; + bool UsePrivateAlias; + bool UseOdrIndicator; bool UseCtorComdat; Type *IntptrTy; LLVMContext *C; @@ -1089,9 +1104,11 @@ INITIALIZE_PASS( ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel, bool Recover, - bool UseGlobalsGC) { + bool UseGlobalsGC, + bool UseOdrIndicator) { assert(!CompileKernel || Recover); - return new AddressSanitizerModule(CompileKernel, Recover, UseGlobalsGC); + return new AddressSanitizerModule(CompileKernel, Recover, UseGlobalsGC, + UseOdrIndicator); } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -2173,12 +2190,20 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool bool CanUsePrivateAliases = TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO() || TargetTriple.isOSBinFormatWasm(); - if (CanUsePrivateAliases && ClUsePrivateAliasForGlobals) { + if (CanUsePrivateAliases && UsePrivateAlias) { // Create local alias for NewGlobal to avoid crash on ODR between // instrumented and non-instrumented libraries. - auto *GA = GlobalAlias::create(GlobalValue::InternalLinkage, - NameForGlobal + M.getName(), NewGlobal); + InstrumentedGlobal = + GlobalAlias::create(GlobalValue::PrivateLinkage, "", NewGlobal); + } + // ODR check is not useful for the following, but we see false reports + // caused by linker optimizations. + if (NewGlobal->hasLocalLinkage() || NewGlobal->hasLinkOnceODRLinkage() || + NewGlobal->hasWeakODRLinkage()) { + ODRIndicator = ConstantExpr::getIntToPtr(ConstantInt::get(IntptrTy, -1), + IRB.getInt8PtrTy()); + } else if (UseOdrIndicator) { // With local aliases, we need to provide another externally visible // symbol __odr_asan_XXX to detect ODR violation. auto *ODRIndicatorSym = @@ -2192,7 +2217,6 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass()); ODRIndicatorSym->setAlignment(1); ODRIndicator = ODRIndicatorSym; - InstrumentedGlobal = GA; } Constant *Initializer = ConstantStruct::get( diff --git a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 8f4159d3d191ce61ade22b1cd8b6aa03ccfe653a..1ada0b7130927aeed0efd624dde2d837de362e09 100644 --- a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -1416,7 +1416,7 @@ bool CHRScopeSorter(CHRScope *Scope1, CHRScope *Scope2) { void CHR::sortScopes(SmallVectorImpl &Input, SmallVectorImpl &Output) { Output.resize(Input.size()); - std::copy(Input.begin(), Input.end(), Output.begin()); + llvm::copy(Input, Output.begin()); std::stable_sort(Output.begin(), Output.end(), CHRScopeSorter); } @@ -2040,7 +2040,7 @@ bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) { getAnalysis().getBFI(); DominatorTree &DT = getAnalysis().getDomTree(); ProfileSummaryInfo &PSI = - *getAnalysis().getPSI(); + getAnalysis().getPSI(); RegionInfo &RI = getAnalysis().getRegionInfo(); std::unique_ptr OwnedORE = llvm::make_unique(&F); diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 084e6b7e43690d50033ce4d2d32396a3d123c5e9..829123911c61b4f6c54bc0a832d72b9bccf36925 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -36,6 +36,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" @@ -96,6 +97,11 @@ private: // profiling runtime to emit .gcda files when run. bool emitProfileArcs(); + bool isFunctionInstrumented(const Function &F); + std::vector createRegexesFromString(StringRef RegexesStr); + static bool doesFilenameMatchARegex(StringRef Filename, + std::vector &Regexes); + // Get pointers to the functions in the runtime library. Constant *getStartFileFunc(); Constant *getEmitFunctionFunc(); @@ -125,6 +131,9 @@ private: const TargetLibraryInfo *TLI; LLVMContext *Ctx; SmallVector, 16> Funcs; + std::vector FilterRe; + std::vector ExcludeRe; + StringMap InstrumentedFiles; }; class GCOVProfilerLegacyPass : public ModulePass { @@ -171,6 +180,21 @@ static StringRef getFunctionName(const DISubprogram *SP) { return SP->getName(); } +/// Extract a filename for a DISubprogram. +/// +/// Prefer relative paths in the coverage notes. Clang also may split +/// up absolute paths into a directory and filename component. When +/// the relative path doesn't exist, reconstruct the absolute path. +SmallString<128> getFilename(const DISubprogram *SP) { + SmallString<128> Path; + StringRef RelPath = SP->getFilename(); + if (sys::fs::exists(RelPath)) + Path = RelPath; + else + sys::path::append(Path, SP->getDirectory(), SP->getFilename()); + return Path; +} + namespace { class GCOVRecord { protected: @@ -247,7 +271,7 @@ namespace { } private: - StringRef Filename; + std::string Filename; SmallVector Lines; }; @@ -368,8 +392,9 @@ namespace { void writeOut() { writeBytes(FunctionTag, 4); + SmallString<128> Filename = getFilename(SP); uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) + - 1 + lengthOfGCOVString(SP->getFilename()) + 1; + 1 + lengthOfGCOVString(Filename) + 1; if (UseCfgChecksum) ++BlockLen; write(BlockLen); @@ -378,7 +403,7 @@ namespace { if (UseCfgChecksum) write(CfgChecksum); writeGCOVString(getFunctionName(SP)); - writeGCOVString(SP->getFilename()); + writeGCOVString(Filename); write(SP->getLine()); // Emit count of blocks. @@ -423,6 +448,72 @@ namespace { }; } +// RegexesStr is a string containing differents regex separated by a semi-colon. +// For example "foo\..*$;bar\..*$". +std::vector GCOVProfiler::createRegexesFromString(StringRef RegexesStr) { + std::vector Regexes; + while (!RegexesStr.empty()) { + std::pair HeadTail = RegexesStr.split(';'); + if (!HeadTail.first.empty()) { + Regex Re(HeadTail.first); + std::string Err; + if (!Re.isValid(Err)) { + Ctx->emitError(Twine("Regex ") + HeadTail.first + + " is not valid: " + Err); + } + Regexes.emplace_back(std::move(Re)); + } + RegexesStr = HeadTail.second; + } + return Regexes; +} + +bool GCOVProfiler::doesFilenameMatchARegex(StringRef Filename, + std::vector &Regexes) { + for (Regex &Re : Regexes) { + if (Re.match(Filename)) { + return true; + } + } + return false; +} + +bool GCOVProfiler::isFunctionInstrumented(const Function &F) { + if (FilterRe.empty() && ExcludeRe.empty()) { + return true; + } + SmallString<128> Filename = getFilename(F.getSubprogram()); + auto It = InstrumentedFiles.find(Filename); + if (It != InstrumentedFiles.end()) { + return It->second; + } + + SmallString<256> RealPath; + StringRef RealFilename; + + // Path can be + // /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h so for + // such a case we must get the real_path. + if (sys::fs::real_path(Filename, RealPath)) { + // real_path can fail with path like "foo.c". + RealFilename = Filename; + } else { + RealFilename = RealPath; + } + + bool ShouldInstrument; + if (FilterRe.empty()) { + ShouldInstrument = !doesFilenameMatchARegex(RealFilename, ExcludeRe); + } else if (ExcludeRe.empty()) { + ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe); + } else { + ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe) && + !doesFilenameMatchARegex(RealFilename, ExcludeRe); + } + InstrumentedFiles[Filename] = ShouldInstrument; + return ShouldInstrument; +} + std::string GCOVProfiler::mangleName(const DICompileUnit *CU, GCovFileType OutputType) { bool Notes = OutputType == GCovFileType::GCNO; @@ -472,6 +563,9 @@ bool GCOVProfiler::runOnModule(Module &M, const TargetLibraryInfo &TLI) { AddFlushBeforeForkAndExec(); + FilterRe = createRegexesFromString(Options.Filter); + ExcludeRe = createRegexesFromString(Options.Exclude); + if (Options.EmitNotes) emitProfileNotes(); if (Options.EmitData) return emitProfileArcs(); return false; @@ -589,7 +683,8 @@ void GCOVProfiler::emitProfileNotes() { for (auto &F : M->functions()) { DISubprogram *SP = F.getSubprogram(); if (!SP) continue; - if (!functionHasLines(F)) continue; + if (!functionHasLines(F) || !isFunctionInstrumented(F)) + continue; // TODO: Functions using scope-based EH are currently not supported. if (isUsingScopeBasedEH(F)) continue; @@ -609,7 +704,8 @@ void GCOVProfiler::emitProfileNotes() { // Add the function line number to the lines of the entry block // to have a counter for the function definition. uint32_t Line = SP->getLine(); - Func.getBlock(&EntryBlock).getFile(SP->getFilename()).addLine(Line); + auto Filename = getFilename(SP); + Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line); for (auto &BB : F) { GCOVBlock &Block = Func.getBlock(&BB); @@ -640,7 +736,7 @@ void GCOVProfiler::emitProfileNotes() { if (SP != getDISubprogram(Loc.getScope())) continue; - GCOVLines &Lines = Block.getFile(SP->getFilename()); + GCOVLines &Lines = Block.getFile(Filename); Lines.addLine(Loc.getLine()); } Line = 0; @@ -673,7 +769,8 @@ bool GCOVProfiler::emitProfileArcs() { for (auto &F : M->functions()) { DISubprogram *SP = F.getSubprogram(); if (!SP) continue; - if (!functionHasLines(F)) continue; + if (!functionHasLines(F) || !isFunctionInstrumented(F)) + continue; // TODO: Functions using scope-based EH are currently not supported. if (isUsingScopeBasedEH(F)) continue; if (!Result) Result = true; diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 91021604169a6936e642567291202fe15ec31b50..042e8ea0d2945fea576259ec523aa97442ef75b0 100644 --- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -309,15 +309,24 @@ bool HWAddressSanitizer::doInitialization(Module &M) { kHwasanInitName, /*InitArgTypes=*/{}, /*InitArgs=*/{}); - appendToGlobalCtors(M, HwasanCtorFunction, 0); - } + Comdat *CtorComdat = M.getOrInsertComdat(kHwasanModuleCtorName); + HwasanCtorFunction->setComdat(CtorComdat); + appendToGlobalCtors(M, HwasanCtorFunction, 0, HwasanCtorFunction); + + // Create a zero-length global in __hwasan_frame so that the linker will + // always create start and stop symbols. + // + // N.B. If we ever start creating associated metadata in this pass this + // global will need to be associated with the ctor. + Type *Int8Arr0Ty = ArrayType::get(Int8Ty, 0); + auto GV = + new GlobalVariable(M, Int8Arr0Ty, /*isConstantGlobal*/ true, + GlobalVariable::PrivateLinkage, + Constant::getNullValue(Int8Arr0Ty), "__hwasan"); + GV->setSection(getFrameSection()); + GV->setComdat(CtorComdat); + appendToCompilerUsed(M, GV); - // Create a call to __hwasan_init_frames. - if (HwasanCtorFunction) { - // Create a dummy frame description for the CTOR function. - // W/o it we would have to create the call to __hwasan_init_frames after - // all functions are instrumented (i.e. need to have a ModulePass). - createFrameGlobal(*HwasanCtorFunction, ""); IRBuilder<> IRBCtor(HwasanCtorFunction->getEntryBlock().getTerminator()); IRBCtor.CreateCall( declareSanitizerInitFunction(M, "__hwasan_init_frames", @@ -742,10 +751,9 @@ void HWAddressSanitizer::createFrameGlobal(Function &F, GV->setSection(getFrameSection()); appendToCompilerUsed(M, GV); // Put GV into the F's Comadat so that if F is deleted GV can be deleted too. - if (&F != HwasanCtorFunction) - if (auto Comdat = - GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId)) - GV->setComdat(Comdat); + if (auto Comdat = + GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId)) + GV->setComdat(Comdat); } Value *HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index d839dd21a239c4fd379d7c9ab7a638dce9c8853f..7edc53d07c7565fd3892d709744ad2bffef9a975 100644 --- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -427,7 +427,7 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) { ProfileSummaryInfo *PSI = - getAnalysis().getPSI(); + &getAnalysis().getPSI(); // Command-line option has the priority for InLTO. return promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode, diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp index 62da93002539e45d56c12fe285ba4a286ed8415f..15b94388cbe51429b513478eaee6f1af8e251742 100644 --- a/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -701,6 +701,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Module &M) { // Use linker script magic to get data/cnts/name start/end. if (Triple(M.getTargetTriple()).isOSLinux() || Triple(M.getTargetTriple()).isOSFreeBSD() || + Triple(M.getTargetTriple()).isOSNetBSD() || Triple(M.getTargetTriple()).isOSFuchsia() || Triple(M.getTargetTriple()).isPS4CPU()) return false; diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 960c1f42900c63ddb4e02a7bf5cc342fd0eb45f5..0bbf3a90b9560c13a59af88d3f574a2168634bec 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -255,10 +255,13 @@ static cl::opt ClHandleICmpExact("msan-handle-icmp-exact", // passed into an assembly call. Note that this may cause false positives. // Because it's impossible to figure out the array sizes, we can only unpoison // the first sizeof(type) bytes for each type* pointer. +// The instrumentation is only enabled in KMSAN builds, and only if +// -msan-handle-asm-conservative is on. This is done because we may want to +// quickly disable assembly instrumentation when it breaks. static cl::opt ClHandleAsmConservative( "msan-handle-asm-conservative", cl::desc("conservative handling of inline assembly"), cl::Hidden, - cl::init(false)); + cl::init(true)); // This flag controls whether we check the shadow of the address // operand of load or store. Such bugs are very rare, since load from @@ -3118,7 +3121,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // outputs as clean. Note that any side effects of the inline asm that are // not immediately visible in its constraints are not handled. if (Call->isInlineAsm()) { - if (ClHandleAsmConservative) + if (ClHandleAsmConservative && MS.CompileKernel) visitAsmInstruction(I); else visitInstruction(I); diff --git a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h index ba4924c9cb2d35d0aacfc1c79bad4550d3431382..7f6b157304a3808589fe4245e6a120bc9469af18 100644 --- a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h +++ b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h @@ -26,6 +26,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Support/ErrorHandling.h" @@ -74,27 +75,27 @@ public: switch (kind) { case ARCRuntimeEntryPointKind::AutoreleaseRV: - return getI8XRetI8XEntryPoint(AutoreleaseRV, - "objc_autoreleaseReturnValue", true); + return getIntrinsicEntryPoint(AutoreleaseRV, + Intrinsic::objc_autoreleaseReturnValue); case ARCRuntimeEntryPointKind::Release: - return getVoidRetI8XEntryPoint(Release, "objc_release"); + return getIntrinsicEntryPoint(Release, Intrinsic::objc_release); case ARCRuntimeEntryPointKind::Retain: - return getI8XRetI8XEntryPoint(Retain, "objc_retain", true); + return getIntrinsicEntryPoint(Retain, Intrinsic::objc_retain); case ARCRuntimeEntryPointKind::RetainBlock: - return getI8XRetI8XEntryPoint(RetainBlock, "objc_retainBlock", false); + return getIntrinsicEntryPoint(RetainBlock, Intrinsic::objc_retainBlock); case ARCRuntimeEntryPointKind::Autorelease: - return getI8XRetI8XEntryPoint(Autorelease, "objc_autorelease", true); + return getIntrinsicEntryPoint(Autorelease, Intrinsic::objc_autorelease); case ARCRuntimeEntryPointKind::StoreStrong: - return getI8XRetI8XXI8XEntryPoint(StoreStrong, "objc_storeStrong"); + return getIntrinsicEntryPoint(StoreStrong, Intrinsic::objc_storeStrong); case ARCRuntimeEntryPointKind::RetainRV: - return getI8XRetI8XEntryPoint(RetainRV, - "objc_retainAutoreleasedReturnValue", true); + return getIntrinsicEntryPoint(RetainRV, + Intrinsic::objc_retainAutoreleasedReturnValue); case ARCRuntimeEntryPointKind::RetainAutorelease: - return getI8XRetI8XEntryPoint(RetainAutorelease, "objc_retainAutorelease", - true); + return getIntrinsicEntryPoint(RetainAutorelease, + Intrinsic::objc_retainAutorelease); case ARCRuntimeEntryPointKind::RetainAutoreleaseRV: - return getI8XRetI8XEntryPoint(RetainAutoreleaseRV, - "objc_retainAutoreleaseReturnValue", true); + return getIntrinsicEntryPoint(RetainAutoreleaseRV, + Intrinsic::objc_retainAutoreleaseReturnValue); } llvm_unreachable("Switch should be a covered switch."); @@ -131,54 +132,11 @@ private: /// Declaration for objc_retainAutoreleaseReturnValue(). Constant *RetainAutoreleaseRV = nullptr; - Constant *getVoidRetI8XEntryPoint(Constant *&Decl, StringRef Name) { + Constant *getIntrinsicEntryPoint(Constant *&Decl, Intrinsic::ID IntID) { if (Decl) return Decl; - LLVMContext &C = TheModule->getContext(); - Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttributeList Attr = AttributeList().addAttribute( - C, AttributeList::FunctionIndex, Attribute::NoUnwind); - FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params, - /*isVarArg=*/false); - return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr); - } - - Constant *getI8XRetI8XEntryPoint(Constant *&Decl, StringRef Name, - bool NoUnwind = false) { - if (Decl) - return Decl; - - LLVMContext &C = TheModule->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *Fty = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeList Attr = AttributeList(); - - if (NoUnwind) - Attr = Attr.addAttribute(C, AttributeList::FunctionIndex, - Attribute::NoUnwind); - - return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr); - } - - Constant *getI8XRetI8XXI8XEntryPoint(Constant *&Decl, StringRef Name) { - if (Decl) - return Decl; - - LLVMContext &C = TheModule->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *I8XX = PointerType::getUnqual(I8X); - Type *Params[] = { I8XX, I8X }; - - AttributeList Attr = AttributeList().addAttribute( - C, AttributeList::FunctionIndex, Attribute::NoUnwind); - Attr = Attr.addParamAttribute(C, 0, Attribute::NoCapture); - - FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params, - /*isVarArg=*/false); - - return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr); + return Decl = Intrinsic::getDeclaration(TheModule, IntID); } }; diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 1f1ea9f5873923020ed3be9b7a1e576ce059640a..abe2871c0b8f97f7f411de2a0f68a4a4a848e2fe 100644 --- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -522,7 +522,7 @@ bool ObjCARCContract::tryToPeepholeInstruction( TailOkForStoreStrongs = false; return true; case ARCInstKind::IntrinsicUser: - // Remove calls to @clang.arc.use(...). + // Remove calls to @llvm.objc.clang.arc.use(...). Inst->eraseFromParent(); return true; default: diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp index 3a8ef073cb48ade72987474e753c14c9ce76eef7..f63182e57c1a3186015848994ed9cf3a947efc5a 100644 --- a/lib/Transforms/Scalar/BDCE.cpp +++ b/lib/Transforms/Scalar/BDCE.cpp @@ -38,7 +38,8 @@ STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)"); /// instruction may need to be cleared of assumptions that can no longer be /// guaranteed correct. static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { - assert(I->getType()->isIntegerTy() && "Trivializing a non-integer value?"); + assert(I->getType()->isIntOrIntVectorTy() && + "Trivializing a non-integer value?"); // Initialize the worklist with eligible direct users. SmallVector WorkList; @@ -46,13 +47,13 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { // If all bits of a user are demanded, then we know that nothing below that // in the def-use chain needs to be changed. auto *J = dyn_cast(JU); - if (J && J->getType()->isSized() && + if (J && J->getType()->isIntOrIntVectorTy() && !DB.getDemandedBits(J).isAllOnesValue()) WorkList.push_back(J); - // Note that we need to check for unsized types above before asking for + // Note that we need to check for non-int types above before asking for // demanded bits. Normally, the only way to reach an instruction with an - // unsized type is via an instruction that has side effects (or otherwise + // non-int type is via an instruction that has side effects (or otherwise // will demand its input bits). However, if we have a readnone function // that returns an unsized type (e.g., void), we must avoid asking for the // demanded bits of the function call's return value. A void-returning @@ -78,7 +79,7 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { // If all bits of a user are demanded, then we know that nothing below // that in the def-use chain needs to be changed. auto *K = dyn_cast(KU); - if (K && !Visited.count(K) && K->getType()->isSized() && + if (K && !Visited.count(K) && K->getType()->isIntOrIntVectorTy() && !DB.getDemandedBits(K).isAllOnesValue()) WorkList.push_back(K); } @@ -95,7 +96,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) { if (I.mayHaveSideEffects() && I.use_empty()) continue; - if (I.getType()->isIntegerTy() && + if (I.getType()->isIntOrIntVectorTy() && !DB.getDemandedBits(&I).getBoolValue()) { // For live instructions that have all dead bits, first make them dead by // replacing all uses with something else. Then, if they don't need to diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index fce37d4bffb8f509079b14f7b9192e3cb408aa54..e3548ce5cd0afdd7472a9107d66939de49709ff6 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -45,6 +45,7 @@ add_llvm_library(LLVMScalarOpts LowerAtomic.cpp LowerExpectIntrinsic.cpp LowerGuardIntrinsic.cpp + MakeGuardsExplicit.cpp MemCpyOptimizer.cpp MergeICmps.cpp MergedLoadStoreMotion.cpp @@ -68,6 +69,7 @@ add_llvm_library(LLVMScalarOpts StraightLineStrengthReduce.cpp StructurizeCFG.cpp TailRecursionElimination.cpp + WarnMissedTransforms.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index b9e8e3424ccfe61e80587adad135b83daf69b056..a806d6faed60a6333eedf4529647366b41f9322a 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -149,14 +149,14 @@ static void recordCondition(CallSite CS, BasicBlock *From, BasicBlock *To, /// Record ICmp conditions relevant to any argument in CS following Pred's /// single predecessors. If there are conflicting conditions along a path, like -/// x == 1 and x == 0, the first condition will be used. +/// x == 1 and x == 0, the first condition will be used. We stop once we reach +/// an edge to StopAt. static void recordConditions(CallSite CS, BasicBlock *Pred, - ConditionsTy &Conditions) { - recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions); + ConditionsTy &Conditions, BasicBlock *StopAt) { BasicBlock *From = Pred; BasicBlock *To = Pred; SmallPtrSet Visited; - while (!Visited.count(From->getSinglePredecessor()) && + while (To != StopAt && !Visited.count(From->getSinglePredecessor()) && (From = From->getSinglePredecessor())) { recordCondition(CS, From, To, Conditions); Visited.insert(From); @@ -302,7 +302,7 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI, static void splitCallSite( CallSite CS, const SmallVectorImpl> &Preds, - DominatorTree *DT) { + DomTreeUpdater &DTU) { Instruction *Instr = CS.getInstruction(); BasicBlock *TailBB = Instr->getParent(); bool IsMustTailCall = CS.isMustTailCall(); @@ -327,7 +327,7 @@ static void splitCallSite( BasicBlock *PredBB = Preds[i].first; BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween( TailBB, PredBB, &*std::next(Instr->getIterator()), ValueToValueMaps[i], - DT); + DTU); assert(SplitBlock && "Unexpected new basic block split."); Instruction *NewCI = @@ -365,11 +365,13 @@ static void splitCallSite( // attempting removal. SmallVector Splits(predecessors((TailBB))); assert(Splits.size() == 2 && "Expected exactly 2 splits!"); - for (unsigned i = 0; i < Splits.size(); i++) + for (unsigned i = 0; i < Splits.size(); i++) { Splits[i]->getTerminator()->eraseFromParent(); + DTU.deleteEdge(Splits[i], TailBB); + } // Erase the tail block once done with musttail patching - TailBB->eraseFromParent(); + DTU.deleteBB(TailBB); return; } @@ -438,48 +440,73 @@ static bool isPredicatedOnPHI(CallSite CS) { return false; } -static bool tryToSplitOnPHIPredicatedArgument(CallSite CS, DominatorTree *DT) { +using PredsWithCondsTy = SmallVector, 2>; + +// Check if any of the arguments in CS are predicated on a PHI node and return +// the set of predecessors we should use for splitting. +static PredsWithCondsTy shouldSplitOnPHIPredicatedArgument(CallSite CS) { if (!isPredicatedOnPHI(CS)) - return false; + return {}; auto Preds = getTwoPredecessors(CS.getInstruction()->getParent()); - SmallVector, 2> PredsCS = { - {Preds[0], {}}, {Preds[1], {}}}; - splitCallSite(CS, PredsCS, DT); - return true; + return {{Preds[0], {}}, {Preds[1], {}}}; } -static bool tryToSplitOnPredicatedArgument(CallSite CS, DominatorTree *DT) { +// Checks if any of the arguments in CS are predicated in a predecessor and +// returns a list of predecessors with the conditions that hold on their edges +// to CS. +static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallSite CS, + DomTreeUpdater &DTU) { auto Preds = getTwoPredecessors(CS.getInstruction()->getParent()); if (Preds[0] == Preds[1]) - return false; + return {}; + + // We can stop recording conditions once we reached the immediate dominator + // for the block containing the call site. Conditions in predecessors of the + // that node will be the same for all paths to the call site and splitting + // is not beneficial. + assert(DTU.hasDomTree() && "We need a DTU with a valid DT!"); + auto *CSDTNode = DTU.getDomTree().getNode(CS.getInstruction()->getParent()); + BasicBlock *StopAt = CSDTNode ? CSDTNode->getIDom()->getBlock() : nullptr; SmallVector, 2> PredsCS; for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) { ConditionsTy Conditions; - recordConditions(CS, Pred, Conditions); + // Record condition on edge BB(CS) <- Pred + recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions); + // Record conditions following Pred's single predecessors. + recordConditions(CS, Pred, Conditions, StopAt); PredsCS.push_back({Pred, Conditions}); } if (all_of(PredsCS, [](const std::pair &P) { return P.second.empty(); })) - return false; + return {}; - splitCallSite(CS, PredsCS, DT); - return true; + return PredsCS; } static bool tryToSplitCallSite(CallSite CS, TargetTransformInfo &TTI, - DominatorTree *DT) { + DomTreeUpdater &DTU) { + // Check if we can split the call site. if (!CS.arg_size() || !canSplitCallSite(CS, TTI)) return false; - return tryToSplitOnPredicatedArgument(CS, DT) || - tryToSplitOnPHIPredicatedArgument(CS, DT); + + auto PredsWithConds = shouldSplitOnPredicatedArgument(CS, DTU); + if (PredsWithConds.empty()) + PredsWithConds = shouldSplitOnPHIPredicatedArgument(CS); + if (PredsWithConds.empty()) + return false; + + splitCallSite(CS, PredsWithConds, DTU); + return true; } static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI, - TargetTransformInfo &TTI, DominatorTree *DT) { + TargetTransformInfo &TTI, DominatorTree &DT) { + + DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy); bool Changed = false; for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) { BasicBlock &BB = *BI++; @@ -503,7 +530,7 @@ static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI, // Check if such path is possible before attempting the splitting. bool IsMustTail = CS.isMustTailCall(); - Changed |= tryToSplitCallSite(CS, TTI, DT); + Changed |= tryToSplitCallSite(CS, TTI, DTU); // There're no interesting instructions after this. The call site // itself might have been erased on splitting. @@ -524,6 +551,7 @@ struct CallSiteSplittingLegacyPass : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); FunctionPass::getAnalysisUsage(AU); } @@ -534,9 +562,8 @@ struct CallSiteSplittingLegacyPass : public FunctionPass { auto &TLI = getAnalysis().getTLI(); auto &TTI = getAnalysis().getTTI(F); - auto *DTWP = getAnalysisIfAvailable(); - return doCallSiteSplitting(F, TLI, TTI, - DTWP ? &DTWP->getDomTree() : nullptr); + auto &DT = getAnalysis().getDomTree(); + return doCallSiteSplitting(F, TLI, TTI, DT); } }; } // namespace @@ -546,6 +573,7 @@ INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting", "Call-site splitting", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting", "Call-site splitting", false, false) FunctionPass *llvm::createCallSiteSplittingPass() { @@ -556,7 +584,7 @@ PreservedAnalyses CallSiteSplittingPass::run(Function &F, FunctionAnalysisManager &AM) { auto &TLI = AM.getResult(F); auto &TTI = AM.getResult(F); - auto *DT = AM.getCachedResult(F); + auto &DT = AM.getResult(F); if (!doCallSiteSplitting(F, TLI, TTI, DT)) return PreservedAnalyses::all(); diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index 46915889ce7cd6da8101ce791c4caf85cd1fc460..51032b0625f886a3c0a466fecf75c6493d0464e1 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -18,21 +18,25 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Constant.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" +#include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" -#include +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "constprop" STATISTIC(NumInstKilled, "Number of instructions killed"); +DEBUG_COUNTER(CPCounter, "constprop-transform", + "Controls which instructions are killed"); namespace { struct ConstantPropagation : public FunctionPass { @@ -66,9 +70,15 @@ bool ConstantPropagation::runOnFunction(Function &F) { return false; // Initialize the worklist to all of the instructions ready to process... - std::set WorkList; - for (Instruction &I: instructions(&F)) + SmallPtrSet WorkList; + // The SmallVector of WorkList ensures that we do iteration at stable order. + // We use two containers rather than one SetVector, since remove is + // linear-time, and we don't care enough to remove from Vec. + SmallVector WorkListVec; + for (Instruction &I : instructions(&F)) { WorkList.insert(&I); + WorkListVec.push_back(&I); + } bool Changed = false; const DataLayout &DL = F.getParent()->getDataLayout(); @@ -76,29 +86,36 @@ bool ConstantPropagation::runOnFunction(Function &F) { &getAnalysis().getTLI(); while (!WorkList.empty()) { - Instruction *I = *WorkList.begin(); - WorkList.erase(WorkList.begin()); // Get an element from the worklist... - - if (!I->use_empty()) // Don't muck with dead instructions... - if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) { - // Add all of the users of this instruction to the worklist, they might - // be constant propagatable now... - for (User *U : I->users()) - WorkList.insert(cast(U)); - - // Replace all of the uses of a variable with uses of the constant. - I->replaceAllUsesWith(C); - - // Remove the dead instruction. - WorkList.erase(I); - if (isInstructionTriviallyDead(I, TLI)) { - I->eraseFromParent(); - ++NumInstKilled; + SmallVector NewWorkListVec; + for (auto *I : WorkListVec) { + WorkList.erase(I); // Remove element from the worklist... + + if (!I->use_empty()) // Don't muck with dead instructions... + if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) { + if (!DebugCounter::shouldExecute(CPCounter)) + continue; + + // Add all of the users of this instruction to the worklist, they might + // be constant propagatable now... + for (User *U : I->users()) { + // If user not in the set, then add it to the vector. + if (WorkList.insert(cast(U)).second) + NewWorkListVec.push_back(cast(U)); + } + + // Replace all of the uses of a variable with uses of the constant. + I->replaceAllUsesWith(C); + + if (isInstructionTriviallyDead(I, TLI)) { + I->eraseFromParent(); + ++NumInstKilled; + } + + // We made a change to the function... + Changed = true; } - - // We made a change to the function... - Changed = true; - } + } + WorkListVec = std::move(NewWorkListVec); } return Changed; } diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 99402985f28dc0dc186649561845fcf80690378e..d0105701c73fcc3478fe0eb6b3fd884bad83f7b7 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -273,10 +273,11 @@ static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) { /// information is sufficient to prove this comparison. Even for local /// conditions, this can sometimes prove conditions instcombine can't by /// exploiting range information. -static bool processCmp(CmpInst *C, LazyValueInfo *LVI) { - Value *Op0 = C->getOperand(0); - Constant *Op1 = dyn_cast(C->getOperand(1)); - if (!Op1) return false; +static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) { + Value *Op0 = Cmp->getOperand(0); + auto *C = dyn_cast(Cmp->getOperand(1)); + if (!C) + return false; // As a policy choice, we choose not to waste compile time on anything where // the comparison is testing local values. While LVI can sometimes reason @@ -284,20 +285,18 @@ static bool processCmp(CmpInst *C, LazyValueInfo *LVI) { // the block local query for uses from terminator instructions, but that's // handled in the code for each terminator. auto *I = dyn_cast(Op0); - if (I && I->getParent() == C->getParent()) + if (I && I->getParent() == Cmp->getParent()) return false; LazyValueInfo::Tristate Result = - LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C); - if (Result == LazyValueInfo::Unknown) return false; + LVI->getPredicateAt(Cmp->getPredicate(), Op0, C, Cmp); + if (Result == LazyValueInfo::Unknown) + return false; ++NumCmps; - if (Result == LazyValueInfo::True) - C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext())); - else - C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext())); - C->eraseFromParent(); - + Constant *TorF = ConstantInt::get(Type::getInt1Ty(Cmp->getContext()), Result); + Cmp->replaceAllUsesWith(TorF); + Cmp->eraseFromParent(); return true; } @@ -308,7 +307,8 @@ static bool processCmp(CmpInst *C, LazyValueInfo *LVI) { /// that cannot fire no matter what the incoming edge can safely be removed. If /// a case fires on every incoming edge then the entire switch can be removed /// and replaced with a branch to the case destination. -static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI, DominatorTree *DT) { +static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI, + DominatorTree *DT) { DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy); Value *Cond = SI->getCondition(); BasicBlock *BB = SI->getParent(); @@ -430,23 +430,21 @@ static bool willNotOverflow(IntrinsicInst *II, LazyValueInfo *LVI) { } static void processOverflowIntrinsic(IntrinsicInst *II) { + IRBuilder<> B(II); Value *NewOp = nullptr; switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected instruction."); case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: - NewOp = BinaryOperator::CreateAdd(II->getOperand(0), II->getOperand(1), - II->getName(), II); + NewOp = B.CreateAdd(II->getOperand(0), II->getOperand(1), II->getName()); break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: - NewOp = BinaryOperator::CreateSub(II->getOperand(0), II->getOperand(1), - II->getName(), II); + NewOp = B.CreateSub(II->getOperand(0), II->getOperand(1), II->getName()); break; } ++NumOverflows; - IRBuilder<> B(II); Value *NewI = B.CreateInsertValue(UndefValue::get(II->getType()), NewOp, 0); NewI = B.CreateInsertValue(NewI, ConstantInt::getFalse(II->getContext()), 1); II->replaceAllUsesWith(NewI); @@ -528,17 +526,17 @@ static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) { return false; ++NumUDivs; + IRBuilder<> B{Instr}; auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth); - auto *LHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(0), TruncTy, - Instr->getName() + ".lhs.trunc", Instr); - auto *RHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(1), TruncTy, - Instr->getName() + ".rhs.trunc", Instr); - auto *BO = - BinaryOperator::Create(Instr->getOpcode(), LHS, RHS, Instr->getName(), Instr); - auto *Zext = CastInst::Create(Instruction::ZExt, BO, Instr->getType(), - Instr->getName() + ".zext", Instr); - if (BO->getOpcode() == Instruction::UDiv) - BO->setIsExact(Instr->isExact()); + auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy, + Instr->getName() + ".lhs.trunc"); + auto *RHS = B.CreateTruncOrBitCast(Instr->getOperand(1), TruncTy, + Instr->getName() + ".rhs.trunc"); + auto *BO = B.CreateBinOp(Instr->getOpcode(), LHS, RHS, Instr->getName()); + auto *Zext = B.CreateZExt(BO, Instr->getType(), Instr->getName() + ".zext"); + if (auto *BinOp = dyn_cast(BO)) + if (BinOp->getOpcode() == Instruction::UDiv) + BinOp->setIsExact(Instr->isExact()); Instr->replaceAllUsesWith(Zext); Instr->eraseFromParent(); @@ -552,6 +550,7 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) { ++NumSRems; auto *BO = BinaryOperator::CreateURem(SDI->getOperand(0), SDI->getOperand(1), SDI->getName(), SDI); + BO->setDebugLoc(SDI->getDebugLoc()); SDI->replaceAllUsesWith(BO); SDI->eraseFromParent(); @@ -573,6 +572,7 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) { ++NumSDivs; auto *BO = BinaryOperator::CreateUDiv(SDI->getOperand(0), SDI->getOperand(1), SDI->getName(), SDI); + BO->setDebugLoc(SDI->getDebugLoc()); BO->setIsExact(SDI->isExact()); SDI->replaceAllUsesWith(BO); SDI->eraseFromParent(); @@ -595,6 +595,7 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { ++NumAShrs; auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1), SDI->getName(), SDI); + BO->setDebugLoc(SDI->getDebugLoc()); BO->setIsExact(SDI->isExact()); SDI->replaceAllUsesWith(BO); SDI->eraseFromParent(); diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 11009d501a9c89a6351af7fde3981589a5cebd6d..1f09979b33824da181180b37bac02b36d2ec9d11 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -812,7 +812,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); continue; } - salvageDebugInfo(*Inst); + if (!salvageDebugInfo(*Inst)) + replaceDbgUsesWithUndef(Inst); removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index c080c2a1813de93c49048cf0f401bf7570ff224e..440ea4a5bc789668d2f4cf39d14d20ef37460ce0 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -2156,6 +2156,16 @@ bool GVN::performScalarPRE(Instruction *CurInst) { if (isa(CurInst)) return false; + // Don't do PRE on GEPs. The inserted PHI would prevent CodeGenPrepare from + // sinking the addressing mode computation back to its uses. Extending the + // GEP's live range increases the register pressure, and therefore it can + // introduce unnecessary spills. + // + // This doesn't prevent Load PRE. PHI translation will make the GEP available + // to the load by moving it to the predecessor block if necessary. + if (isa(CurInst)) + return false; + // We don't currently value number ANY inline asm calls. if (CallInst *CallI = dyn_cast(CurInst)) if (CallI->isInlineAsm()) diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp index 187c668c338584a935ab8266b35e86ac3aeac7dd..1df5f5400c148f508f1f965311caa6755b5abc5f 100644 --- a/lib/Transforms/Scalar/GVNSink.cpp +++ b/lib/Transforms/Scalar/GVNSink.cpp @@ -258,14 +258,14 @@ public: /// Create a PHI from an array of incoming values and incoming blocks. template ModelledPHI(const VArray &V, const BArray &B) { - std::copy(V.begin(), V.end(), std::back_inserter(Values)); - std::copy(B.begin(), B.end(), std::back_inserter(Blocks)); + llvm::copy(V, std::back_inserter(Values)); + llvm::copy(B, std::back_inserter(Blocks)); } /// Create a PHI from [I[OpNum] for I in Insts]. template ModelledPHI(ArrayRef Insts, unsigned OpNum, const BArray &B) { - std::copy(B.begin(), B.end(), std::back_inserter(Blocks)); + llvm::copy(B, std::back_inserter(Blocks)); for (auto *I : Insts) Values.push_back(I->getOperand(OpNum)); } diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index ec51ad71abc517e92c16ddca63d1437a6f0a0441..48d8e457ba7c1d561b28a6b935c5b5e51749ed64 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -145,6 +145,7 @@ class IndVarSimplify { bool canLoopBeDeleted(Loop *L, SmallVector &RewritePhiSet); bool rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); bool rewriteFirstIterationLoopExitValues(Loop *L); + bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) const; bool linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, SCEVExpander &Rewriter); @@ -524,6 +525,29 @@ struct RewritePhi { // As a side effect, reduces the amount of IV processing within the loop. //===----------------------------------------------------------------------===// +bool IndVarSimplify::hasHardUserWithinLoop(const Loop *L, const Instruction *I) const { + SmallPtrSet Visited; + SmallVector WorkList; + Visited.insert(I); + WorkList.push_back(I); + while (!WorkList.empty()) { + const Instruction *Curr = WorkList.pop_back_val(); + // This use is outside the loop, nothing to do. + if (!L->contains(Curr)) + continue; + // Do we assume it is a "hard" use which will not be eliminated easily? + if (Curr->mayHaveSideEffects()) + return true; + // Otherwise, add all its users to worklist. + for (auto U : Curr->users()) { + auto *UI = cast(U); + if (Visited.insert(UI).second) + WorkList.push_back(UI); + } + } + return false; +} + /// Check to see if this loop has a computable loop-invariant execution count. /// If so, this means that we can compute the final value of any expressions /// that are recurrent in the loop, and substitute the exit values from the loop @@ -598,19 +622,8 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // Computing the value outside of the loop brings no benefit if it is // definitely used inside the loop in a way which can not be optimized // away. - if (ExitValue->getSCEVType()>=scMulExpr) { - bool HasHardInternalUses = false; - for (auto *IB : Inst->users()) { - Instruction *UseInstr = cast(IB); - unsigned Opc = UseInstr->getOpcode(); - if (L->contains(UseInstr) && Opc == Instruction::Call) { - HasHardInternalUses = true; - break; - } - } - if (HasHardInternalUses) - continue; - } + if (!isa(ExitValue) && hasHardUserWithinLoop(L, Inst)) + continue; bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst); Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst); diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 849ff71e198b6033402d0db5e8712e44cbbbd83a..14296293ddaa67609b9565c1d513379b0d335c37 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -574,9 +574,11 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { /// BB in the result vector. /// /// This returns true if there were any known values. -bool JumpThreadingPass::ComputeValueKnownInPredecessors( +bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl( Value *V, BasicBlock *BB, PredValueInfo &Result, - ConstantPreference Preference, Instruction *CxtI) { + ConstantPreference Preference, + DenseSet> &RecursionSet, + Instruction *CxtI) { // This method walks up use-def chains recursively. Because of this, we could // get into an infinite loop going around loops in the use-def chain. To // prevent this, keep track of what (value, block) pairs we've already visited @@ -584,10 +586,6 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( if (!RecursionSet.insert(std::make_pair(V, BB)).second) return false; - // An RAII help to remove this pair from the recursion set once the recursion - // stack pops back out again. - RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB)); - // If V is a constant, then it is known in all predecessors. if (Constant *KC = getKnownConstant(V, Preference)) { for (BasicBlock *Pred : predecessors(BB)) @@ -657,7 +655,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( Value *Source = CI->getOperand(0); if (!isa(Source) && !isa(Source)) return false; - ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI); + ComputeValueKnownInPredecessorsImpl(Source, BB, Result, Preference, + RecursionSet, CxtI); if (Result.empty()) return false; @@ -677,10 +676,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( I->getOpcode() == Instruction::And) { PredValueInfoTy LHSVals, RHSVals; - ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, - WantInteger, CxtI); - ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals, - WantInteger, CxtI); + ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals, + WantInteger, RecursionSet, CxtI); + ComputeValueKnownInPredecessorsImpl(I->getOperand(1), BB, RHSVals, + WantInteger, RecursionSet, CxtI); if (LHSVals.empty() && RHSVals.empty()) return false; @@ -715,8 +714,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( if (I->getOpcode() == Instruction::Xor && isa(I->getOperand(1)) && cast(I->getOperand(1))->isOne()) { - ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result, - WantInteger, CxtI); + ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, Result, + WantInteger, RecursionSet, CxtI); if (Result.empty()) return false; @@ -733,8 +732,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( && "A binary operator creating a block address?"); if (ConstantInt *CI = dyn_cast(BO->getOperand(1))) { PredValueInfoTy LHSVals; - ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals, - WantInteger, CxtI); + ComputeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals, + WantInteger, RecursionSet, CxtI); // Try to use constant folding to simplify the binary operator. for (const auto &LHSVal : LHSVals) { @@ -879,8 +878,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( // Try to find a constant value for the LHS of a comparison, // and evaluate it statically if we can. PredValueInfoTy LHSVals; - ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, - WantInteger, CxtI); + ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals, + WantInteger, RecursionSet, CxtI); for (const auto &LHSVal : LHSVals) { Constant *V = LHSVal.first; @@ -900,8 +899,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference); PredValueInfoTy Conds; if ((TrueVal || FalseVal) && - ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds, - WantInteger, CxtI)) { + ComputeValueKnownInPredecessorsImpl(SI->getCondition(), BB, Conds, + WantInteger, RecursionSet, CxtI)) { for (auto &C : Conds) { Constant *Cond = C.first; @@ -2655,28 +2654,16 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, // Duplicate all instructions before the guard and the guard itself to the // branch where implication is not proved. BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween( - BB, PredGuardedBlock, AfterGuard, GuardedMapping); + BB, PredGuardedBlock, AfterGuard, GuardedMapping, *DTU); assert(GuardedBlock && "Could not create the guarded block?"); // Duplicate all instructions before the guard in the unguarded branch. // Since we have successfully duplicated the guarded block and this block // has fewer instructions, we expect it to succeed. BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween( - BB, PredUnguardedBlock, Guard, UnguardedMapping); + BB, PredUnguardedBlock, Guard, UnguardedMapping, *DTU); assert(UnguardedBlock && "Could not create the unguarded block?"); LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block " << GuardedBlock->getName() << "\n"); - // DuplicateInstructionsInSplitBetween inserts a new block "BB.split" between - // PredBB and BB. We need to perform two inserts and one delete for each of - // the above calls to update Dominators. - DTU->applyUpdates( - {// Guarded block split. - {DominatorTree::Delete, PredGuardedBlock, BB}, - {DominatorTree::Insert, PredGuardedBlock, GuardedBlock}, - {DominatorTree::Insert, GuardedBlock, BB}, - // Unguarded block split. - {DominatorTree::Delete, PredUnguardedBlock, BB}, - {DominatorTree::Insert, PredUnguardedBlock, UnguardedBlock}, - {DominatorTree::Insert, UnguardedBlock, BB}}); // Some instructions before the guard may still have uses. For them, we need // to create Phi nodes merging their copies in both guarded and unguarded // branches. Those instructions that have no uses can be just removed. diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 7789cb923450d6c377b8fb7c5a0bc5ee3b1d0214..695eaf6cf4a73aa42196ab555cf2574b41084469 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -31,6 +31,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LICM.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" @@ -41,6 +42,7 @@ #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSA.h" @@ -75,6 +77,8 @@ using namespace llvm; #define DEBUG_TYPE "licm" +STATISTIC(NumCreatedBlocks, "Number of blocks created"); +STATISTIC(NumClonedBranches, "Number of branches cloned"); STATISTIC(NumSunk, "Number of instructions sunk out of loop"); STATISTIC(NumHoisted, "Number of instructions hoisted out of loop"); STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk"); @@ -86,6 +90,10 @@ static cl::opt DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false), cl::desc("Disable memory promotion in LICM pass")); +static cl::opt ControlFlowHoisting( + "licm-control-flow-hoisting", cl::Hidden, cl::init(false), + cl::desc("Enable control flow (and PHI) hoisting in LICM")); + static cl::opt MaxNumUsesTraversed( "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), cl::desc("Max num uses visited for identifying load " @@ -103,7 +111,7 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, TargetTransformInfo *TTI, bool &FreeInLoop); static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, - ICFLoopSafetyInfo *SafetyInfo, + BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, @@ -126,6 +134,9 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, AliasSetTracker *AST); +static void moveInstructionBefore(Instruction &I, Instruction &Dest, + ICFLoopSafetyInfo &SafetyInfo); + namespace { struct LoopInvariantCodeMotion { using ASTrackerMapTy = DenseMap>; @@ -434,6 +445,228 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, return Changed; } +// This is a helper class for hoistRegion to make it able to hoist control flow +// in order to be able to hoist phis. The way this works is that we initially +// start hoisting to the loop preheader, and when we see a loop invariant branch +// we make note of this. When we then come to hoist an instruction that's +// conditional on such a branch we duplicate the branch and the relevant control +// flow, then hoist the instruction into the block corresponding to its original +// block in the duplicated control flow. +class ControlFlowHoister { +private: + // Information about the loop we are hoisting from + LoopInfo *LI; + DominatorTree *DT; + Loop *CurLoop; + + // A map of blocks in the loop to the block their instructions will be hoisted + // to. + DenseMap HoistDestinationMap; + + // The branches that we can hoist, mapped to the block that marks a + // convergence point of their control flow. + DenseMap HoistableBranches; + +public: + ControlFlowHoister(LoopInfo *LI, DominatorTree *DT, Loop *CurLoop) + : LI(LI), DT(DT), CurLoop(CurLoop) {} + + void registerPossiblyHoistableBranch(BranchInst *BI) { + // We can only hoist conditional branches with loop invariant operands. + if (!ControlFlowHoisting || !BI->isConditional() || + !CurLoop->hasLoopInvariantOperands(BI)) + return; + + // The branch destinations need to be in the loop, and we don't gain + // anything by duplicating conditional branches with duplicate successors, + // as it's essentially the same as an unconditional branch. + BasicBlock *TrueDest = BI->getSuccessor(0); + BasicBlock *FalseDest = BI->getSuccessor(1); + if (!CurLoop->contains(TrueDest) || !CurLoop->contains(FalseDest) || + TrueDest == FalseDest) + return; + + // We can hoist BI if one branch destination is the successor of the other, + // or both have common successor which we check by seeing if the + // intersection of their successors is non-empty. + // TODO: This could be expanded to allowing branches where both ends + // eventually converge to a single block. + SmallPtrSet TrueDestSucc, FalseDestSucc; + TrueDestSucc.insert(succ_begin(TrueDest), succ_end(TrueDest)); + FalseDestSucc.insert(succ_begin(FalseDest), succ_end(FalseDest)); + BasicBlock *CommonSucc = nullptr; + if (TrueDestSucc.count(FalseDest)) { + CommonSucc = FalseDest; + } else if (FalseDestSucc.count(TrueDest)) { + CommonSucc = TrueDest; + } else { + set_intersect(TrueDestSucc, FalseDestSucc); + // If there's one common successor use that. + if (TrueDestSucc.size() == 1) + CommonSucc = *TrueDestSucc.begin(); + // If there's more than one pick whichever appears first in the block list + // (we can't use the value returned by TrueDestSucc.begin() as it's + // unpredicatable which element gets returned). + else if (!TrueDestSucc.empty()) { + Function *F = TrueDest->getParent(); + auto IsSucc = [&](BasicBlock &BB) { return TrueDestSucc.count(&BB); }; + auto It = std::find_if(F->begin(), F->end(), IsSucc); + assert(It != F->end() && "Could not find successor in function"); + CommonSucc = &*It; + } + } + // The common successor has to be dominated by the branch, as otherwise + // there will be some other path to the successor that will not be + // controlled by this branch so any phi we hoist would be controlled by the + // wrong condition. This also takes care of avoiding hoisting of loop back + // edges. + // TODO: In some cases this could be relaxed if the successor is dominated + // by another block that's been hoisted and we can guarantee that the + // control flow has been replicated exactly. + if (CommonSucc && DT->dominates(BI, CommonSucc)) + HoistableBranches[BI] = CommonSucc; + } + + bool canHoistPHI(PHINode *PN) { + // The phi must have loop invariant operands. + if (!ControlFlowHoisting || !CurLoop->hasLoopInvariantOperands(PN)) + return false; + // We can hoist phis if the block they are in is the target of hoistable + // branches which cover all of the predecessors of the block. + SmallPtrSet PredecessorBlocks; + BasicBlock *BB = PN->getParent(); + for (BasicBlock *PredBB : predecessors(BB)) + PredecessorBlocks.insert(PredBB); + // If we have less predecessor blocks than predecessors then the phi will + // have more than one incoming value for the same block which we can't + // handle. + // TODO: This could be handled be erasing some of the duplicate incoming + // values. + if (PredecessorBlocks.size() != pred_size(BB)) + return false; + for (auto &Pair : HoistableBranches) { + if (Pair.second == BB) { + // Which blocks are predecessors via this branch depends on if the + // branch is triangle-like or diamond-like. + if (Pair.first->getSuccessor(0) == BB) { + PredecessorBlocks.erase(Pair.first->getParent()); + PredecessorBlocks.erase(Pair.first->getSuccessor(1)); + } else if (Pair.first->getSuccessor(1) == BB) { + PredecessorBlocks.erase(Pair.first->getParent()); + PredecessorBlocks.erase(Pair.first->getSuccessor(0)); + } else { + PredecessorBlocks.erase(Pair.first->getSuccessor(0)); + PredecessorBlocks.erase(Pair.first->getSuccessor(1)); + } + } + } + // PredecessorBlocks will now be empty if for every predecessor of BB we + // found a hoistable branch source. + return PredecessorBlocks.empty(); + } + + BasicBlock *getOrCreateHoistedBlock(BasicBlock *BB) { + if (!ControlFlowHoisting) + return CurLoop->getLoopPreheader(); + // If BB has already been hoisted, return that + if (HoistDestinationMap.count(BB)) + return HoistDestinationMap[BB]; + + // Check if this block is conditional based on a pending branch + auto HasBBAsSuccessor = + [&](DenseMap::value_type &Pair) { + return BB != Pair.second && (Pair.first->getSuccessor(0) == BB || + Pair.first->getSuccessor(1) == BB); + }; + auto It = std::find_if(HoistableBranches.begin(), HoistableBranches.end(), + HasBBAsSuccessor); + + // If not involved in a pending branch, hoist to preheader + BasicBlock *InitialPreheader = CurLoop->getLoopPreheader(); + if (It == HoistableBranches.end()) { + LLVM_DEBUG(dbgs() << "LICM using " << InitialPreheader->getName() + << " as hoist destination for " << BB->getName() + << "\n"); + HoistDestinationMap[BB] = InitialPreheader; + return InitialPreheader; + } + BranchInst *BI = It->first; + assert(std::find_if(++It, HoistableBranches.end(), HasBBAsSuccessor) == + HoistableBranches.end() && + "BB is expected to be the target of at most one branch"); + + LLVMContext &C = BB->getContext(); + BasicBlock *TrueDest = BI->getSuccessor(0); + BasicBlock *FalseDest = BI->getSuccessor(1); + BasicBlock *CommonSucc = HoistableBranches[BI]; + BasicBlock *HoistTarget = getOrCreateHoistedBlock(BI->getParent()); + + // Create hoisted versions of blocks that currently don't have them + auto CreateHoistedBlock = [&](BasicBlock *Orig) { + if (HoistDestinationMap.count(Orig)) + return HoistDestinationMap[Orig]; + BasicBlock *New = + BasicBlock::Create(C, Orig->getName() + ".licm", Orig->getParent()); + HoistDestinationMap[Orig] = New; + DT->addNewBlock(New, HoistTarget); + if (CurLoop->getParentLoop()) + CurLoop->getParentLoop()->addBasicBlockToLoop(New, *LI); + ++NumCreatedBlocks; + LLVM_DEBUG(dbgs() << "LICM created " << New->getName() + << " as hoist destination for " << Orig->getName() + << "\n"); + return New; + }; + BasicBlock *HoistTrueDest = CreateHoistedBlock(TrueDest); + BasicBlock *HoistFalseDest = CreateHoistedBlock(FalseDest); + BasicBlock *HoistCommonSucc = CreateHoistedBlock(CommonSucc); + + // Link up these blocks with branches. + if (!HoistCommonSucc->getTerminator()) { + // The new common successor we've generated will branch to whatever that + // hoist target branched to. + BasicBlock *TargetSucc = HoistTarget->getSingleSuccessor(); + assert(TargetSucc && "Expected hoist target to have a single successor"); + HoistCommonSucc->moveBefore(TargetSucc); + BranchInst::Create(TargetSucc, HoistCommonSucc); + } + if (!HoistTrueDest->getTerminator()) { + HoistTrueDest->moveBefore(HoistCommonSucc); + BranchInst::Create(HoistCommonSucc, HoistTrueDest); + } + if (!HoistFalseDest->getTerminator()) { + HoistFalseDest->moveBefore(HoistCommonSucc); + BranchInst::Create(HoistCommonSucc, HoistFalseDest); + } + + // If BI is being cloned to what was originally the preheader then + // HoistCommonSucc will now be the new preheader. + if (HoistTarget == InitialPreheader) { + // Phis in the loop header now need to use the new preheader. + InitialPreheader->replaceSuccessorsPhiUsesWith(HoistCommonSucc); + // The new preheader dominates the loop header. + DomTreeNode *PreheaderNode = DT->getNode(HoistCommonSucc); + DomTreeNode *HeaderNode = DT->getNode(CurLoop->getHeader()); + DT->changeImmediateDominator(HeaderNode, PreheaderNode); + // The preheader hoist destination is now the new preheader, with the + // exception of the hoist destination of this branch. + for (auto &Pair : HoistDestinationMap) + if (Pair.second == InitialPreheader && Pair.first != BI->getParent()) + Pair.second = HoistCommonSucc; + } + + // Now finally clone BI. + ReplaceInstWithInst( + HoistTarget->getTerminator(), + BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition())); + ++NumClonedBranches; + + assert(CurLoop->getLoopPreheader() && + "Hoisting blocks should not have destroyed preheader"); + return HoistDestinationMap[BB]; + } +}; + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before @@ -448,24 +681,24 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr && "Unexpected input to hoistRegion"); - // We want to visit parents before children. We will enque all the parents - // before their children in the worklist and process the worklist in order. - SmallVector Worklist = collectChildrenInLoop(N, CurLoop); + ControlFlowHoister CFH(LI, DT, CurLoop); + // Keep track of instructions that have been hoisted, as they may need to be + // re-hoisted if they end up not dominating all of their uses. + SmallVector HoistedInstructions; + + // For PHI hoisting to work we need to hoist blocks before their successors. + // We can do this by iterating through the blocks in the loop in reverse + // post-order. + LoopBlocksRPO Worklist(CurLoop); + Worklist.perform(LI); bool Changed = false; - for (DomTreeNode *DTN : Worklist) { - BasicBlock *BB = DTN->getBlock(); + for (BasicBlock *BB : Worklist) { // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). if (inSubLoop(BB, CurLoop, LI)) continue; - // Keep track of whether the prefix instructions could have written memory. - // TODO: This may be done smarter if we keep track of all throwing and - // mem-writing operations in every block, e.g. using something similar to - // isGuaranteedToExecute. - bool IsMemoryNotModified = CurLoop->getHeader() == BB; - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { Instruction &I = *II++; // Try constant folding this instruction. If all the operands are @@ -486,13 +719,16 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // Try hoisting the instruction out to the preheader. We can only do // this if all of the operands of the instruction are loop invariant and // if it is safe to hoist the instruction. - // + // TODO: It may be safe to hoist if we are hoisting to a conditional block + // and we have accurately duplicated the control flow from the loop header + // to that block. if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, true, ORE) && isSafeToExecuteUnconditionally( I, DT, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { - hoist(I, DT, CurLoop, SafetyInfo, ORE); + hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, ORE); + HoistedInstructions.push_back(&I); Changed = true; continue; } @@ -517,7 +753,9 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, I.replaceAllUsesWith(Product); eraseInstruction(I, *SafetyInfo, CurAST); - hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE); + hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), + SafetyInfo, ORE); + HoistedInstructions.push_back(ReciprocalDivisor); Changed = true; continue; } @@ -526,18 +764,75 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, if (((I.use_empty() && match(&I, m_Intrinsic())) || isGuard(&I)) && - IsMemoryNotModified && CurLoop->hasLoopInvariantOperands(&I) && - SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) { - hoist(I, DT, CurLoop, SafetyInfo, ORE); + CurLoop->hasLoopInvariantOperands(&I) && + SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) && + SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop)) { + hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, ORE); + HoistedInstructions.push_back(&I); Changed = true; continue; } - if (IsMemoryNotModified) - IsMemoryNotModified = !I.mayWriteToMemory(); + if (PHINode *PN = dyn_cast(&I)) { + if (CFH.canHoistPHI(PN)) { + // Redirect incoming blocks first to ensure that we create hoisted + // versions of those blocks before we hoist the phi. + for (unsigned int i = 0; i < PN->getNumIncomingValues(); ++i) + PN->setIncomingBlock( + i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i))); + hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, + ORE); + assert(DT->dominates(PN, BB) && "Conditional PHIs not expected"); + Changed = true; + continue; + } + } + + // Remember possibly hoistable branches so we can actually hoist them + // later if needed. + if (BranchInst *BI = dyn_cast(&I)) + CFH.registerPossiblyHoistableBranch(BI); } } + // If we hoisted instructions to a conditional block they may not dominate + // their uses that weren't hoisted (such as phis where some operands are not + // loop invariant). If so make them unconditional by moving them to their + // immediate dominator. We iterate through the instructions in reverse order + // which ensures that when we rehoist an instruction we rehoist its operands, + // and also keep track of where in the block we are rehoisting to to make sure + // that we rehoist instructions before the instructions that use them. + Instruction *HoistPoint = nullptr; + if (ControlFlowHoisting) { + for (Instruction *I : reverse(HoistedInstructions)) { + if (!llvm::all_of(I->uses(), + [&](Use &U) { return DT->dominates(I, U); })) { + BasicBlock *Dominator = + DT->getNode(I->getParent())->getIDom()->getBlock(); + LLVM_DEBUG(dbgs() << "LICM rehoisting to " << Dominator->getName() + << ": " << *I << "\n"); + if (!HoistPoint || HoistPoint->getParent() != Dominator) { + if (HoistPoint) + assert(DT->dominates(Dominator, HoistPoint->getParent()) && + "New hoist point expected to dominate old hoist point"); + HoistPoint = Dominator->getTerminator(); + } + moveInstructionBefore(*I, *HoistPoint, *SafetyInfo); + HoistPoint = I; + Changed = true; + } + } + } + + // Now that we've finished hoisting make sure that LI and DT are still valid. +#ifndef NDEBUG + if (Changed) { + assert(DT->verify(DominatorTree::VerificationLevel::Fast) && + "Dominator tree verification failed"); + LI->verify(*DT); + } +#endif + return Changed; } @@ -890,6 +1185,13 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, I.eraseFromParent(); } +static void moveInstructionBefore(Instruction &I, Instruction &Dest, + ICFLoopSafetyInfo &SafetyInfo) { + SafetyInfo.removeInstruction(&I); + SafetyInfo.insertInstructionTo(Dest.getParent()); + I.moveBefore(&Dest); +} + static Instruction *sinkThroughTriviallyReplaceablePHI( PHINode *TPN, Instruction *I, LoopInfo *LI, SmallDenseMap &SunkCopies, @@ -1098,9 +1400,9 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, /// is safe to hoist, this instruction is called to do the dirty work. /// static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, - ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) { - auto *Preheader = CurLoop->getLoopPreheader(); - LLVM_DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I + BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE) { + LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getName() << ": " << I << "\n"); ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting " @@ -1118,10 +1420,12 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) I.dropUnknownNonDebugMetadata(); - SafetyInfo->removeInstruction(&I); - SafetyInfo->insertInstructionTo(Preheader); - // Move the new node to the Preheader, before its terminator. - I.moveBefore(Preheader->getTerminator()); + if (isa(I)) + // Move the new node to the end of the phi list in the destination block. + moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo); + else + // Move the new node to the destination block, before its terminator. + moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo); // Do not retain debug locations when we are moving instructions to different // basic blocks, because we want to avoid jumpy line tables. Calls, however, diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp index 06083a4f5086c1c99e876b1d1f0a095af817490a..d797c9dc9e726e02deb175f54387475f9b3f87ed 100644 --- a/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/lib/Transforms/Scalar/LoopDistribute.cpp @@ -78,6 +78,18 @@ using namespace llvm; #define LDIST_NAME "loop-distribute" #define DEBUG_TYPE LDIST_NAME +/// @{ +/// Metadata attribute names +static const char *const LLVMLoopDistributeFollowupAll = + "llvm.loop.distribute.followup_all"; +static const char *const LLVMLoopDistributeFollowupCoincident = + "llvm.loop.distribute.followup_coincident"; +static const char *const LLVMLoopDistributeFollowupSequential = + "llvm.loop.distribute.followup_sequential"; +static const char *const LLVMLoopDistributeFollowupFallback = + "llvm.loop.distribute.followup_fallback"; +/// @} + static cl::opt LDistVerify("loop-distribute-verify", cl::Hidden, cl::desc("Turn on DominatorTree and LoopInfo verification " @@ -186,7 +198,7 @@ public: /// Returns the loop where this partition ends up after distribution. /// If this partition is mapped to the original loop then use the block from /// the loop. - const Loop *getDistributedLoop() const { + Loop *getDistributedLoop() const { return ClonedLoop ? ClonedLoop : OrigLoop; } @@ -443,6 +455,9 @@ public: assert(&*OrigPH->begin() == OrigPH->getTerminator() && "preheader not empty"); + // Preserve the original loop ID for use after the transformation. + MDNode *OrigLoopID = L->getLoopID(); + // Create a loop for each partition except the last. Clone the original // loop before PH along with adding a preheader for the cloned loop. Then // update PH to point to the newly added preheader. @@ -457,9 +472,13 @@ public: Part->getVMap()[ExitBlock] = TopPH; Part->remapInstructions(); + setNewLoopID(OrigLoopID, Part); } Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); + // Also set a new loop ID for the last loop. + setNewLoopID(OrigLoopID, &PartitionContainer.back()); + // Now go in forward order and update the immediate dominator for the // preheaders with the exiting block of the previous loop. Dominance // within the loop is updated in cloneLoopWithPreheader. @@ -575,6 +594,19 @@ private: } } } + + /// Assign new LoopIDs for the partition's cloned loop. + void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) { + Optional PartitionID = makeFollowupLoopID( + OrigLoopID, + {LLVMLoopDistributeFollowupAll, + Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential + : LLVMLoopDistributeFollowupCoincident}); + if (PartitionID.hasValue()) { + Loop *NewLoop = Part->getDistributedLoop(); + NewLoop->setLoopID(PartitionID.getValue()); + } + } }; /// For each memory instruction, this class maintains difference of the @@ -743,6 +775,9 @@ public: return fail("TooManySCEVRuntimeChecks", "too many SCEV run-time checks needed.\n"); + if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L)) + return fail("HeuristicDisabled", "distribution heuristic disabled"); + LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); // We're done forming the partitions set up the reverse mapping from // instructions to partitions. @@ -762,6 +797,8 @@ public: RtPtrChecking); if (!Pred.isAlwaysTrue() || !Checks.empty()) { + MDNode *OrigLoopID = L->getLoopID(); + LLVM_DEBUG(dbgs() << "\nPointers:\n"); LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); LoopVersioning LVer(*LAI, L, LI, DT, SE, false); @@ -769,6 +806,17 @@ public: LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate()); LVer.versionLoop(DefsUsedOutside); LVer.annotateLoopWithNoAlias(); + + // The unversioned loop will not be changed, so we inherit all attributes + // from the original loop, but remove the loop distribution metadata to + // avoid to distribute it again. + MDNode *UnversionedLoopID = + makeFollowupLoopID(OrigLoopID, + {LLVMLoopDistributeFollowupAll, + LLVMLoopDistributeFollowupFallback}, + "llvm.loop.distribute.", true) + .getValue(); + LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID); } // Create identical copies of the original loop for each partition and hook diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp index 7a4ae2eb30307a590ad805f4da4c37e17203456f..766e39b439a0d762fb9f7e39d5267159040f0cc9 100644 --- a/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/lib/Transforms/Scalar/LoopInterchange.cpp @@ -339,10 +339,21 @@ public: bool currentLimitations(); + const SmallPtrSetImpl &getOuterInnerReductions() const { + return OuterInnerReductions; + } + private: bool tightlyNested(Loop *Outer, Loop *Inner); bool containsUnsafeInstructions(BasicBlock *BB); - bool findInductions(Loop *L, SmallVector &Inductions); + + /// Discover induction and reduction PHIs in the header of \p L. Induction + /// PHIs are added to \p Inductions, reductions are added to + /// OuterInnerReductions. When the outer loop is passed, the inner loop needs + /// to be passed as \p InnerLoop. + bool findInductionAndReductions(Loop *L, + SmallVector &Inductions, + Loop *InnerLoop); Loop *OuterLoop; Loop *InnerLoop; @@ -352,6 +363,9 @@ private: /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; + /// Set of reduction PHIs taking part of a reduction across the inner and + /// outer loop. + SmallPtrSet OuterInnerReductions; }; /// LoopInterchangeProfitability checks if it is profitable to interchange the @@ -384,9 +398,10 @@ class LoopInterchangeTransform { public: LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, - BasicBlock *LoopNestExit) + BasicBlock *LoopNestExit, + const LoopInterchangeLegality &LIL) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), - LoopExit(LoopNestExit) {} + LoopExit(LoopNestExit), LIL(LIL) {} /// Interchange OuterLoop and InnerLoop. bool transform(); @@ -411,6 +426,8 @@ private: LoopInfo *LI; DominatorTree *DT; BasicBlock *LoopExit; + + const LoopInterchangeLegality &LIL; }; // Main LoopInterchange Pass. @@ -560,8 +577,8 @@ struct LoopInterchange : public LoopPass { << "Loop interchanged with enclosing loop."; }); - LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, - LoopNestExit); + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, + LIL); LIT.transform(); LLVM_DEBUG(dbgs() << "Loops interchanged.\n"); LoopsInterchanged++; @@ -633,8 +650,36 @@ bool LoopInterchangeLegality::isLoopStructureUnderstood( return true; } -bool LoopInterchangeLegality::findInductions( - Loop *L, SmallVector &Inductions) { +// If SV is a LCSSA PHI node with a single incoming value, return the incoming +// value. +static Value *followLCSSA(Value *SV) { + PHINode *PHI = dyn_cast(SV); + if (!PHI) + return SV; + + if (PHI->getNumIncomingValues() != 1) + return SV; + return followLCSSA(PHI->getIncomingValue(0)); +} + +// Check V's users to see if it is involved in a reduction in L. +static PHINode *findInnerReductionPhi(Loop *L, Value *V) { + for (Value *User : V->users()) { + if (PHINode *PHI = dyn_cast(User)) { + if (PHI->getNumIncomingValues() == 1) + continue; + RecurrenceDescriptor RD; + if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) + return PHI; + return nullptr; + } + } + + return nullptr; +} + +bool LoopInterchangeLegality::findInductionAndReductions( + Loop *L, SmallVector &Inductions, Loop *InnerLoop) { if (!L->getLoopLatch() || !L->getLoopPredecessor()) return false; for (PHINode &PHI : L->getHeader()->phis()) { @@ -643,8 +688,32 @@ bool LoopInterchangeLegality::findInductions( if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) Inductions.push_back(&PHI); else { - LLVM_DEBUG(dbgs() << "Failed to recognize PHI as an induction.\n"); - return false; + // PHIs in inner loops need to be part of a reduction in the outer loop, + // discovered when checking the PHIs of the outer loop earlier. + if (!InnerLoop) { + if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end()) { + LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions " + "across the outer loop.\n"); + return false; + } + } else { + assert(PHI.getNumIncomingValues() == 2 && + "Phis in loop header should have exactly 2 incoming values"); + // Check if we have a PHI node in the outer loop that has a reduction + // result from the inner loop as an incoming value. + Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch())); + PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V); + if (!InnerRedPhi || + !llvm::any_of(InnerRedPhi->incoming_values(), + [&PHI](Value *V) { return V == &PHI; })) { + LLVM_DEBUG( + dbgs() + << "Failed to recognize PHI as an induction or reduction.\n"); + return false; + } + OuterInnerReductions.insert(&PHI); + OuterInnerReductions.insert(InnerRedPhi); + } } } return true; @@ -693,63 +762,64 @@ bool LoopInterchangeLegality::currentLimitations() { PHINode *InnerInductionVar; SmallVector Inductions; - if (!findInductions(InnerLoop, Inductions)) { + if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) { LLVM_DEBUG( - dbgs() << "Only inner loops with induction or reduction PHI nodes " + dbgs() << "Only outer loops with induction or reduction PHI nodes " << "are supported currently.\n"); ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Only inner loops with induction or reduction PHI nodes can be" - " interchange currently."; + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with induction or reduction PHI nodes can be" + " interchanged currently."; }); return true; } // TODO: Currently we handle only loops with 1 induction variable. if (Inductions.size() != 1) { - LLVM_DEBUG( - dbgs() << "We currently only support loops with 1 induction variable." - << "Failed to interchange due to current limitation\n"); + LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not " + << "supported currently.\n"); ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Only inner loops with 1 induction variable can be " + return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with 1 induction variable can be " "interchanged currently."; }); return true; } - InnerInductionVar = Inductions.pop_back_val(); - if (!findInductions(OuterLoop, Inductions)) { + Inductions.clear(); + if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) { LLVM_DEBUG( - dbgs() << "Only outer loops with induction or reduction PHI nodes " + dbgs() << "Only inner loops with induction or reduction PHI nodes " << "are supported currently.\n"); ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Only outer loops with induction or reduction PHI nodes can be" - " interchanged currently."; + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with induction or reduction PHI nodes can be" + " interchange currently."; }); return true; } // TODO: Currently we handle only loops with 1 induction variable. if (Inductions.size() != 1) { - LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not " - << "supported currently.\n"); + LLVM_DEBUG( + dbgs() << "We currently only support loops with 1 induction variable." + << "Failed to interchange due to current limitation\n"); ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Only outer loops with 1 induction variable can be " + return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with 1 induction variable can be " "interchanged currently."; }); return true; } + InnerInductionVar = Inductions.pop_back_val(); // TODO: Triangular loops are not handled for now. if (!isLoopStructureUnderstood(InnerInductionVar)) { @@ -1387,13 +1457,29 @@ bool LoopInterchangeTransform::adjustLoopBranches() { // replaced by Inners'. updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch); - // Make sure we have no other PHIs. - auto InnerPhis = drop_begin(InnerLoopHeader->phis(), 1); - auto OuterPhis = drop_begin(OuterLoopHeader->phis(), 1); - (void) InnerPhis; - (void) OuterPhis; - assert(begin(InnerPhis) == end(InnerPhis) && "Unexpected PHIs in inner loop"); - assert(begin(OuterPhis) == end(OuterPhis) && "Unexpected PHis in outer loop"); + // Now update the reduction PHIs in the inner and outer loop headers. + SmallVector InnerLoopPHIs, OuterLoopPHIs; + for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1)) + InnerLoopPHIs.push_back(cast(&PHI)); + for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1)) + OuterLoopPHIs.push_back(cast(&PHI)); + + auto &OuterInnerReductions = LIL.getOuterInnerReductions(); + (void)OuterInnerReductions; + + // Now move the remaining reduction PHIs from outer to inner loop header and + // vice versa. The PHI nodes must be part of a reduction across the inner and + // outer loop and all the remains to do is and updating the incoming blocks. + for (PHINode *PHI : OuterLoopPHIs) { + PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); + assert(OuterInnerReductions.find(PHI) != OuterInnerReductions.end() && + "Expected a reduction PHI node"); + } + for (PHINode *PHI : InnerLoopPHIs) { + PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); + assert(OuterInnerReductions.find(PHI) != OuterInnerReductions.end() && + "Expected a reduction PHI node"); + } // Update the incoming blocks for moved PHI nodes. updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader); diff --git a/lib/Transforms/Scalar/LoopPassManager.cpp b/lib/Transforms/Scalar/LoopPassManager.cpp index 6759bd2537c6fbaa0364edd9a16895031f7ef6ce..774ad7b945a0f282fff4a27f16142c0c1d8cce0d 100644 --- a/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/lib/Transforms/Scalar/LoopPassManager.cpp @@ -44,7 +44,11 @@ PassManagerrun(L, AM, AR, U); - PI.runAfterPass(*Pass, L); + // do not pass deleted Loop into the instrumentation + if (U.skipCurrentLoop()) + PI.runAfterPassInvalidated(*Pass); + else + PI.runAfterPass(*Pass, L); // If the loop was deleted, abort the run and return to the outer walk. if (U.skipCurrentLoop()) { diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 6cac3787311794de4dd69c9ecd8c4c09e035e2c4..c370efa120772714e2d3f56bc4e3c60e3cfcc71d 100644 --- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -41,6 +41,389 @@ using namespace llvm; #define DEBUG_TYPE "loop-simplifycfg" +static cl::opt EnableTermFolding("enable-loop-simplifycfg-term-folding", + cl::init(true)); + +STATISTIC(NumTerminatorsFolded, + "Number of terminators folded to unconditional branches"); + +/// If \p BB is a switch or a conditional branch, but only one of its successors +/// can be reached from this block in runtime, return this successor. Otherwise, +/// return nullptr. +static BasicBlock *getOnlyLiveSuccessor(BasicBlock *BB) { + Instruction *TI = BB->getTerminator(); + if (BranchInst *BI = dyn_cast(TI)) { + if (BI->isUnconditional()) + return nullptr; + if (BI->getSuccessor(0) == BI->getSuccessor(1)) + return BI->getSuccessor(0); + ConstantInt *Cond = dyn_cast(BI->getCondition()); + if (!Cond) + return nullptr; + return Cond->isZero() ? BI->getSuccessor(1) : BI->getSuccessor(0); + } + + if (SwitchInst *SI = dyn_cast(TI)) { + auto *CI = dyn_cast(SI->getCondition()); + if (!CI) + return nullptr; + for (auto Case : SI->cases()) + if (Case.getCaseValue() == CI) + return Case.getCaseSuccessor(); + return SI->getDefaultDest(); + } + + return nullptr; +} + +/// Helper class that can turn branches and switches with constant conditions +/// into unconditional branches. +class ConstantTerminatorFoldingImpl { +private: + Loop &L; + LoopInfo &LI; + DominatorTree &DT; + MemorySSAUpdater *MSSAU; + + // Whether or not the current loop has irreducible CFG. + bool HasIrreducibleCFG = false; + // Whether or not the current loop will still exist after terminator constant + // folding will be done. In theory, there are two ways how it can happen: + // 1. Loop's latch(es) become unreachable from loop header; + // 2. Loop's header becomes unreachable from method entry. + // In practice, the second situation is impossible because we only modify the + // current loop and its preheader and do not affect preheader's reachibility + // from any other block. So this variable set to true means that loop's latch + // has become unreachable from loop header. + bool DeleteCurrentLoop = false; + + // The blocks of the original loop that will still be reachable from entry + // after the constant folding. + SmallPtrSet LiveLoopBlocks; + // The blocks of the original loop that will become unreachable from entry + // after the constant folding. + SmallPtrSet DeadLoopBlocks; + // The exits of the original loop that will still be reachable from entry + // after the constant folding. + SmallPtrSet LiveExitBlocks; + // The exits of the original loop that will become unreachable from entry + // after the constant folding. + SmallVector DeadExitBlocks; + // The blocks that will still be a part of the current loop after folding. + SmallPtrSet BlocksInLoopAfterFolding; + // The blocks that have terminators with constant condition that can be + // folded. Note: fold candidates should be in L but not in any of its + // subloops to avoid complex LI updates. + SmallVector FoldCandidates; + + void dump() const { + dbgs() << "Constant terminator folding for loop " << L << "\n"; + dbgs() << "After terminator constant-folding, the loop will"; + if (!DeleteCurrentLoop) + dbgs() << " not"; + dbgs() << " be destroyed\n"; + auto PrintOutVector = [&](const char *Message, + const SmallVectorImpl &S) { + dbgs() << Message << "\n"; + for (const BasicBlock *BB : S) + dbgs() << "\t" << BB->getName() << "\n"; + }; + auto PrintOutSet = [&](const char *Message, + const SmallPtrSetImpl &S) { + dbgs() << Message << "\n"; + for (const BasicBlock *BB : S) + dbgs() << "\t" << BB->getName() << "\n"; + }; + PrintOutVector("Blocks in which we can constant-fold terminator:", + FoldCandidates); + PrintOutSet("Live blocks from the original loop:", LiveLoopBlocks); + PrintOutSet("Dead blocks from the original loop:", DeadLoopBlocks); + PrintOutSet("Live exit blocks:", LiveExitBlocks); + PrintOutVector("Dead exit blocks:", DeadExitBlocks); + if (!DeleteCurrentLoop) + PrintOutSet("The following blocks will still be part of the loop:", + BlocksInLoopAfterFolding); + } + + /// Whether or not the current loop has irreducible CFG. + bool hasIrreducibleCFG(LoopBlocksDFS &DFS) { + assert(DFS.isComplete() && "DFS is expected to be finished"); + // Index of a basic block in RPO traversal. + DenseMap RPO; + unsigned Current = 0; + for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) + RPO[*I] = Current++; + + for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) { + BasicBlock *BB = *I; + for (auto *Succ : successors(BB)) + if (L.contains(Succ) && !LI.isLoopHeader(Succ) && RPO[BB] > RPO[Succ]) + // If an edge goes from a block with greater order number into a block + // with lesses number, and it is not a loop backedge, then it can only + // be a part of irreducible non-loop cycle. + return true; + } + return false; + } + + /// Fill all information about status of blocks and exits of the current loop + /// if constant folding of all branches will be done. + void analyze() { + LoopBlocksDFS DFS(&L); + DFS.perform(&LI); + assert(DFS.isComplete() && "DFS is expected to be finished"); + + // TODO: The algorithm below relies on both RPO and Postorder traversals. + // When the loop has only reducible CFG inside, then the invariant "all + // predecessors of X are processed before X in RPO" is preserved. However + // an irreducible loop can break this invariant (e.g. latch does not have to + // be the last block in the traversal in this case, and the algorithm relies + // on this). We can later decide to support such cases by altering the + // algorithms, but so far we just give up analyzing them. + if (hasIrreducibleCFG(DFS)) { + HasIrreducibleCFG = true; + return; + } + + // Collect live and dead loop blocks and exits. + LiveLoopBlocks.insert(L.getHeader()); + for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) { + BasicBlock *BB = *I; + + // If a loop block wasn't marked as live so far, then it's dead. + if (!LiveLoopBlocks.count(BB)) { + DeadLoopBlocks.insert(BB); + continue; + } + + BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB); + + // If a block has only one live successor, it's a candidate on constant + // folding. Only handle blocks from current loop: branches in child loops + // are skipped because if they can be folded, they should be folded during + // the processing of child loops. + if (TheOnlySucc && LI.getLoopFor(BB) == &L) + FoldCandidates.push_back(BB); + + // Handle successors. + for (BasicBlock *Succ : successors(BB)) + if (!TheOnlySucc || TheOnlySucc == Succ) { + if (L.contains(Succ)) + LiveLoopBlocks.insert(Succ); + else + LiveExitBlocks.insert(Succ); + } + } + + // Sanity check: amount of dead and live loop blocks should match the total + // number of blocks in loop. + assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() && + "Malformed block sets?"); + + // Now, all exit blocks that are not marked as live are dead. + SmallVector ExitBlocks; + L.getExitBlocks(ExitBlocks); + for (auto *ExitBlock : ExitBlocks) + if (!LiveExitBlocks.count(ExitBlock)) + DeadExitBlocks.push_back(ExitBlock); + + // Whether or not the edge From->To will still be present in graph after the + // folding. + auto IsEdgeLive = [&](BasicBlock *From, BasicBlock *To) { + if (!LiveLoopBlocks.count(From)) + return false; + BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(From); + return !TheOnlySucc || TheOnlySucc == To; + }; + + // The loop will not be destroyed if its latch is live. + DeleteCurrentLoop = !IsEdgeLive(L.getLoopLatch(), L.getHeader()); + + // If we are going to delete the current loop completely, no extra analysis + // is needed. + if (DeleteCurrentLoop) + return; + + // Otherwise, we should check which blocks will still be a part of the + // current loop after the transform. + BlocksInLoopAfterFolding.insert(L.getLoopLatch()); + // If the loop is live, then we should compute what blocks are still in + // loop after all branch folding has been done. A block is in loop if + // it has a live edge to another block that is in the loop; by definition, + // latch is in the loop. + auto BlockIsInLoop = [&](BasicBlock *BB) { + return any_of(successors(BB), [&](BasicBlock *Succ) { + return BlocksInLoopAfterFolding.count(Succ) && IsEdgeLive(BB, Succ); + }); + }; + for (auto I = DFS.beginPostorder(), E = DFS.endPostorder(); I != E; ++I) { + BasicBlock *BB = *I; + if (BlockIsInLoop(BB)) + BlocksInLoopAfterFolding.insert(BB); + } + + // Sanity check: header must be in loop. + assert(BlocksInLoopAfterFolding.count(L.getHeader()) && + "Header not in loop?"); + assert(BlocksInLoopAfterFolding.size() <= LiveLoopBlocks.size() && + "All blocks that stay in loop should be live!"); + } + + /// Constant-fold terminators of blocks acculumated in FoldCandidates into the + /// unconditional branches. + void foldTerminators() { + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + + for (BasicBlock *BB : FoldCandidates) { + assert(LI.getLoopFor(BB) == &L && "Should be a loop block!"); + BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB); + assert(TheOnlySucc && "Should have one live successor!"); + + LLVM_DEBUG(dbgs() << "Replacing terminator of " << BB->getName() + << " with an unconditional branch to the block " + << TheOnlySucc->getName() << "\n"); + + SmallPtrSet DeadSuccessors; + // Remove all BB's successors except for the live one. + unsigned TheOnlySuccDuplicates = 0; + for (auto *Succ : successors(BB)) + if (Succ != TheOnlySucc) { + DeadSuccessors.insert(Succ); + // If our successor lies in a different loop, we don't want to remove + // the one-input Phi because it is a LCSSA Phi. + bool PreserveLCSSAPhi = !L.contains(Succ); + Succ->removePredecessor(BB, PreserveLCSSAPhi); + if (MSSAU) + MSSAU->removeEdge(BB, Succ); + } else + ++TheOnlySuccDuplicates; + + assert(TheOnlySuccDuplicates > 0 && "Should be!"); + // If TheOnlySucc was BB's successor more than once, after transform it + // will be its successor only once. Remove redundant inputs from + // TheOnlySucc's Phis. + bool PreserveLCSSAPhi = !L.contains(TheOnlySucc); + for (unsigned Dup = 1; Dup < TheOnlySuccDuplicates; ++Dup) + TheOnlySucc->removePredecessor(BB, PreserveLCSSAPhi); + if (MSSAU && TheOnlySuccDuplicates > 1) + MSSAU->removeDuplicatePhiEdgesBetween(BB, TheOnlySucc); + + IRBuilder<> Builder(BB->getContext()); + Instruction *Term = BB->getTerminator(); + Builder.SetInsertPoint(Term); + Builder.CreateBr(TheOnlySucc); + Term->eraseFromParent(); + + for (auto *DeadSucc : DeadSuccessors) + DTU.deleteEdge(BB, DeadSucc); + + ++NumTerminatorsFolded; + } + } + +public: + ConstantTerminatorFoldingImpl(Loop &L, LoopInfo &LI, DominatorTree &DT, + MemorySSAUpdater *MSSAU) + : L(L), LI(LI), DT(DT), MSSAU(MSSAU) {} + bool run() { + assert(L.getLoopLatch() && "Should be single latch!"); + + // Collect all available information about status of blocks after constant + // folding. + analyze(); + + LLVM_DEBUG(dbgs() << "In function " << L.getHeader()->getParent()->getName() + << ": "); + + if (HasIrreducibleCFG) { + LLVM_DEBUG(dbgs() << "Loops with irreducible CFG are not supported!\n"); + return false; + } + + // Nothing to constant-fold. + if (FoldCandidates.empty()) { + LLVM_DEBUG( + dbgs() << "No constant terminator folding candidates found in loop " + << L.getHeader()->getName() << "\n"); + return false; + } + + // TODO: Support deletion of the current loop. + if (DeleteCurrentLoop) { + LLVM_DEBUG( + dbgs() + << "Give up constant terminator folding in loop " + << L.getHeader()->getName() + << ": we don't currently support deletion of the current loop.\n"); + return false; + } + + // TODO: Support deletion of dead loop blocks. + if (!DeadLoopBlocks.empty()) { + LLVM_DEBUG(dbgs() << "Give up constant terminator folding in loop " + << L.getHeader()->getName() + << ": we don't currently" + " support deletion of dead in-loop blocks.\n"); + return false; + } + + // TODO: Support dead loop exits. + if (!DeadExitBlocks.empty()) { + LLVM_DEBUG(dbgs() << "Give up constant terminator folding in loop " + << L.getHeader()->getName() + << ": we don't currently support dead loop exits.\n"); + return false; + } + + // TODO: Support blocks that are not dead, but also not in loop after the + // folding. + if (BlocksInLoopAfterFolding.size() != L.getNumBlocks()) { + LLVM_DEBUG( + dbgs() << "Give up constant terminator folding in loop " + << L.getHeader()->getName() + << ": we don't currently" + " support blocks that are not dead, but will stop " + "being a part of the loop after constant-folding.\n"); + return false; + } + + // Dump analysis results. + LLVM_DEBUG(dump()); + + LLVM_DEBUG(dbgs() << "Constant-folding " << FoldCandidates.size() + << " terminators in loop " << L.getHeader()->getName() + << "\n"); + + // Make the actual transforms. + foldTerminators(); + +#ifndef NDEBUG + // Make sure that we have preserved all data structures after the transform. + DT.verify(); + assert(DT.isReachableFromEntry(L.getHeader())); + LI.verify(DT); +#endif + + return true; + } +}; + +/// Turn branches and switches with known constant conditions into unconditional +/// branches. +static bool constantFoldTerminators(Loop &L, DominatorTree &DT, LoopInfo &LI, + MemorySSAUpdater *MSSAU) { + if (!EnableTermFolding) + return false; + + // To keep things simple, only process loops with single latch. We + // canonicalize most loops to this form. We can support multi-latch if needed. + if (!L.getLoopLatch()) + return false; + + ConstantTerminatorFoldingImpl BranchFolder(L, LI, DT, MSSAU); + return BranchFolder.run(); +} + static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) { bool Changed = false; @@ -73,6 +456,9 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE, MemorySSAUpdater *MSSAU) { bool Changed = false; + // Constant-fold terminators with known constant conditions. + Changed |= constantFoldTerminators(L, DT, LI, MSSAU); + // Eliminate unconditional branches by merging blocks into their predecessors. Changed |= mergeBlocksIntoPredecessors(L, DT, LI, MSSAU); diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp index ce6ecea2dc2a8dcf6bdd3059175b095811d32b4f..540d19fc7939199b63c7c753fe1b9880583e4f84 100644 --- a/lib/Transforms/Scalar/LoopSink.cpp +++ b/lib/Transforms/Scalar/LoopSink.cpp @@ -280,6 +280,7 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, // Compute alias set. for (BasicBlock *BB : L.blocks()) CurAST.add(*BB); + CurAST.add(*Preheader); // Sort loop's basic blocks by frequency SmallVector ColdLoopBBs; diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 857b83da96d8cf84502e6bd4c3d3374bc2b0a04b..773ffb9df0a2804a7d70898945c73f99dcf8ef90 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -155,6 +155,11 @@ static cl::opt FilterSameScaledReg( cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale")); +static cl::opt ComplexityLimit( + "lsr-complexity-limit", cl::Hidden, + cl::init(std::numeric_limits::max()), + cl::desc("LSR search space complexity limit")); + #ifndef NDEBUG // Stress test IV chain generation. static cl::opt StressIVChain( @@ -3638,32 +3643,62 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. - if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1) + if (Base.BaseRegs.size() + (Base.Scale == 1) + + (Base.UnfoldedOffset != 0) <= 1) return; // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before // processing the formula. Base.unscale(); - Formula F = Base; - F.BaseRegs.clear(); SmallVector Ops; + Formula NewBase = Base; + NewBase.BaseRegs.clear(); + Type *CombinedIntegerType = nullptr; for (const SCEV *BaseReg : Base.BaseRegs) { if (SE.properlyDominates(BaseReg, L->getHeader()) && - !SE.hasComputableLoopEvolution(BaseReg, L)) + !SE.hasComputableLoopEvolution(BaseReg, L)) { + if (!CombinedIntegerType) + CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType()); Ops.push_back(BaseReg); + } else - F.BaseRegs.push_back(BaseReg); + NewBase.BaseRegs.push_back(BaseReg); } - if (Ops.size() > 1) { - const SCEV *Sum = SE.getAddExpr(Ops); + + // If no register is relevant, we're done. + if (Ops.size() == 0) + return; + + // Utility function for generating the required variants of the combined + // registers. + auto GenerateFormula = [&](const SCEV *Sum) { + Formula F = NewBase; + // TODO: If Sum is zero, it probably means ScalarEvolution missed an // opportunity to fold something. For now, just ignore such cases // rather than proceed with zero in a register. - if (!Sum->isZero()) { - F.BaseRegs.push_back(Sum); - F.canonicalize(*L); - (void)InsertFormula(LU, LUIdx, F); - } + if (Sum->isZero()) + return; + + F.BaseRegs.push_back(Sum); + F.canonicalize(*L); + (void)InsertFormula(LU, LUIdx, F); + }; + + // If we collected at least two registers, generate a formula combining them. + if (Ops.size() > 1) { + SmallVector OpsCopy(Ops); // Don't let SE modify Ops. + GenerateFormula(SE.getAddExpr(OpsCopy)); + } + + // If we have an unfolded offset, generate a formula combining it with the + // registers collected. + if (NewBase.UnfoldedOffset) { + assert(CombinedIntegerType && "Missing a type for the unfolded offset"); + Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset, + true)); + NewBase.UnfoldedOffset = 0; + GenerateFormula(SE.getAddExpr(Ops)); } } @@ -4281,9 +4316,6 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { }); } -// This is a rough guess that seems to work fairly well. -static const size_t ComplexityLimit = std::numeric_limits::max(); - /// Estimate the worst-case number of solutions the solver might have to /// consider. It almost never considers this many solutions because it prune the /// search space, but the pruning isn't always sufficient. diff --git a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 30dfb9b5dd26459d1e7652b8d5090a84a314c50a..da46210b6fdd21facd3c53d0c59801e880101619 100644 --- a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -56,6 +56,20 @@ using namespace llvm; #define DEBUG_TYPE "loop-unroll-and-jam" +/// @{ +/// Metadata attribute names +static const char *const LLVMLoopUnrollAndJamFollowupAll = + "llvm.loop.unroll_and_jam.followup_all"; +static const char *const LLVMLoopUnrollAndJamFollowupInner = + "llvm.loop.unroll_and_jam.followup_inner"; +static const char *const LLVMLoopUnrollAndJamFollowupOuter = + "llvm.loop.unroll_and_jam.followup_outer"; +static const char *const LLVMLoopUnrollAndJamFollowupRemainderInner = + "llvm.loop.unroll_and_jam.followup_remainder_inner"; +static const char *const LLVMLoopUnrollAndJamFollowupRemainderOuter = + "llvm.loop.unroll_and_jam.followup_remainder_outer"; +/// @} + static cl::opt AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden, cl::desc("Allows loops to be unroll-and-jammed.")); @@ -112,11 +126,6 @@ static bool HasUnrollAndJamEnablePragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable"); } -// Returns true if the loop has an unroll_and_jam(disable) pragma. -static bool HasUnrollAndJamDisablePragma(const Loop *L) { - return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.disable"); -} - // If loop has an unroll_and_jam_count pragma return the (necessarily // positive) value from the pragma. Otherwise return 0. static unsigned UnrollAndJamCountPragmaValue(const Loop *L) { @@ -299,13 +308,16 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, << L->getHeader()->getParent()->getName() << "] Loop %" << L->getHeader()->getName() << "\n"); + TransformationMode EnableMode = hasUnrollAndJamTransformation(L); + if (EnableMode & TM_Disable) + return LoopUnrollResult::Unmodified; + // A loop with any unroll pragma (enabling/disabling/count/etc) is left for // the unroller, so long as it does not explicitly have unroll_and_jam // metadata. This means #pragma nounroll will disable unroll and jam as well // as unrolling - if (HasUnrollAndJamDisablePragma(L) || - (HasAnyUnrollPragma(L, "llvm.loop.unroll.") && - !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam."))) { + if (HasAnyUnrollPragma(L, "llvm.loop.unroll.") && + !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam.")) { LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n"); return LoopUnrollResult::Unmodified; } @@ -344,6 +356,19 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, return LoopUnrollResult::Unmodified; } + // Save original loop IDs for after the transformation. + MDNode *OrigOuterLoopID = L->getLoopID(); + MDNode *OrigSubLoopID = SubLoop->getLoopID(); + + // To assign the loop id of the epilogue, assign it before unrolling it so it + // is applied to every inner loop of the epilogue. We later apply the loop ID + // for the jammed inner loop. + Optional NewInnerEpilogueLoopID = makeFollowupLoopID( + OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, + LLVMLoopUnrollAndJamFollowupRemainderInner}); + if (NewInnerEpilogueLoopID.hasValue()) + SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue()); + // Find trip count and trip multiple unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch); unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch); @@ -359,9 +384,39 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, if (OuterTripCount && UP.Count > OuterTripCount) UP.Count = OuterTripCount; - LoopUnrollResult UnrollResult = - UnrollAndJamLoop(L, UP.Count, OuterTripCount, OuterTripMultiple, - UP.UnrollRemainder, LI, &SE, &DT, &AC, &ORE); + Loop *EpilogueOuterLoop = nullptr; + LoopUnrollResult UnrollResult = UnrollAndJamLoop( + L, UP.Count, OuterTripCount, OuterTripMultiple, UP.UnrollRemainder, LI, + &SE, &DT, &AC, &ORE, &EpilogueOuterLoop); + + // Assign new loop attributes. + if (EpilogueOuterLoop) { + Optional NewOuterEpilogueLoopID = makeFollowupLoopID( + OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, + LLVMLoopUnrollAndJamFollowupRemainderOuter}); + if (NewOuterEpilogueLoopID.hasValue()) + EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.getValue()); + } + + Optional NewInnerLoopID = + makeFollowupLoopID(OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, + LLVMLoopUnrollAndJamFollowupInner}); + if (NewInnerLoopID.hasValue()) + SubLoop->setLoopID(NewInnerLoopID.getValue()); + else + SubLoop->setLoopID(OrigSubLoopID); + + if (UnrollResult == LoopUnrollResult::PartiallyUnrolled) { + Optional NewOuterLoopID = makeFollowupLoopID( + OrigOuterLoopID, + {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupOuter}); + if (NewOuterLoopID.hasValue()) { + L->setLoopID(NewOuterLoopID.getValue()); + + // Do not setLoopAlreadyUnrolled if a followup was given. + return UnrollResult; + } + } // If loop has an unroll count pragma or unrolled by explicitly set count // mark loop as unrolled to prevent unrolling beyond that requested. diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index d10dae124a79477cd383c4c41584e2d942a386fa..38b80f48ed0e2ccf5bb86432db3905d9e6b915b4 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -661,11 +661,6 @@ static bool HasUnrollEnablePragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable"); } -// Returns true if the loop has an unroll(disable) pragma. -static bool HasUnrollDisablePragma(const Loop *L) { - return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable"); -} - // Returns true if the loop has an runtime unroll(disable) pragma. static bool HasRuntimeUnrollDisablePragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable"); @@ -713,12 +708,19 @@ static uint64_t getUnrolledLoopSize( // Returns true if unroll count was set explicitly. // Calculates unroll count and writes it to UP.Count. +// Unless IgnoreUser is true, will also use metadata and command-line options +// that are specific to to the LoopUnroll pass (which, for instance, are +// irrelevant for the LoopUnrollAndJam pass). +// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes +// many LoopUnroll-specific options. The shared functionality should be +// refactored into it own function. bool llvm::computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl &EphValues, OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { + // Check for explicit Count. // 1st priority is unroll count set by "unroll-count" option. bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; @@ -963,13 +965,15 @@ static LoopUnrollResult tryToUnrollLoop( Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel, - Optional ProvidedCount, Optional ProvidedThreshold, - Optional ProvidedAllowPartial, Optional ProvidedRuntime, - Optional ProvidedUpperBound, Optional ProvidedAllowPeeling) { + bool OnlyWhenForced, Optional ProvidedCount, + Optional ProvidedThreshold, Optional ProvidedAllowPartial, + Optional ProvidedRuntime, Optional ProvidedUpperBound, + Optional ProvidedAllowPeeling) { LLVM_DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() << "] Loop %" << L->getHeader()->getName() << "\n"); - if (HasUnrollDisablePragma(L)) + TransformationMode TM = hasUnrollTransformation(L); + if (TM & TM_Disable) return LoopUnrollResult::Unmodified; if (!L->isLoopSimplifyForm()) { LLVM_DEBUG( @@ -977,6 +981,11 @@ static LoopUnrollResult tryToUnrollLoop( return LoopUnrollResult::Unmodified; } + // When automtatic unrolling is disabled, do not unroll unless overridden for + // this loop. + if (OnlyWhenForced && !(TM & TM_Enable)) + return LoopUnrollResult::Unmodified; + unsigned NumInlineCandidates; bool NotDuplicatable; bool Convergent; @@ -1066,14 +1075,39 @@ static LoopUnrollResult tryToUnrollLoop( if (TripCount && UP.Count > TripCount) UP.Count = TripCount; + // Save loop properties before it is transformed. + MDNode *OrigLoopID = L->getLoopID(); + // Unroll the loop. + Loop *RemainderLoop = nullptr; LoopUnrollResult UnrollResult = UnrollLoop( L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder, - LI, &SE, &DT, &AC, &ORE, PreserveLCSSA); + LI, &SE, &DT, &AC, &ORE, PreserveLCSSA, &RemainderLoop); if (UnrollResult == LoopUnrollResult::Unmodified) return LoopUnrollResult::Unmodified; + if (RemainderLoop) { + Optional RemainderLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll, + LLVMLoopUnrollFollowupRemainder}); + if (RemainderLoopID.hasValue()) + RemainderLoop->setLoopID(RemainderLoopID.getValue()); + } + + if (UnrollResult != LoopUnrollResult::FullyUnrolled) { + Optional NewLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll, + LLVMLoopUnrollFollowupUnrolled}); + if (NewLoopID.hasValue()) { + L->setLoopID(NewLoopID.getValue()); + + // Do not setLoopAlreadyUnrolled if loop attributes have been specified + // explicitly. + return UnrollResult; + } + } + // If loop has an unroll count pragma or unrolled by explicitly set count // mark loop as unrolled to prevent unrolling beyond that requested. // If the loop was peeled, we already "used up" the profile information @@ -1092,6 +1126,12 @@ public: static char ID; // Pass ID, replacement for typeid int OptLevel; + + /// If false, use a cost model to determine whether unrolling of a loop is + /// profitable. If true, only loops that explicitly request unrolling via + /// metadata are considered. All other loops are skipped. + bool OnlyWhenForced; + Optional ProvidedCount; Optional ProvidedThreshold; Optional ProvidedAllowPartial; @@ -1099,15 +1139,16 @@ public: Optional ProvidedUpperBound; Optional ProvidedAllowPeeling; - LoopUnroll(int OptLevel = 2, Optional Threshold = None, + LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false, + Optional Threshold = None, Optional Count = None, Optional AllowPartial = None, Optional Runtime = None, Optional UpperBound = None, Optional AllowPeeling = None) - : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)), - ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), - ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound), - ProvidedAllowPeeling(AllowPeeling) { + : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced), + ProvidedCount(std::move(Count)), ProvidedThreshold(Threshold), + ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime), + ProvidedUpperBound(UpperBound), ProvidedAllowPeeling(AllowPeeling) { initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -1130,8 +1171,8 @@ public: bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); LoopUnrollResult Result = tryToUnrollLoop( - L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, ProvidedCount, - ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, + L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, OnlyWhenForced, + ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling); if (Result == LoopUnrollResult::FullyUnrolled) @@ -1161,14 +1202,16 @@ INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count, - int AllowPartial, int Runtime, int UpperBound, +Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced, + int Threshold, int Count, int AllowPartial, + int Runtime, int UpperBound, int AllowPeeling) { // TODO: It would make more sense for this function to take the optionals // directly, but that's dangerous since it would silently break out of tree // callers. return new LoopUnroll( - OptLevel, Threshold == -1 ? None : Optional(Threshold), + OptLevel, OnlyWhenForced, + Threshold == -1 ? None : Optional(Threshold), Count == -1 ? None : Optional(Count), AllowPartial == -1 ? None : Optional(AllowPartial), Runtime == -1 ? None : Optional(Runtime), @@ -1176,8 +1219,8 @@ Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count, AllowPeeling == -1 ? None : Optional(AllowPeeling)); } -Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) { - return createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0, 0); +Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced) { + return createLoopUnrollPass(OptLevel, OnlyWhenForced, -1, -1, 0, 0, 0, 0); } PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, @@ -1207,7 +1250,8 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, - /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, + /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced, + /*Count*/ None, /*Threshold*/ None, /*AllowPartial*/ false, /*Runtime*/ false, /*UpperBound*/ false, /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified; @@ -1344,7 +1388,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, // flavors of unrolling during construction time (by setting UnrollOpts). LoopUnrollResult Result = tryToUnrollLoop( &L, DT, &LI, SE, TTI, AC, ORE, - /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, /*Count*/ None, + /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced, + /*Count*/ None, /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling); Changed |= Result != LoopUnrollResult::Unmodified; diff --git a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 06e86081e8a0c31cc286a1aefb66f776d9bdb6d5..c0c59d24dffc708b61d582e4ef951a24f91ce322 100644 --- a/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -594,6 +594,11 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipLoop(L)) return false; + + // Do not do the transformation if disabled by metadata. + if (hasLICMVersioningTransformation(L) & TM_Disable) + return false; + // Get Analysis information. AA = &getAnalysis().getAAResults(); SE = &getAnalysis().getSE(); diff --git a/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/lib/Transforms/Scalar/MakeGuardsExplicit.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ba3994eba0ed57aac65aa2a1d98357f9cf257d1 --- /dev/null +++ b/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -0,0 +1,120 @@ +//===- MakeGuardsExplicit.cpp - Turn guard intrinsics into guard branches -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers the @llvm.experimental.guard intrinsic to the new form of +// guard represented as widenable explicit branch to the deopt block. The +// difference between this pass and LowerGuardIntrinsic is that after this pass +// the guard represented as intrinsic: +// +// call void(i1, ...) @llvm.experimental.guard(i1 %old_cond) [ "deopt"() ] +// +// transforms to a guard represented as widenable explicit branch: +// +// %widenable_cond = call i1 @llvm.experimental.widenable.condition() +// br i1 (%old_cond & %widenable_cond), label %guarded, label %deopt +// +// Here: +// - The semantics of @llvm.experimental.widenable.condition allows to replace +// %widenable_cond with the construction (%widenable_cond & %any_other_cond) +// without loss of correctness; +// - %guarded is the lower part of old guard intrinsic's parent block split by +// the intrinsic call; +// - %deopt is a block containing a sole call to @llvm.experimental.deoptimize +// intrinsic. +// +// Therefore, this branch preserves the property of widenability. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/MakeGuardsExplicit.h" +#include "llvm/Analysis/GuardUtils.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/GuardUtils.h" + +using namespace llvm; + +namespace { +struct MakeGuardsExplicitLegacyPass : public FunctionPass { + static char ID; + MakeGuardsExplicitLegacyPass() : FunctionPass(ID) { + initializeMakeGuardsExplicitLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; +}; +} + +static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) { + // Replace the guard with an explicit branch (just like in GuardWidening). + BasicBlock *BB = Guard->getParent(); + makeGuardControlFlowExplicit(DeoptIntrinsic, Guard); + BranchInst *ExplicitGuard = cast(BB->getTerminator()); + assert(ExplicitGuard->isConditional() && "Must be!"); + + // We want the guard to be expressed as explicit control flow, but still be + // widenable. For that, we add Widenable Condition intrinsic call to the + // guard's condition. + IRBuilder<> B(ExplicitGuard); + auto *WidenableCondition = + B.CreateIntrinsic(Intrinsic::experimental_widenable_condition, + {}, {}, nullptr, "widenable_cond"); + WidenableCondition->setCallingConv(Guard->getCallingConv()); + auto *NewCond = + B.CreateAnd(ExplicitGuard->getCondition(), WidenableCondition); + NewCond->setName("exiplicit_guard_cond"); + ExplicitGuard->setCondition(NewCond); + Guard->eraseFromParent(); +} + +static bool explicifyGuards(Function &F) { + // Check if we can cheaply rule out the possibility of not having any work to + // do. + auto *GuardDecl = F.getParent()->getFunction( + Intrinsic::getName(Intrinsic::experimental_guard)); + if (!GuardDecl || GuardDecl->use_empty()) + return false; + + SmallVector GuardIntrinsics; + for (auto &I : instructions(F)) + if (isGuard(&I)) + GuardIntrinsics.push_back(cast(&I)); + + if (GuardIntrinsics.empty()) + return false; + + auto *DeoptIntrinsic = Intrinsic::getDeclaration( + F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()}); + DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv()); + + for (auto *Guard : GuardIntrinsics) + turnToExplicitForm(Guard, DeoptIntrinsic); + + return true; +} + +bool MakeGuardsExplicitLegacyPass::runOnFunction(Function &F) { + return explicifyGuards(F); +} + +char MakeGuardsExplicitLegacyPass::ID = 0; +INITIALIZE_PASS(MakeGuardsExplicitLegacyPass, "make-guards-explicit", + "Lower the guard intrinsic to explicit control flow form", + false, false) + +PreservedAnalyses MakeGuardsExplicitPass::run(Function &F, + FunctionAnalysisManager &) { + if (explicifyGuards(F)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 4e82e2bd42c008ffefe5e5527bca37c06ee68314..8756a1afcddfb52f2ffec90e56f3df0ebeb37e40 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1144,6 +1144,21 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, return true; } +/// Determine whether the instruction has undefined content for the given Size, +/// either because it was freshly alloca'd or started its lifetime. +static bool hasUndefContents(Instruction *I, ConstantInt *Size) { + if (isa(I)) + return true; + + if (IntrinsicInst *II = dyn_cast(I)) + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + if (ConstantInt *LTSize = dyn_cast(II->getArgOperand(0))) + if (LTSize->getZExtValue() >= Size->getZExtValue()) + return true; + + return false; +} + /// Transform memcpy to memset when its source was just memset. /// In other words, turn: /// \code @@ -1167,12 +1182,27 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, if (!AA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource())) return false; - ConstantInt *CopySize = cast(MemCpy->getLength()); + // A known memset size is required. ConstantInt *MemSetSize = dyn_cast(MemSet->getLength()); + if (!MemSetSize) + return false; + // Make sure the memcpy doesn't read any more than what the memset wrote. // Don't worry about sizes larger than i64. - if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue()) - return false; + ConstantInt *CopySize = cast(MemCpy->getLength()); + if (CopySize->getZExtValue() > MemSetSize->getZExtValue()) { + // If the memcpy is larger than the memset, but the memory was undef prior + // to the memset, we can just ignore the tail. Technically we're only + // interested in the bytes from MemSetSize..CopySize here, but as we can't + // easily represent this location, we use the full 0..CopySize range. + MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy); + MemDepResult DepInfo = MD->getPointerDependencyFrom( + MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent()); + if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize)) + CopySize = MemSetSize; + else + return false; + } IRBuilder<> Builder(MemCpy); Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), @@ -1252,19 +1282,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) { if (MemCpyInst *MDep = dyn_cast(SrcDepInfo.getInst())) return processMemCpyMemCpyDependence(M, MDep); } else if (SrcDepInfo.isDef()) { - Instruction *I = SrcDepInfo.getInst(); - bool hasUndefContents = false; - - if (isa(I)) { - hasUndefContents = true; - } else if (IntrinsicInst *II = dyn_cast(I)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start) - if (ConstantInt *LTSize = dyn_cast(II->getArgOperand(0))) - if (LTSize->getZExtValue() >= CopySize->getZExtValue()) - hasUndefContents = true; - } - - if (hasUndefContents) { + if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) { MD->removeInstruction(M); M->eraseFromParent(); ++NumMemCpyInstr; diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 9803bcb485d26994f17c82a20a8809bdd2f9b818..7cbb0fe70f820762001e702f0b8ab81418e3af37 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -3175,8 +3175,7 @@ bool NewGVN::singleReachablePHIPath( auto FilteredPhiArgs = make_filter_range(MP->operands(), ReachableOperandPred); SmallVector OperandList; - std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(), - std::back_inserter(OperandList)); + llvm::copy(FilteredPhiArgs, std::back_inserter(OperandList)); bool Okay = is_splat(OperandList); if (Okay) return singleReachablePHIPath(Visited, cast(OperandList[0]), @@ -4094,10 +4093,13 @@ bool NewGVN::eliminateInstructions(Function &F) { // It's about to be alive again. if (LeaderUseCount == 0 && isa(DominatingLeader)) ProbablyDead.erase(cast(DominatingLeader)); - // Copy instructions, however, are still dead because we use their - // operand as the leader. - if (LeaderUseCount == 0 && isSSACopy) - ProbablyDead.insert(II); + // For copy instructions, we use their operand as a leader, + // which means we remove a user of the copy and it may become dead. + if (isSSACopy) { + unsigned &IIUseCount = UseCounts[II]; + if (--IIUseCount == 0) + ProbablyDead.insert(II); + } ++LeaderUseCount; AnythingReplaced = true; } diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 61b6d7ca259300f1f5435fe30752c878b2590d01..cb893eab1654f6968cd3812758b26628f0495789 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -789,13 +789,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, // Discard any debug info related to the expressions that has changed (we // can leave debug infor related to the root, since the result of the // expression tree should be the same even after reassociation). - SmallVector DbgUsers; - findDbgUsers(DbgUsers, ExpressionChanged); - for (auto *DII : DbgUsers) { - Value *Undef = UndefValue::get(ExpressionChanged->getType()); - DII->setOperand(0, MetadataAsValue::get(DII->getContext(), - ValueAsMetadata::get(Undef))); - } + replaceDbgUsesWithUndef(ExpressionChanged); ExpressionChanged->moveBefore(I); ExpressionChanged = cast(*ExpressionChanged->user_begin()); diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 1f98128f923a6301593d5c69b97af4d8b9d4d874..2f6ed05c023b1e6bb63835fb59682ebc334b1786 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -247,19 +247,25 @@ class SCCPSolver : public InstVisitor { using Edge = std::pair; DenseSet KnownFeasibleEdges; - DenseMap> PredInfos; + DenseMap AnalysisResults; DenseMap> AdditionalUsers; public: - void addPredInfo(Function &F, std::unique_ptr PI) { - PredInfos[&F] = std::move(PI); + void addAnalysis(Function &F, AnalysisResultsForFn A) { + AnalysisResults.insert({&F, std::move(A)}); } const PredicateBase *getPredicateInfoFor(Instruction *I) { - auto PI = PredInfos.find(I->getFunction()); - if (PI == PredInfos.end()) + auto A = AnalysisResults.find(I->getParent()->getParent()); + if (A == AnalysisResults.end()) return nullptr; - return PI->second->getPredicateInfoFor(I); + return A->second.PredInfo->getPredicateInfoFor(I); + } + + DomTreeUpdater getDTU(Function &F) { + auto A = AnalysisResults.find(&F); + assert(A != AnalysisResults.end() && "Need analysis results for function."); + return {A->second.DT, A->second.PDT, DomTreeUpdater::UpdateStrategy::Lazy}; } SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli) @@ -1165,26 +1171,26 @@ void SCCPSolver::visitCallSite(CallSite CS) { if (!PI) return; - auto *PBranch = dyn_cast(getPredicateInfoFor(I)); + Value *CopyOf = I->getOperand(0); + auto *PBranch = dyn_cast(PI); if (!PBranch) { - mergeInValue(ValueState[I], I, getValueState(PI->OriginalOp)); + mergeInValue(ValueState[I], I, getValueState(CopyOf)); return; } - Value *CopyOf = I->getOperand(0); Value *Cond = PBranch->Condition; // Everything below relies on the condition being a comparison. auto *Cmp = dyn_cast(Cond); if (!Cmp) { - mergeInValue(ValueState[I], I, getValueState(PI->OriginalOp)); + mergeInValue(ValueState[I], I, getValueState(CopyOf)); return; } Value *CmpOp0 = Cmp->getOperand(0); Value *CmpOp1 = Cmp->getOperand(1); if (CopyOf != CmpOp0 && CopyOf != CmpOp1) { - mergeInValue(ValueState[I], I, getValueState(PI->OriginalOp)); + mergeInValue(ValueState[I], I, getValueState(CopyOf)); return; } @@ -1211,7 +1217,7 @@ void SCCPSolver::visitCallSite(CallSite CS) { return; } - return (void)mergeInValue(IV, I, getValueState(PBranch->OriginalOp)); + return (void)mergeInValue(IV, I, getValueState(CopyOf)); } } @@ -1927,10 +1933,9 @@ static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) { } } - bool llvm::runIPSCCP( Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI, - function_ref(Function &)> getPredicateInfo) { + function_ref getAnalysis) { SCCPSolver Solver(DL, TLI); // Loop over all functions, marking arguments to those with their addresses @@ -1939,7 +1944,8 @@ bool llvm::runIPSCCP( if (F.isDeclaration()) continue; - Solver.addPredInfo(F, getPredicateInfo(F)); + Solver.addAnalysis(F, getAnalysis(F)); + // Determine if we can track the function's return values. If so, add the // function to the solver's set of return-tracked functions. if (canTrackReturnsInterprocedurally(&F)) @@ -1988,12 +1994,13 @@ bool llvm::runIPSCCP( // Iterate over all of the instructions in the module, replacing them with // constants if we have found them to be of constant values. - SmallVector BlocksToErase; for (Function &F : M) { if (F.isDeclaration()) continue; + SmallVector BlocksToErase; + if (Solver.isBlockExecutable(&F.front())) for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; ++AI) { @@ -2029,23 +2036,26 @@ bool llvm::runIPSCCP( } } - // Change dead blocks to unreachable. We do it after replacing constants in - // all executable blocks, because changeToUnreachable may remove PHI nodes - // in executable blocks we found values for. The function's entry block is - // not part of BlocksToErase, so we have to handle it separately. - for (BasicBlock *BB : BlocksToErase) + DomTreeUpdater DTU = Solver.getDTU(F); + // Change dead blocks to unreachable. We do it after replacing constants + // in all executable blocks, because changeToUnreachable may remove PHI + // nodes in executable blocks we found values for. The function's entry + // block is not part of BlocksToErase, so we have to handle it separately. + for (BasicBlock *BB : BlocksToErase) { NumInstRemoved += - changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false); + changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false, + /*PreserveLCSSA=*/false, &DTU); + } if (!Solver.isBlockExecutable(&F.front())) NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(), - /*UseLLVMTrap=*/false); + /*UseLLVMTrap=*/false, + /*PreserveLCSSA=*/false, &DTU); - // Now that all instructions in the function are constant folded, erase dead - // blocks, because we can now use ConstantFoldTerminator to get rid of - // in-edges. - for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) { + // Now that all instructions in the function are constant folded, + // use ConstantFoldTerminator to get rid of in-edges, record DT updates and + // delete dead BBs. + for (BasicBlock *DeadBB : BlocksToErase) { // If there are any PHI nodes in this successor, drop entries for BB now. - BasicBlock *DeadBB = BlocksToErase[i]; for (Value::user_iterator UI = DeadBB->user_begin(), UE = DeadBB->user_end(); UI != UE;) { @@ -2060,16 +2070,16 @@ bool llvm::runIPSCCP( // If we have forced an edge for an indeterminate value, then force the // terminator to fold to that edge. forceIndeterminateEdge(I, Solver); - bool Folded = ConstantFoldTerminator(I->getParent()); + bool Folded = ConstantFoldTerminator(I->getParent(), + /*DeleteDeadConditions=*/false, + /*TLI=*/nullptr, &DTU); assert(Folded && "Expect TermInst on constantint or blockaddress to be folded"); (void) Folded; } - - // Finally, delete the basic block. - F.getBasicBlockList().erase(DeadBB); + // Mark dead BB for deletion. + DTU.deleteBB(DeadBB); } - BlocksToErase.clear(); for (BasicBlock &BB : F) { for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) { diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index a8b9ee566395a3b6cabc74ba05eaa2afef4795ad..81eea0dee4e4425fb192aae64e92d86bd5efa0ec 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -3164,7 +3164,12 @@ class AggLoadStoreRewriter : public InstVisitor { /// value (as opposed to the user). Use *U; + /// Used to calculate offsets, and hence alignment, of subobjects. + const DataLayout &DL; + public: + AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} + /// Rewrite loads and stores through a pointer and all pointers derived from /// it. bool rewrite(Instruction &I) { @@ -3208,10 +3213,22 @@ private: /// split operations. Value *Ptr; + /// The base pointee type being GEPed into. + Type *BaseTy; + + /// Known alignment of the base pointer. + unsigned BaseAlign; + + /// To calculate offset of each component so we can correctly deduce + /// alignments. + const DataLayout &DL; + /// Initialize the splitter with an insertion point, Ptr and start with a /// single zero GEP index. - OpSplitter(Instruction *InsertionPoint, Value *Ptr) - : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} + OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, + unsigned BaseAlign, const DataLayout &DL) + : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), + BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {} public: /// Generic recursive split emission routine. @@ -3228,8 +3245,11 @@ private: /// \param Agg The aggregate value being built up or stored, depending on /// whether this is splitting a load or a store respectively. void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { - if (Ty->isSingleValueType()) - return static_cast(this)->emitFunc(Ty, Agg, Name); + if (Ty->isSingleValueType()) { + unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices); + return static_cast(this)->emitFunc( + Ty, Agg, MinAlign(BaseAlign, Offset), Name); + } if (ArrayType *ATy = dyn_cast(Ty)) { unsigned OldSize = Indices.size(); @@ -3268,17 +3288,19 @@ private: struct LoadOpSplitter : public OpSplitter { AAMDNodes AATags; - LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags) - : OpSplitter(InsertionPoint, Ptr), AATags(AATags) {} + LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, + AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL) + : OpSplitter(InsertionPoint, Ptr, BaseTy, BaseAlign, + DL), AATags(AATags) {} /// Emit a leaf load of a single value. This is called at the leaves of the /// recursive emission to actually load values. - void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { + void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) { assert(Ty->isSingleValueType()); // Load the single value and insert it using the indices. Value *GEP = IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"); - LoadInst *Load = IRB.CreateLoad(GEP, Name + ".load"); + LoadInst *Load = IRB.CreateAlignedLoad(GEP, Align, Name + ".load"); if (AATags) Load->setAAMetadata(AATags); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); @@ -3295,7 +3317,8 @@ private: LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); AAMDNodes AATags; LI.getAAMetadata(AATags); - LoadOpSplitter Splitter(&LI, *U, AATags); + LoadOpSplitter Splitter(&LI, *U, LI.getType(), AATags, + getAdjustedAlignment(&LI, 0, DL), DL); Value *V = UndefValue::get(LI.getType()); Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); LI.replaceAllUsesWith(V); @@ -3304,13 +3327,15 @@ private: } struct StoreOpSplitter : public OpSplitter { - StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags) - : OpSplitter(InsertionPoint, Ptr), AATags(AATags) {} + StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, + AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL) + : OpSplitter(InsertionPoint, Ptr, BaseTy, BaseAlign, + DL), + AATags(AATags) {} AAMDNodes AATags; - /// Emit a leaf store of a single value. This is called at the leaves of the /// recursive emission to actually produce stores. - void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { + void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) { assert(Ty->isSingleValueType()); // Extract the single value and store it using the indices. // @@ -3320,7 +3345,8 @@ private: IRB.CreateExtractValue(Agg, Indices, Name + ".extract"); Value *InBoundsGEP = IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"); - StoreInst *Store = IRB.CreateStore(ExtractValue, InBoundsGEP); + StoreInst *Store = + IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Align); if (AATags) Store->setAAMetadata(AATags); LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); @@ -3338,7 +3364,8 @@ private: LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); AAMDNodes AATags; SI.getAAMetadata(AATags); - StoreOpSplitter Splitter(&SI, *U, AATags); + StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags, + getAdjustedAlignment(&SI, 0, DL), DL); Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); SI.eraseFromParent(); return true; @@ -4356,7 +4383,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) { // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter; + AggLoadStoreRewriter AggRewriter(DL); Changed |= AggRewriter.rewrite(AI); // Build the slices using a recursive instruction-visiting builder. diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 1b140acbaeee1fa6d8c24a813d37cef9d9d129e4..976daf4c78c2fd2c3ab86e755c1e2055aadfd9de 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/Scalarizer.h" #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" @@ -43,7 +44,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeDCELegacyPassPass(Registry); initializeDeadInstEliminationPass(Registry); initializeDivRemPairsLegacyPassPass(Registry); - initializeScalarizerPass(Registry); + initializeScalarizerLegacyPassPass(Registry); initializeDSELegacyPassPass(Registry); initializeGuardWideningLegacyPassPass(Registry); initializeLoopGuardWideningLegacyPassPass(Registry); @@ -51,6 +52,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeNewGVNLegacyPassPass(Registry); initializeEarlyCSELegacyPassPass(Registry); initializeEarlyCSEMemSSALegacyPassPass(Registry); + initializeMakeGuardsExplicitLegacyPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); initializeGVNSinkLegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); @@ -73,6 +75,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopUnrollPass(Registry); initializeLoopUnrollAndJamPass(Registry); initializeLoopUnswitchPass(Registry); + initializeWarnMissedTransformationsLegacyPass(Registry); initializeLoopVersioningLICMPass(Registry); initializeLoopIdiomRecognizeLegacyPassPass(Registry); initializeLowerAtomicLegacyPassPass(Registry); diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp index 2f873abca66a95fb3ac9975d6bd1abcbd5006390..3a6f8e6b09595833481e78ff883b3355a0ad3887 100644 --- a/lib/Transforms/Scalar/Scalarizer.cpp +++ b/lib/Transforms/Scalar/Scalarizer.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/Options.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/Scalarizer.h" #include #include #include @@ -49,6 +50,13 @@ using namespace llvm; #define DEBUG_TYPE "scalarizer" +// This is disabled by default because having separate loads and stores +// makes it more likely that the -combiner-alias-analysis limits will be +// reached. +static cl::opt + ScalarizeLoadStore("scalarize-load-store", cl::init(false), cl::Hidden, + cl::desc("Allow the scalarizer pass to scalarize loads and store")); + namespace { // Used to store the scattered form of a vector. @@ -152,17 +160,13 @@ struct VectorLayout { uint64_t ElemSize = 0; }; -class Scalarizer : public FunctionPass, - public InstVisitor { +class ScalarizerVisitor : public InstVisitor { public: - static char ID; - - Scalarizer() : FunctionPass(ID) { - initializeScalarizerPass(*PassRegistry::getPassRegistry()); + ScalarizerVisitor(unsigned ParallelLoopAccessMDKind) + : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; + bool visit(Function &F); // InstVisitor methods. They return true if the instruction was scalarized, // false if nothing changed. @@ -180,16 +184,6 @@ public: bool visitStoreInst(StoreInst &SI); bool visitCallInst(CallInst &ICI); - static void registerOptions() { - // This is disabled by default because having separate loads and stores - // makes it more likely that the -combiner-alias-analysis limits will be - // reached. - OptionRegistry::registerOption( - "scalarize-load-store", - "Allow the scalarizer pass to scalarize loads and store", false); - } - private: Scatterer scatter(Instruction *Point, Value *V); void gather(Instruction *Op, const ValueVector &CV); @@ -205,16 +199,28 @@ private: ScatterMap Scattered; GatherList Gathered; + unsigned ParallelLoopAccessMDKind; - bool ScalarizeLoadStore; }; -} // end anonymous namespace +class ScalarizerLegacyPass : public FunctionPass { +public: + static char ID; + + ScalarizerLegacyPass() : FunctionPass(ID) { + initializeScalarizerLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; +}; -char Scalarizer::ID = 0; +} // end anonymous namespace -INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer", - "Scalarize vector operations", false, false) +char ScalarizerLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer", + "Scalarize vector operations", false, false) +INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer", + "Scalarize vector operations", false, false) Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, ValueVector *cachePtr) @@ -278,17 +284,22 @@ Value *Scatterer::operator[](unsigned I) { return CV[I]; } -bool Scalarizer::doInitialization(Module &M) { - ParallelLoopAccessMDKind = +bool ScalarizerLegacyPass::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + Module &M = *F.getParent(); + unsigned ParallelLoopAccessMDKind = M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); - ScalarizeLoadStore = - M.getContext().getOption(); - return false; + ScalarizerVisitor Impl(ParallelLoopAccessMDKind); + return Impl.visit(F); } -bool Scalarizer::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; +FunctionPass *llvm::createScalarizerPass() { + return new ScalarizerLegacyPass(); +} + +bool ScalarizerVisitor::visit(Function &F) { assert(Gathered.empty() && Scattered.empty()); // To ensure we replace gathered components correctly we need to do an ordered @@ -297,7 +308,7 @@ bool Scalarizer::runOnFunction(Function &F) { for (BasicBlock *BB : RPOT) { for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { Instruction *I = &*II; - bool Done = visit(I); + bool Done = InstVisitor::visit(I); ++II; if (Done && I->getType()->isVoidTy()) I->eraseFromParent(); @@ -308,7 +319,7 @@ bool Scalarizer::runOnFunction(Function &F) { // Return a scattered form of V that can be accessed by Point. V must be a // vector or a pointer to a vector. -Scatterer Scalarizer::scatter(Instruction *Point, Value *V) { +Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) { if (Argument *VArg = dyn_cast(V)) { // Put the scattered form of arguments in the entry block, // so that it can be used everywhere. @@ -332,7 +343,7 @@ Scatterer Scalarizer::scatter(Instruction *Point, Value *V) { // deletion of Op and creation of the gathered form to the end of the pass, // so that we can avoid creating the gathered form if all uses of Op are // replaced with uses of CV. -void Scalarizer::gather(Instruction *Op, const ValueVector &CV) { +void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) { // Since we're not deleting Op yet, stub out its operands, so that it // doesn't make anything live unnecessarily. for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) @@ -361,7 +372,7 @@ void Scalarizer::gather(Instruction *Op, const ValueVector &CV) { // Return true if it is safe to transfer the given metadata tag from // vector to scalar instructions. -bool Scalarizer::canTransferMetadata(unsigned Tag) { +bool ScalarizerVisitor::canTransferMetadata(unsigned Tag) { return (Tag == LLVMContext::MD_tbaa || Tag == LLVMContext::MD_fpmath || Tag == LLVMContext::MD_tbaa_struct @@ -373,7 +384,7 @@ bool Scalarizer::canTransferMetadata(unsigned Tag) { // Transfer metadata from Op to the instructions in CV if it is known // to be safe to do so. -void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) { +void ScalarizerVisitor::transferMetadata(Instruction *Op, const ValueVector &CV) { SmallVector, 4> MDs; Op->getAllMetadataOtherThanDebugLoc(MDs); for (unsigned I = 0, E = CV.size(); I != E; ++I) { @@ -389,7 +400,7 @@ void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) { // Try to fill in Layout from Ty, returning true on success. Alignment is // the alignment of the vector, or 0 if the ABI default should be used. -bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment, +bool ScalarizerVisitor::getVectorLayout(Type *Ty, unsigned Alignment, VectorLayout &Layout, const DataLayout &DL) { // Make sure we're dealing with a vector. Layout.VecTy = dyn_cast(Ty); @@ -413,7 +424,7 @@ bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment, // Scalarize two-operand instruction I, using Split(Builder, X, Y, Name) // to create an instruction like I with operands X and Y and name Name. template -bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) { +bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) { VectorType *VT = dyn_cast(I.getType()); if (!VT) return false; @@ -446,7 +457,7 @@ static Function *getScalarIntrinsicDeclaration(Module *M, /// If a call to a vector typed intrinsic function, split into a scalar call per /// element if possible for the intrinsic. -bool Scalarizer::splitCall(CallInst &CI) { +bool ScalarizerVisitor::splitCall(CallInst &CI) { VectorType *VT = dyn_cast(CI.getType()); if (!VT) return false; @@ -504,7 +515,7 @@ bool Scalarizer::splitCall(CallInst &CI) { return true; } -bool Scalarizer::visitSelectInst(SelectInst &SI) { +bool ScalarizerVisitor::visitSelectInst(SelectInst &SI) { VectorType *VT = dyn_cast(SI.getType()); if (!VT) return false; @@ -534,19 +545,19 @@ bool Scalarizer::visitSelectInst(SelectInst &SI) { return true; } -bool Scalarizer::visitICmpInst(ICmpInst &ICI) { +bool ScalarizerVisitor::visitICmpInst(ICmpInst &ICI) { return splitBinary(ICI, ICmpSplitter(ICI)); } -bool Scalarizer::visitFCmpInst(FCmpInst &FCI) { +bool ScalarizerVisitor::visitFCmpInst(FCmpInst &FCI) { return splitBinary(FCI, FCmpSplitter(FCI)); } -bool Scalarizer::visitBinaryOperator(BinaryOperator &BO) { +bool ScalarizerVisitor::visitBinaryOperator(BinaryOperator &BO) { return splitBinary(BO, BinarySplitter(BO)); } -bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { +bool ScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { VectorType *VT = dyn_cast(GEPI.getType()); if (!VT) return false; @@ -592,7 +603,7 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { return true; } -bool Scalarizer::visitCastInst(CastInst &CI) { +bool ScalarizerVisitor::visitCastInst(CastInst &CI) { VectorType *VT = dyn_cast(CI.getDestTy()); if (!VT) return false; @@ -610,7 +621,7 @@ bool Scalarizer::visitCastInst(CastInst &CI) { return true; } -bool Scalarizer::visitBitCastInst(BitCastInst &BCI) { +bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) { VectorType *DstVT = dyn_cast(BCI.getDestTy()); VectorType *SrcVT = dyn_cast(BCI.getSrcTy()); if (!DstVT || !SrcVT) @@ -665,7 +676,7 @@ bool Scalarizer::visitBitCastInst(BitCastInst &BCI) { return true; } -bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) { +bool ScalarizerVisitor::visitShuffleVectorInst(ShuffleVectorInst &SVI) { VectorType *VT = dyn_cast(SVI.getType()); if (!VT) return false; @@ -689,7 +700,7 @@ bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return true; } -bool Scalarizer::visitPHINode(PHINode &PHI) { +bool ScalarizerVisitor::visitPHINode(PHINode &PHI) { VectorType *VT = dyn_cast(PHI.getType()); if (!VT) return false; @@ -714,7 +725,7 @@ bool Scalarizer::visitPHINode(PHINode &PHI) { return true; } -bool Scalarizer::visitLoadInst(LoadInst &LI) { +bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) { if (!ScalarizeLoadStore) return false; if (!LI.isSimple()) @@ -738,7 +749,7 @@ bool Scalarizer::visitLoadInst(LoadInst &LI) { return true; } -bool Scalarizer::visitStoreInst(StoreInst &SI) { +bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) { if (!ScalarizeLoadStore) return false; if (!SI.isSimple()) @@ -765,13 +776,13 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) { return true; } -bool Scalarizer::visitCallInst(CallInst &CI) { +bool ScalarizerVisitor::visitCallInst(CallInst &CI) { return splitCall(CI); } // Delete the instructions that we scalarized. If a full vector result // is still needed, recreate it using InsertElements. -bool Scalarizer::finish() { +bool ScalarizerVisitor::finish() { // The presence of data in Gathered or Scattered indicates changes // made to the Function. if (Gathered.empty() && Scattered.empty()) @@ -802,6 +813,11 @@ bool Scalarizer::finish() { return true; } -FunctionPass *llvm::createScalarizerPass() { - return new Scalarizer(); +PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) { + Module &M = *F.getParent(); + unsigned ParallelLoopAccessMDKind = + M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); + ScalarizerVisitor Impl(ParallelLoopAccessMDKind); + bool Changed = Impl.visit(F); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 368f0925abac0011e658acf2c11ecff2265f4b4c..abec9183f0fe13ae92c6e016dade5d2968966f10 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -25,6 +25,8 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -62,6 +64,9 @@ STATISTIC(NumBranches, "Number of branches unswitched"); STATISTIC(NumSwitches, "Number of switches unswitched"); STATISTIC(NumGuards, "Number of guards turned into branches for unswitching"); STATISTIC(NumTrivial, "Number of unswitches that are trivial"); +STATISTIC( + NumCostMultiplierSkipped, + "Number of unswitch candidates that had their cost multiplier skipped"); static cl::opt EnableNonTrivialUnswitch( "enable-nontrivial-unswitch", cl::init(false), cl::Hidden, @@ -72,6 +77,17 @@ static cl::opt UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden, cl::desc("The cost threshold for unswitching a loop.")); +static cl::opt EnableUnswitchCostMultiplier( + "enable-unswitch-cost-multiplier", cl::init(true), cl::Hidden, + cl::desc("Enable unswitch cost multiplier that prohibits exponential " + "explosion in nontrivial unswitch.")); +static cl::opt UnswitchSiblingsToplevelDiv( + "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden, + cl::desc("Toplevel siblings divisor for cost multiplier.")); +static cl::opt UnswitchNumInitialUnscaledCandidates( + "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden, + cl::desc("Number of unswitch candidates that are ignored when calculating " + "cost multiplier.")); static cl::opt UnswitchGuards( "simple-loop-unswitch-guards", cl::init(true), cl::Hidden, cl::desc("If enabled, simple loop unswitching will also consider " @@ -335,7 +351,8 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, /// If `SE` is not null, it will be updated based on the potential loop SCEVs /// invalidated by this. static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, - LoopInfo &LI, ScalarEvolution *SE) { + LoopInfo &LI, ScalarEvolution *SE, + MemorySSAUpdater *MSSAU) { assert(BI.isConditional() && "Can only unswitch a conditional branch!"); LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n"); @@ -409,11 +426,14 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, SE->forgetTopmostLoop(&L); } + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + // Split the preheader, so that we know that there is a safe place to insert // the conditional branch. We will change the preheader to have a conditional // branch on LoopCond. BasicBlock *OldPH = L.getLoopPreheader(); - BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI); + BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU); // Now that we have a place to insert the conditional branch, create a place // to branch to: this is the exit block out of the loop that we are @@ -425,9 +445,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, "A branch's parent isn't a predecessor!"); UnswitchedBB = LoopExitBB; } else { - UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI); + UnswitchedBB = + SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI, MSSAU); } + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + // Actually move the invariant uses into the unswitched position. If possible, // we do this by moving the instructions, but when doing partial unswitching // we do it by building a new merge of the values in the unswitched position. @@ -438,12 +462,17 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // its successors. OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(), BI); + if (MSSAU) { + // Temporarily clone the terminator, to make MSSA update cheaper by + // separating "insert edge" updates from "remove edge" ones. + ParentBB->getInstList().push_back(BI.clone()); + } else { + // Create a new unconditional branch that will continue the loop as a new + // terminator. + BranchInst::Create(ContinueBB, ParentBB); + } BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB); BI.setSuccessor(1 - LoopExitSuccIdx, NewPH); - - // Create a new unconditional branch that will continue the loop as a new - // terminator. - BranchInst::Create(ContinueBB, ParentBB); } else { // Only unswitching a subset of inputs to the condition, so we will need to // build a new branch that merges the invariant inputs. @@ -459,6 +488,32 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, *UnswitchedBB, *NewPH); } + // Update the dominator tree with the added edge. + DT.insertEdge(OldPH, UnswitchedBB); + + // After the dominator tree was updated with the added edge, update MemorySSA + // if available. + if (MSSAU) { + SmallVector Updates; + Updates.push_back({cfg::UpdateKind::Insert, OldPH, UnswitchedBB}); + MSSAU->applyInsertUpdates(Updates, DT); + } + + // Finish updating dominator tree and memory ssa for full unswitch. + if (FullUnswitch) { + if (MSSAU) { + // Remove the cloned branch instruction. + ParentBB->getTerminator()->eraseFromParent(); + // Create unconditional branch now. + BranchInst::Create(ContinueBB, ParentBB); + MSSAU->removeEdge(ParentBB, LoopExitBB); + } + DT.deleteEdge(ParentBB, LoopExitBB); + } + + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + // Rewrite the relevant PHI nodes. if (UnswitchedBB == LoopExitBB) rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH); @@ -466,13 +521,6 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB, *ParentBB, *OldPH, FullUnswitch); - // Now we need to update the dominator tree. - SmallVector DTUpdates; - DTUpdates.push_back({DT.Insert, OldPH, UnswitchedBB}); - if (FullUnswitch) - DTUpdates.push_back({DT.Delete, ParentBB, LoopExitBB}); - DT.applyUpdates(DTUpdates); - // The constant we can replace all of our invariants with inside the loop // body. If any of the invariants have a value other than this the loop won't // be entered. @@ -523,7 +571,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, /// If `SE` is not null, it will be updated based on the potential loop SCEVs /// invalidated by this. static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, - LoopInfo &LI, ScalarEvolution *SE) { + LoopInfo &LI, ScalarEvolution *SE, + MemorySSAUpdater *MSSAU) { LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n"); Value *LoopCond = SI.getCondition(); @@ -550,6 +599,9 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, LLVM_DEBUG(dbgs() << " unswitching trivial switch...\n"); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + // We may need to invalidate SCEVs for the outermost loop reached by any of // the exits. Loop *OuterL = &L; @@ -612,7 +664,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, // Split the preheader, so that we know that there is a safe place to insert // the switch. BasicBlock *OldPH = L.getLoopPreheader(); - BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI); + BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU); OldPH->getTerminator()->eraseFromParent(); // Now add the unswitched switch. @@ -635,9 +687,10 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH); } else { auto *SplitBB = - SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); - rewritePHINodesForExitAndUnswitchedBlocks( - *DefaultExitBB, *SplitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true); + SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI, MSSAU); + rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB, + *ParentBB, *OldPH, + /*FullUnswitch*/ true); DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; } } @@ -661,9 +714,10 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB]; if (!SplitExitBB) { // If this is the first time we see this, do the split and remember it. - SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI); - rewritePHINodesForExitAndUnswitchedBlocks( - *ExitBB, *SplitExitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true); + SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU); + rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB, + *ParentBB, *OldPH, + /*FullUnswitch*/ true); } // Update the case pair to point to the split block. CasePair.second = SplitExitBB; @@ -740,6 +794,13 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, DTUpdates.push_back({DT.Insert, OldPH, UnswitchedBB}); } DT.applyUpdates(DTUpdates); + + if (MSSAU) { + MSSAU->applyUpdates(DTUpdates, DT); + if (VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + } + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); // We may have changed the nesting relationship for this loop so hoist it to @@ -765,7 +826,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, /// If `SE` is not null, it will be updated based on the potential loop SCEVs /// invalidated by this. static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, - LoopInfo &LI, ScalarEvolution *SE) { + LoopInfo &LI, ScalarEvolution *SE, + MemorySSAUpdater *MSSAU) { bool Changed = false; // If loop header has only one reachable successor we should keep looking for @@ -799,7 +861,7 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, if (isa(SI->getCondition())) return Changed; - if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE)) + if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE, MSSAU)) // Couldn't unswitch this one so we're done. return Changed; @@ -831,7 +893,7 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, // Found a trivial condition candidate: non-foldable conditional branch. If // we fail to unswitch this, we can't do anything else that is trivial. - if (!unswitchTrivialBranch(L, *BI, DT, LI, SE)) + if (!unswitchTrivialBranch(L, *BI, DT, LI, SE, MSSAU)) return Changed; // Mark that we managed to unswitch something. @@ -884,7 +946,7 @@ static BasicBlock *buildClonedLoopBlocks( const SmallDenseMap &DominatingSucc, ValueToValueMapTy &VMap, SmallVectorImpl &DTUpdates, AssumptionCache &AC, - DominatorTree &DT, LoopInfo &LI) { + DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) { SmallVector NewBlocks; NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size()); @@ -929,7 +991,7 @@ static BasicBlock *buildClonedLoopBlocks( // place to merge the CFG, so split the exit first. This is always safe to // do because there cannot be any non-loop predecessors of a loop exit in // loop simplified form. - auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI); + auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU); // Rearrange the names to make it easier to write test cases by having the // exit block carry the suffix rather than the merge block carrying the @@ -1360,7 +1422,7 @@ static void buildClonedLoops(Loop &OrigL, ArrayRef ExitBlocks, static void deleteDeadClonedBlocks(Loop &L, ArrayRef ExitBlocks, ArrayRef> VMaps, - DominatorTree &DT) { + DominatorTree &DT, MemorySSAUpdater *MSSAU) { // Find all the dead clones, and remove them from their successors. SmallVector DeadBlocks; for (BasicBlock *BB : llvm::concat(L.blocks(), ExitBlocks)) @@ -1372,6 +1434,13 @@ deleteDeadClonedBlocks(Loop &L, ArrayRef ExitBlocks, DeadBlocks.push_back(ClonedBB); } + // Remove all MemorySSA in the dead blocks + if (MSSAU) { + SmallPtrSet DeadBlockSet(DeadBlocks.begin(), + DeadBlocks.end()); + MSSAU->removeBlocks(DeadBlockSet); + } + // Drop any remaining references to break cycles. for (BasicBlock *BB : DeadBlocks) BB->dropAllReferences(); @@ -1380,10 +1449,10 @@ deleteDeadClonedBlocks(Loop &L, ArrayRef ExitBlocks, BB->eraseFromParent(); } -static void -deleteDeadBlocksFromLoop(Loop &L, - SmallVectorImpl &ExitBlocks, - DominatorTree &DT, LoopInfo &LI) { +static void deleteDeadBlocksFromLoop(Loop &L, + SmallVectorImpl &ExitBlocks, + DominatorTree &DT, LoopInfo &LI, + MemorySSAUpdater *MSSAU) { // Find all the dead blocks tied to this loop, and remove them from their // successors. SmallPtrSet DeadBlockSet; @@ -1404,6 +1473,10 @@ deleteDeadBlocksFromLoop(Loop &L, } } + // Remove all MemorySSA in the dead blocks + if (MSSAU) + MSSAU->removeBlocks(DeadBlockSet); + // Filter out the dead blocks from the exit blocks list so that it can be // used in the caller. llvm::erase_if(ExitBlocks, @@ -1803,7 +1876,7 @@ static void unswitchNontrivialInvariants( Loop &L, Instruction &TI, ArrayRef Invariants, SmallVectorImpl &ExitBlocks, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, function_ref)> UnswitchCB, - ScalarEvolution *SE) { + ScalarEvolution *SE, MemorySSAUpdater *MSSAU) { auto *ParentBB = TI.getParent(); BranchInst *BI = dyn_cast(&TI); SwitchInst *SI = BI ? nullptr : cast(&TI); @@ -1820,6 +1893,9 @@ static void unswitchNontrivialInvariants( assert(isa(BI->getCondition()) && "Partial unswitching requires an instruction as the condition!"); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + // Constant and BBs tracking the cloned and continuing successor. When we are // unswitching the entire condition, this can just be trivially chosen to // unswitch towards `true`. However, when we are unswitching a set of @@ -1860,6 +1936,10 @@ static void unswitchNontrivialInvariants( // Compute the parent loop now before we start hacking on things. Loop *ParentL = L.getParentLoop(); + // Get blocks in RPO order for MSSA update, before changing the CFG. + LoopBlocksRPO LBRPO(&L); + if (MSSAU) + LBRPO.perform(&LI); // Compute the outer-most loop containing one of our exit blocks. This is the // furthest up our loopnest which can be mutated, which we will use below to @@ -1909,7 +1989,7 @@ static void unswitchNontrivialInvariants( // between the unswitched versions, and we will have a new preheader for the // original loop. BasicBlock *SplitBB = L.getLoopPreheader(); - BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI); + BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI, MSSAU); // Keep track of the dominator tree updates needed. SmallVector DTUpdates; @@ -1922,7 +2002,7 @@ static void unswitchNontrivialInvariants( VMaps.emplace_back(new ValueToValueMapTy()); ClonedPHs[SuccBB] = buildClonedLoopBlocks( L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB, - DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI); + DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU); } // The stitching of the branched code back together depends on whether we're @@ -1930,7 +2010,63 @@ static void unswitchNontrivialInvariants( // nuke the initial terminator placed in the split block. SplitBB->getTerminator()->eraseFromParent(); if (FullUnswitch) { - // First we need to unhook the successor relationship as we'll be replacing + // Splice the terminator from the original loop and rewrite its + // successors. + SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI); + + // Keep a clone of the terminator for MSSA updates. + Instruction *NewTI = TI.clone(); + ParentBB->getInstList().push_back(NewTI); + + // First wire up the moved terminator to the preheaders. + if (BI) { + BasicBlock *ClonedPH = ClonedPHs.begin()->second; + BI->setSuccessor(ClonedSucc, ClonedPH); + BI->setSuccessor(1 - ClonedSucc, LoopPH); + DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); + } else { + assert(SI && "Must either be a branch or switch!"); + + // Walk the cases and directly update their successors. + assert(SI->getDefaultDest() == RetainedSuccBB && + "Not retaining default successor!"); + SI->setDefaultDest(LoopPH); + for (auto &Case : SI->cases()) + if (Case.getCaseSuccessor() == RetainedSuccBB) + Case.setSuccessor(LoopPH); + else + Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second); + + // We need to use the set to populate domtree updates as even when there + // are multiple cases pointing at the same successor we only want to + // remove and insert one edge in the domtree. + for (BasicBlock *SuccBB : UnswitchedSuccBBs) + DTUpdates.push_back( + {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second}); + } + + if (MSSAU) { + DT.applyUpdates(DTUpdates); + DTUpdates.clear(); + + // Remove all but one edge to the retained block and all unswitched + // blocks. This is to avoid having duplicate entries in the cloned Phis, + // when we know we only keep a single edge for each case. + MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, RetainedSuccBB); + for (BasicBlock *SuccBB : UnswitchedSuccBBs) + MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, SuccBB); + + for (auto &VMap : VMaps) + MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap, + /*IgnoreIncomingWithNoClones=*/true); + MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); + + // Remove all edges to unswitched blocks. + for (BasicBlock *SuccBB : UnswitchedSuccBBs) + MSSAU->removeEdge(ParentBB, SuccBB); + } + + // Now unhook the successor relationship as we'll be replacing // the terminator with a direct branch. This is much simpler for branches // than switches so we handle those first. if (BI) { @@ -1948,9 +2084,10 @@ static void unswitchNontrivialInvariants( // is a duplicate edge to the retained successor as the retained successor // is always the default successor and as we'll replace this with a direct // branch we no longer need the duplicate entries in the PHI nodes. - assert(SI->getDefaultDest() == RetainedSuccBB && + SwitchInst *NewSI = cast(NewTI); + assert(NewSI->getDefaultDest() == RetainedSuccBB && "Not retaining default successor!"); - for (auto &Case : SI->cases()) + for (auto &Case : NewSI->cases()) Case.getCaseSuccessor()->removePredecessor( ParentBB, /*DontDeleteUselessPHIs*/ true); @@ -1962,34 +2099,8 @@ static void unswitchNontrivialInvariants( DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB}); } - // Now that we've unhooked the successor relationship, splice the terminator - // from the original loop to the split. - SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI); - - // Now wire up the terminator to the preheaders. - if (BI) { - BasicBlock *ClonedPH = ClonedPHs.begin()->second; - BI->setSuccessor(ClonedSucc, ClonedPH); - BI->setSuccessor(1 - ClonedSucc, LoopPH); - DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); - } else { - assert(SI && "Must either be a branch or switch!"); - - // Walk the cases and directly update their successors. - SI->setDefaultDest(LoopPH); - for (auto &Case : SI->cases()) - if (Case.getCaseSuccessor() == RetainedSuccBB) - Case.setSuccessor(LoopPH); - else - Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second); - - // We need to use the set to populate domtree updates as even when there - // are multiple cases pointing at the same successor we only want to - // remove and insert one edge in the domtree. - for (BasicBlock *SuccBB : UnswitchedSuccBBs) - DTUpdates.push_back( - {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second}); - } + // After MSSAU update, remove the cloned terminator instruction NewTI. + ParentBB->getTerminator()->eraseFromParent(); // Create a new unconditional branch to the continuing block (as opposed to // the one cloned). @@ -2008,12 +2119,19 @@ static void unswitchNontrivialInvariants( // Apply the updates accumulated above to get an up-to-date dominator tree. DT.applyUpdates(DTUpdates); + if (!FullUnswitch && MSSAU) { + // Update MSSA for partial unswitch, after DT update. + SmallVector Updates; + Updates.push_back( + {cfg::UpdateKind::Insert, SplitBB, ClonedPHs.begin()->second}); + MSSAU->applyInsertUpdates(Updates, DT); + } // Now that we have an accurate dominator tree, first delete the dead cloned // blocks so that we can accurately build any cloned loops. It is important to // not delete the blocks from the original loop yet because we still want to // reference the original loop to understand the cloned loop's structure. - deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT); + deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT, MSSAU); // Build the cloned loop structure itself. This may be substantially // different from the original structure due to the simplified CFG. This also @@ -2025,10 +2143,17 @@ static void unswitchNontrivialInvariants( // Now that our cloned loops have been built, we can update the original loop. // First we delete the dead blocks from it and then we rebuild the loop // structure taking these deletions into account. - deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI); + deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU); + + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + SmallVector HoistedLoops; bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + // This transformation has a high risk of corrupting the dominator tree, and // the below steps to rebuild loop structures will result in hard to debug // errors in that case so verify that the dominator tree is sane first. @@ -2153,6 +2278,9 @@ static void unswitchNontrivialInvariants( SibLoops.push_back(UpdatedL); UnswitchCB(IsStillLoop, SibLoops); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + ++NumBranches; } @@ -2213,11 +2341,14 @@ computeDomSubtreeCost(DomTreeNode &N, static BranchInst * turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, SmallVectorImpl &ExitBlocks, - DominatorTree &DT, LoopInfo &LI) { + DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) { SmallVector DTUpdates; LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n"); BasicBlock *CheckBB = GI->getParent(); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + // Remove all CheckBB's successors from DomTree. A block can be seen among // successors more than once, but for DomTree it should be added only once. SmallPtrSet Successors; @@ -2235,10 +2366,14 @@ turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, BasicBlock *GuardedBlock = CheckBI->getSuccessor(0); GuardedBlock->setName("guarded"); CheckBI->getSuccessor(1)->setName("deopt"); + BasicBlock *DeoptBlock = CheckBI->getSuccessor(1); // We now have a new exit block. ExitBlocks.push_back(CheckBI->getSuccessor(1)); + if (MSSAU) + MSSAU->moveAllAfterSpliceBlocks(CheckBB, GuardedBlock, GI); + GI->moveBefore(DeoptBlockTerm); GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext())); @@ -2256,15 +2391,107 @@ turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, // Inform LI of a new loop block. L.addBasicBlockToLoop(GuardedBlock, LI); + if (MSSAU) { + MemoryDef *MD = cast(MSSAU->getMemorySSA()->getMemoryAccess(GI)); + MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::End); + if (VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + } + ++NumGuards; return CheckBI; } +/// Cost multiplier is a way to limit potentially exponential behavior +/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch +/// candidates available. Also accounting for the number of "sibling" loops with +/// the idea to account for previous unswitches that already happened on this +/// cluster of loops. There was an attempt to keep this formula simple, +/// just enough to limit the worst case behavior. Even if it is not that simple +/// now it is still not an attempt to provide a detailed heuristic size +/// prediction. +/// +/// TODO: Make a proper accounting of "explosion" effect for all kinds of +/// unswitch candidates, making adequate predictions instead of wild guesses. +/// That requires knowing not just the number of "remaining" candidates but +/// also costs of unswitching for each of these candidates. +static int calculateUnswitchCostMultiplier( + Instruction &TI, Loop &L, LoopInfo &LI, DominatorTree &DT, + ArrayRef>> + UnswitchCandidates) { + + // Guards and other exiting conditions do not contribute to exponential + // explosion as soon as they dominate the latch (otherwise there might be + // another path to the latch remaining that does not allow to eliminate the + // loop copy on unswitch). + BasicBlock *Latch = L.getLoopLatch(); + BasicBlock *CondBlock = TI.getParent(); + if (DT.dominates(CondBlock, Latch) && + (isGuard(&TI) || + llvm::count_if(successors(&TI), [&L](BasicBlock *SuccBB) { + return L.contains(SuccBB); + }) <= 1)) { + NumCostMultiplierSkipped++; + return 1; + } + + auto *ParentL = L.getParentLoop(); + int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size() + : std::distance(LI.begin(), LI.end())); + // Count amount of clones that all the candidates might cause during + // unswitching. Branch/guard counts as 1, switch counts as log2 of its cases. + int UnswitchedClones = 0; + for (auto Candidate : UnswitchCandidates) { + Instruction *CI = Candidate.first; + BasicBlock *CondBlock = CI->getParent(); + bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch); + if (isGuard(CI)) { + if (!SkipExitingSuccessors) + UnswitchedClones++; + continue; + } + int NonExitingSuccessors = llvm::count_if( + successors(CondBlock), [SkipExitingSuccessors, &L](BasicBlock *SuccBB) { + return !SkipExitingSuccessors || L.contains(SuccBB); + }); + UnswitchedClones += Log2_32(NonExitingSuccessors); + } + + // Ignore up to the "unscaled candidates" number of unswitch candidates + // when calculating the power-of-two scaling of the cost. The main idea + // with this control is to allow a small number of unswitches to happen + // and rely more on siblings multiplier (see below) when the number + // of candidates is small. + unsigned ClonesPower = + std::max(UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, 0); + + // Allowing top-level loops to spread a bit more than nested ones. + int SiblingsMultiplier = + std::max((ParentL ? SiblingsCount + : SiblingsCount / (int)UnswitchSiblingsToplevelDiv), + 1); + // Compute the cost multiplier in a way that won't overflow by saturating + // at an upper bound. + int CostMultiplier; + if (ClonesPower > Log2_32(UnswitchThreshold) || + SiblingsMultiplier > UnswitchThreshold) + CostMultiplier = UnswitchThreshold; + else + CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower), + (int)UnswitchThreshold); + + LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier + << " (siblings " << SiblingsMultiplier << " * clones " + << (1 << ClonesPower) << ")" + << " for unswitch candidate: " << TI << "\n"); + return CostMultiplier; +} + static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, TargetTransformInfo &TTI, function_ref)> UnswitchCB, - ScalarEvolution *SE) { + ScalarEvolution *SE, MemorySSAUpdater *MSSAU) { // Collect all invariant conditions within this loop (as opposed to an inner // loop which would be handled when visiting that inner loop). SmallVector>, 4> @@ -2473,8 +2700,23 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, int CandidateCost = ComputeUnswitchedCost( TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 && Invariants[0] == BI->getCondition())); - LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost - << " for unswitch candidate: " << TI << "\n"); + // Calculate cost multiplier which is a tool to limit potentially + // exponential behavior of loop-unswitch. + if (EnableUnswitchCostMultiplier) { + int CostMultiplier = + calculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates); + assert( + (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) && + "cost multiplier needs to be in the range of 1..UnswitchThreshold"); + CandidateCost *= CostMultiplier; + LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost + << " (multiplier: " << CostMultiplier << ")" + << " for unswitch candidate: " << TI << "\n"); + } else { + LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost + << " for unswitch candidate: " << TI << "\n"); + } + if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) { BestUnswitchTI = &TI; BestUnswitchCost = CandidateCost; @@ -2491,13 +2733,13 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, // If the best candidate is a guard, turn it into a branch. if (isGuard(BestUnswitchTI)) BestUnswitchTI = turnGuardIntoBranch(cast(BestUnswitchTI), L, - ExitBlocks, DT, LI); + ExitBlocks, DT, LI, MSSAU); LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = " << BestUnswitchCost << ") terminator: " << *BestUnswitchTI << "\n"); unswitchNontrivialInvariants(L, *BestUnswitchTI, BestUnswitchInvariants, - ExitBlocks, DT, LI, AC, UnswitchCB, SE); + ExitBlocks, DT, LI, AC, UnswitchCB, SE, MSSAU); return true; } @@ -2510,6 +2752,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, /// /// The `DT`, `LI`, `AC`, `TTI` parameters are required analyses that are also /// updated based on the unswitch. +/// The `MSSA` analysis is also updated if valid (i.e. its use is enabled). /// /// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is /// true, we will attempt to do non-trivial unswitching as well as trivial @@ -2525,7 +2768,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, TargetTransformInfo &TTI, bool NonTrivial, function_ref)> UnswitchCB, - ScalarEvolution *SE) { + ScalarEvolution *SE, MemorySSAUpdater *MSSAU) { assert(L.isRecursivelyLCSSAForm(DT, LI) && "Loops must be in LCSSA form before unswitching."); bool Changed = false; @@ -2535,7 +2778,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, return false; // Try trivial unswitch first before loop over other basic blocks in the loop. - if (unswitchAllTrivialConditions(L, DT, LI, SE)) { + if (unswitchAllTrivialConditions(L, DT, LI, SE, MSSAU)) { // If we unswitched successfully we will want to clean up the loop before // processing it further so just mark it as unswitched and return. UnswitchCB(/*CurrentLoopValid*/ true, {}); @@ -2556,7 +2799,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, // Try to unswitch the best invariant condition. We prefer this full unswitch to // a partial unswitch when possible below the threshold. - if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE)) + if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE, MSSAU)) return true; // No other opportunities to unswitch. @@ -2590,10 +2833,19 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, U.markLoopAsDeleted(L, LoopName); }; + Optional MSSAU; + if (AR.MSSA) { + MSSAU = MemorySSAUpdater(AR.MSSA); + if (VerifyMemorySSA) + AR.MSSA->verifyMemorySSA(); + } if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, UnswitchCB, - &AR.SE)) + &AR.SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr)) return PreservedAnalyses::all(); + if (AR.MSSA && VerifyMemorySSA) + AR.MSSA->verifyMemorySSA(); + // Historically this pass has had issues with the dominator tree so verify it // in asserts builds. assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); @@ -2619,6 +2871,10 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + if (EnableMSSALoopDependency) { + AU.addRequired(); + AU.addPreserved(); + } getLoopAnalysisUsage(AU); } }; @@ -2638,6 +2894,12 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { auto &LI = getAnalysis().getLoopInfo(); auto &AC = getAnalysis().getAssumptionCache(F); auto &TTI = getAnalysis().getTTI(F); + MemorySSA *MSSA = nullptr; + Optional MSSAU; + if (EnableMSSALoopDependency) { + MSSA = &getAnalysis().getMSSA(); + MSSAU = MemorySSAUpdater(MSSA); + } auto *SEWP = getAnalysisIfAvailable(); auto *SE = SEWP ? &SEWP->getSE() : nullptr; @@ -2657,7 +2919,14 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { LPM.markLoopAsDeleted(*L); }; - bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE); + if (MSSA && VerifyMemorySSA) + MSSA->verifyMemorySSA(); + + bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE, + MSSAU.hasValue() ? MSSAU.getPointer() : nullptr); + + if (MSSA && VerifyMemorySSA) + MSSA->verifyMemorySSA(); // If anything was unswitched, also clear any cached information about this // loop. @@ -2677,6 +2946,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch", "Simple unswitch loops", false, false) diff --git a/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/lib/Transforms/Scalar/WarnMissedTransforms.cpp new file mode 100644 index 0000000000000000000000000000000000000000..80f761e537747462adcd7bdbf477a8e5c035c957 --- /dev/null +++ b/lib/Transforms/Scalar/WarnMissedTransforms.cpp @@ -0,0 +1,149 @@ +//===- LoopTransformWarning.cpp - ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Emit warnings if forced code transformations have not been performed. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/WarnMissedTransforms.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "transform-warning" + +/// Emit warnings for forced (i.e. user-defined) loop transformations which have +/// still not been performed. +static void warnAboutLeftoverTransformations(Loop *L, + OptimizationRemarkEmitter *ORE) { + if (hasUnrollTransformation(L) == TM_ForcedByUser) { + LLVM_DEBUG(dbgs() << "Leftover unroll transformation\n"); + ORE->emit( + DiagnosticInfoOptimizationFailure(DEBUG_TYPE, + "FailedRequestedUnrolling", + L->getStartLoc(), L->getHeader()) + << "loop not unrolled: the optimizer was unable to perform the " + "requested transformation; the transformation might be disabled or " + "specified as part of an unsupported transformation ordering"); + } + + if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser) { + LLVM_DEBUG(dbgs() << "Leftover unroll-and-jam transformation\n"); + ORE->emit( + DiagnosticInfoOptimizationFailure(DEBUG_TYPE, + "FailedRequestedUnrollAndJamming", + L->getStartLoc(), L->getHeader()) + << "loop not unroll-and-jammed: the optimizer was unable to perform " + "the requested transformation; the transformation might be disabled " + "or specified as part of an unsupported transformation ordering"); + } + + if (hasVectorizeTransformation(L) == TM_ForcedByUser) { + LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n"); + Optional VectorizeWidth = + getOptionalIntLoopAttribute(L, "llvm.loop.vectorize.width"); + Optional InterleaveCount = + getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count"); + + if (VectorizeWidth.getValueOr(0) != 1) + ORE->emit( + DiagnosticInfoOptimizationFailure(DEBUG_TYPE, + "FailedRequestedVectorization", + L->getStartLoc(), L->getHeader()) + << "loop not vectorized: the optimizer was unable to perform the " + "requested transformation; the transformation might be disabled " + "or specified as part of an unsupported transformation ordering"); + else if (InterleaveCount.getValueOr(0) != 1) + ORE->emit( + DiagnosticInfoOptimizationFailure(DEBUG_TYPE, + "FailedRequestedInterleaving", + L->getStartLoc(), L->getHeader()) + << "loop not interleaved: the optimizer was unable to perform the " + "requested transformation; the transformation might be disabled " + "or specified as part of an unsupported transformation ordering"); + } + + if (hasDistributeTransformation(L) == TM_ForcedByUser) { + LLVM_DEBUG(dbgs() << "Leftover distribute transformation\n"); + ORE->emit( + DiagnosticInfoOptimizationFailure(DEBUG_TYPE, + "FailedRequestedDistribution", + L->getStartLoc(), L->getHeader()) + << "loop not distributed: the optimizer was unable to perform the " + "requested transformation; the transformation might be disabled or " + "specified as part of an unsupported transformation ordering"); + } +} + +static void warnAboutLeftoverTransformations(Function *F, LoopInfo *LI, + OptimizationRemarkEmitter *ORE) { + for (auto *L : LI->getLoopsInPreorder()) + warnAboutLeftoverTransformations(L, ORE); +} + +// New pass manager boilerplate +PreservedAnalyses +WarnMissedTransformationsPass::run(Function &F, FunctionAnalysisManager &AM) { + // Do not warn about not applied transformations if optimizations are + // disabled. + if (F.hasFnAttribute(Attribute::OptimizeNone)) + return PreservedAnalyses::all(); + + auto &ORE = AM.getResult(F); + auto &LI = AM.getResult(F); + + warnAboutLeftoverTransformations(&F, &LI, &ORE); + + return PreservedAnalyses::all(); +} + +// Legacy pass manager boilerplate +namespace { +class WarnMissedTransformationsLegacy : public FunctionPass { +public: + static char ID; + + explicit WarnMissedTransformationsLegacy() : FunctionPass(ID) { + initializeWarnMissedTransformationsLegacyPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + auto &ORE = getAnalysis().getORE(); + auto &LI = getAnalysis().getLoopInfo(); + + warnAboutLeftoverTransformations(&F, &LI, &ORE); + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + + AU.setPreservesAll(); + } +}; +} // end anonymous namespace + +char WarnMissedTransformationsLegacy::ID = 0; + +INITIALIZE_PASS_BEGIN(WarnMissedTransformationsLegacy, "transform-warning", + "Warn about non-applied transformations", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_END(WarnMissedTransformationsLegacy, "transform-warning", + "Warn about non-applied transformations", false, false) + +Pass *llvm::createWarnMissedTransformationsPass() { + return new WarnMissedTransformationsLegacy(); +} diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp index 4db579156d90aa538008434d1b90c0dec6765cd6..e58ddcf34667d5bf6be9cab699d216885c5f1162 100644 --- a/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -393,6 +393,13 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee, // to the correct type. auto CalleeType = Callee->getFunctionType(); auto CalleeParamNum = CalleeType->getNumParams(); + + LLVMContext &Ctx = Callee->getContext(); + const AttributeList &CallerPAL = CS.getAttributes(); + // The new list of argument attributes. + SmallVector NewArgAttrs; + bool AttributeChanged = false; + for (unsigned ArgNo = 0; ArgNo < CalleeParamNum; ++ArgNo) { auto *Arg = CS.getArgument(ArgNo); Type *FormalTy = CalleeType->getParamType(ArgNo); @@ -401,13 +408,31 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee, auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", CS.getInstruction()); CS.setArgument(ArgNo, Cast); - } + + // Remove any incompatible attributes for the argument. + AttrBuilder ArgAttrs(CallerPAL.getParamAttributes(ArgNo)); + ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy)); + NewArgAttrs.push_back(AttributeSet::get(Ctx, ArgAttrs)); + AttributeChanged = true; + } else + NewArgAttrs.push_back(CallerPAL.getParamAttributes(ArgNo)); } // If the return type of the call site doesn't match that of the callee, cast // the returned value to the appropriate type. - if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) + // Remove any incompatible return value attribute. + AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); + if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) { createRetBitCast(CS, CallSiteRetTy, RetBitCast); + RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy)); + AttributeChanged = true; + } + + // Set the new callsite attribute. + if (AttributeChanged) + CS.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(), + AttributeSet::get(Ctx, RAttrs), + NewArgAttrs)); return CS.getInstruction(); } diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 000af808945ae599d2089d5b5a3610a91989a610..8f8c601f5f136b9ff86258710ad15357e2003d8d 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -18,11 +18,11 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DomTreeUpdater.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" @@ -32,6 +32,7 @@ #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include using namespace llvm; @@ -795,11 +796,12 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, /// Duplicate non-Phi instructions from the beginning of block up to /// StopAt instruction into a split block between BB and its predecessor. -BasicBlock * -llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB, - Instruction *StopAt, - ValueToValueMapTy &ValueMapping, - DominatorTree *DT) { +BasicBlock *llvm::DuplicateInstructionsInSplitBetween( + BasicBlock *BB, BasicBlock *PredBB, Instruction *StopAt, + ValueToValueMapTy &ValueMapping, DomTreeUpdater &DTU) { + + assert(count(successors(PredBB), BB) == 1 && + "There must be a single edge between PredBB and BB!"); // We are going to have to map operands from the original BB block to the new // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to // account for entry from PredBB. @@ -807,10 +809,16 @@ llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB, for (; PHINode *PN = dyn_cast(BI); ++BI) ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); - BasicBlock *NewBB = SplitEdge(PredBB, BB, DT); + BasicBlock *NewBB = SplitEdge(PredBB, BB); NewBB->setName(PredBB->getName() + ".split"); Instruction *NewTerm = NewBB->getTerminator(); + // FIXME: SplitEdge does not yet take a DTU, so we include the split edge + // in the update set here. + DTU.applyUpdates({{DominatorTree::Delete, PredBB, BB}, + {DominatorTree::Insert, PredBB, NewBB}, + {DominatorTree::Insert, NewBB, BB}}); + // Clone the non-phi instructions of BB into NewBB, keeping track of the // mapping and using it to remap operands in the cloned instructions. // Stop once we see the terminator too. This covers the case where BB's diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 419e1db08bfd5320e654215daffb999812295431..a6b01107752e5b5ce6d7a08a2980ed7047e34a6f 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -531,10 +531,10 @@ void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, } } -/// severSplitPHINodes - If a PHI node has multiple inputs from outside of the -/// region, we need to split the entry block of the region so that the PHI node -/// is easier to deal with. -void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { +/// severSplitPHINodesOfEntry - If a PHI node has multiple inputs from outside +/// of the region, we need to split the entry block of the region so that the +/// PHI node is easier to deal with. +void CodeExtractor::severSplitPHINodesOfEntry(BasicBlock *&Header) { unsigned NumPredsFromRegion = 0; unsigned NumPredsOutsideRegion = 0; @@ -606,6 +606,56 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { } } +/// severSplitPHINodesOfExits - if PHI nodes in exit blocks have inputs from +/// outlined region, we split these PHIs on two: one with inputs from region +/// and other with remaining incoming blocks; then first PHIs are placed in +/// outlined region. +void CodeExtractor::severSplitPHINodesOfExits( + const SmallPtrSetImpl &Exits) { + for (BasicBlock *ExitBB : Exits) { + BasicBlock *NewBB = nullptr; + + for (PHINode &PN : ExitBB->phis()) { + // Find all incoming values from the outlining region. + SmallVector IncomingVals; + for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) + if (Blocks.count(PN.getIncomingBlock(i))) + IncomingVals.push_back(i); + + // Do not process PHI if there is one (or fewer) predecessor from region. + // If PHI has exactly one predecessor from region, only this one incoming + // will be replaced on codeRepl block, so it should be safe to skip PHI. + if (IncomingVals.size() <= 1) + continue; + + // Create block for new PHIs and add it to the list of outlined if it + // wasn't done before. + if (!NewBB) { + NewBB = BasicBlock::Create(ExitBB->getContext(), + ExitBB->getName() + ".split", + ExitBB->getParent(), ExitBB); + SmallVector Preds(pred_begin(ExitBB), + pred_end(ExitBB)); + for (BasicBlock *PredBB : Preds) + if (Blocks.count(PredBB)) + PredBB->getTerminator()->replaceUsesOfWith(ExitBB, NewBB); + BranchInst::Create(ExitBB, NewBB); + Blocks.insert(NewBB); + } + + // Split this PHI. + PHINode *NewPN = + PHINode::Create(PN.getType(), IncomingVals.size(), + PN.getName() + ".ce", NewBB->getFirstNonPHI()); + for (unsigned i : IncomingVals) + NewPN->addIncoming(PN.getIncomingValue(i), PN.getIncomingBlock(i)); + for (unsigned i : reverse(IncomingVals)) + PN.removeIncomingValue(i, false); + PN.addIncoming(NewPN, NewBB); + } + } +} + void CodeExtractor::splitReturnBlocks() { for (BasicBlock *Block : Blocks) if (ReturnInst *RI = dyn_cast(Block->getTerminator())) { @@ -942,17 +992,15 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, continue; // Find proper insertion point. - Instruction *InsertPt; + BasicBlock::iterator InsertPt; // In case OutI is an invoke, we insert the store at the beginning in the // 'normal destination' BB. Otherwise we insert the store right after OutI. if (auto *InvokeI = dyn_cast(OutI)) - InsertPt = InvokeI->getNormalDest()->getFirstNonPHI(); + InsertPt = InvokeI->getNormalDest()->getFirstInsertionPt(); + else if (auto *Phi = dyn_cast(OutI)) + InsertPt = Phi->getParent()->getFirstInsertionPt(); else - InsertPt = OutI->getNextNode(); - - // Let's assume that there is no other guy interleave non-PHI in PHIs. - if (isa(InsertPt)) - InsertPt = InsertPt->getParent()->getFirstNonPHI(); + InsertPt = std::next(OutI->getIterator()); assert(OAI != newFunction->arg_end() && "Number of output arguments should match " @@ -962,13 +1010,13 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i); GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(), InsertPt); - new StoreInst(outputs[i], GEP, InsertPt); + StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(), &*InsertPt); + new StoreInst(outputs[i], GEP, &*InsertPt); // Since there should be only one struct argument aggregating // all the output values, we shouldn't increment OAI, which always // points to the struct argument, in this case. } else { - new StoreInst(outputs[i], &*OAI, InsertPt); + new StoreInst(outputs[i], &*OAI, &*InsertPt); ++OAI; } } @@ -1173,13 +1221,33 @@ Function *CodeExtractor::extractCodeRegion() { } } - // If we have to split PHI nodes or the entry block, do so now. - severSplitPHINodes(header); - // If we have any return instructions in the region, split those blocks so // that the return is not in the region. splitReturnBlocks(); + // Calculate the exit blocks for the extracted region and the total exit + // weights for each of those blocks. + DenseMap ExitWeights; + SmallPtrSet ExitBlocks; + for (BasicBlock *Block : Blocks) { + for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE; + ++SI) { + if (!Blocks.count(*SI)) { + // Update the branch weight for this successor. + if (BFI) { + BlockFrequency &BF = ExitWeights[*SI]; + BF += BFI->getBlockFreq(Block) * BPI->getEdgeProbability(Block, *SI); + } + ExitBlocks.insert(*SI); + } + } + } + NumExitBlocks = ExitBlocks.size(); + + // If we have to split PHI nodes of the entry or exit blocks, do so now. + severSplitPHINodesOfEntry(header); + severSplitPHINodesOfExits(ExitBlocks); + // This takes place of the original loop BasicBlock *codeReplacer = BasicBlock::Create(header->getContext(), "codeRepl", oldFunction, @@ -1224,25 +1292,6 @@ Function *CodeExtractor::extractCodeRegion() { cast(II)->moveBefore(TI); } - // Calculate the exit blocks for the extracted region and the total exit - // weights for each of those blocks. - DenseMap ExitWeights; - SmallPtrSet ExitBlocks; - for (BasicBlock *Block : Blocks) { - for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE; - ++SI) { - if (!Blocks.count(*SI)) { - // Update the branch weight for this successor. - if (BFI) { - BlockFrequency &BF = ExitWeights[*SI]; - BF += BFI->getBlockFreq(Block) * BPI->getEdgeProbability(Block, *SI); - } - ExitBlocks.insert(*SI); - } - } - } - NumExitBlocks = ExitBlocks.size(); - // Construct new function based on inputs/outputs & add allocas for all defs. Function *newFunction = constructFunction(inputs, outputs, header, newFuncRoot, @@ -1270,8 +1319,8 @@ Function *CodeExtractor::extractCodeRegion() { if (BFI && NumExitBlocks > 1) calculateNewCallTerminatorWeights(codeReplacer, ExitWeights, BPI); - // Loop over all of the PHI nodes in the header block, and change any - // references to the old incoming edge to be the new incoming edge. + // Loop over all of the PHI nodes in the header and exit blocks, and change + // any references to the old incoming edge to be the new incoming edge. for (BasicBlock::iterator I = header->begin(); isa(I); ++I) { PHINode *PN = cast(I); for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) @@ -1279,35 +1328,23 @@ Function *CodeExtractor::extractCodeRegion() { PN->setIncomingBlock(i, newFuncRoot); } - // Look at all successors of the codeReplacer block. If any of these blocks - // had PHI nodes in them, we need to update the "from" block to be the code - // replacer, not the original block in the extracted region. - for (BasicBlock *SuccBB : successors(codeReplacer)) { - for (PHINode &PN : SuccBB->phis()) { + for (BasicBlock *ExitBB : ExitBlocks) + for (PHINode &PN : ExitBB->phis()) { Value *IncomingCodeReplacerVal = nullptr; - SmallVector IncomingValsToRemove; - for (unsigned I = 0, E = PN.getNumIncomingValues(); I != E; ++I) { - BasicBlock *IncomingBB = PN.getIncomingBlock(I); - + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { // Ignore incoming values from outside of the extracted region. - if (!Blocks.count(IncomingBB)) + if (!Blocks.count(PN.getIncomingBlock(i))) continue; // Ensure that there is only one incoming value from codeReplacer. if (!IncomingCodeReplacerVal) { - PN.setIncomingBlock(I, codeReplacer); - IncomingCodeReplacerVal = PN.getIncomingValue(I); - } else { - assert(IncomingCodeReplacerVal == PN.getIncomingValue(I) && + PN.setIncomingBlock(i, codeReplacer); + IncomingCodeReplacerVal = PN.getIncomingValue(i); + } else + assert(IncomingCodeReplacerVal == PN.getIncomingValue(i) && "PHI has two incompatbile incoming values from codeRepl"); - IncomingValsToRemove.push_back(I); - } } - - for (unsigned I : reverse(IncomingValsToRemove)) - PN.removeIncomingValue(I, /*DeletePHIIfEmpty=*/false); } - } // Erase debug info intrinsics. Variable updates within the new function are // invisible to debuggers. This could be improved by defining a DISubprogram @@ -1330,7 +1367,21 @@ Function *CodeExtractor::extractCodeRegion() { DVI->eraseFromParent(); } - LLVM_DEBUG(if (verifyFunction(*newFunction)) - report_fatal_error("verifyFunction failed!")); + // Mark the new function `noreturn` if applicable. Terminators which resume + // exception propagation are treated as returning instructions. This is to + // avoid inserting traps after calls to outlined functions which unwind. + bool doesNotReturn = none_of(*newFunction, [](const BasicBlock &BB) { + const Instruction *Term = BB.getTerminator(); + return isa(Term) || isa(Term); + }); + if (doesNotReturn) + newFunction->setDoesNotReturn(); + + LLVM_DEBUG(if (verifyFunction(*newFunction, &errs())) { + newFunction->dump(); + report_fatal_error("verification of newFunction failed!"); + }); + LLVM_DEBUG(if (verifyFunction(*oldFunction)) + report_fatal_error("verification of oldFunction failed!")); return newFunction; } diff --git a/lib/Transforms/Utils/CtorUtils.cpp b/lib/Transforms/Utils/CtorUtils.cpp index 9a0240144d0808cf492a7c1f18080ed844b0845f..4e7da7d0449f9213c6c6d0a4fe871789c652d901 100644 --- a/lib/Transforms/Utils/CtorUtils.cpp +++ b/lib/Transforms/Utils/CtorUtils.cpp @@ -22,11 +22,10 @@ #define DEBUG_TYPE "ctor_utils" -namespace llvm { +using namespace llvm; -namespace { /// Given a specified llvm.global_ctors list, remove the listed elements. -void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) { +static void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) { // Filter out the initializer elements to remove. ConstantArray *OldCA = cast(GCL->getInitializer()); SmallVector CAList; @@ -64,7 +63,7 @@ void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) { /// Given a llvm.global_ctors list that we can understand, /// return a list of the functions and null terminator as a vector. -std::vector parseGlobalCtors(GlobalVariable *GV) { +static std::vector parseGlobalCtors(GlobalVariable *GV) { if (GV->getInitializer()->isNullValue()) return std::vector(); ConstantArray *CA = cast(GV->getInitializer()); @@ -79,7 +78,7 @@ std::vector parseGlobalCtors(GlobalVariable *GV) { /// Find the llvm.global_ctors list, verifying that all initializers have an /// init priority of 65535. -GlobalVariable *findGlobalCtors(Module &M) { +static GlobalVariable *findGlobalCtors(Module &M) { GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); if (!GV) return nullptr; @@ -112,12 +111,11 @@ GlobalVariable *findGlobalCtors(Module &M) { return GV; } -} // namespace /// Call "ShouldRemove" for every entry in M's global_ctor list and remove the /// entries for which it returns true. Return true if anything changed. -bool optimizeGlobalCtorsList(Module &M, - function_ref ShouldRemove) { +bool llvm::optimizeGlobalCtorsList( + Module &M, function_ref ShouldRemove) { GlobalVariable *GlobalCtors = findGlobalCtors(M); if (!GlobalCtors) return false; @@ -160,5 +158,3 @@ bool optimizeGlobalCtorsList(Module &M, removeGlobalCtors(GlobalCtors, CtorsToRemove); return true; } - -} // End llvm namespace diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index 479816a339d0afac328b7d077ca316821f9c67f6..a9772e31da5095ccfa3415c52d1d6a4d6eaa6424 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -124,7 +124,6 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV, return SGV->getLinkage(); switch (SGV->getLinkage()) { - case GlobalValue::LinkOnceAnyLinkage: case GlobalValue::LinkOnceODRLinkage: case GlobalValue::ExternalLinkage: // External and linkonce definitions are converted to available_externally @@ -144,11 +143,13 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV, // An imported available_externally declaration stays that way. return SGV->getLinkage(); + case GlobalValue::LinkOnceAnyLinkage: case GlobalValue::WeakAnyLinkage: - // Can't import weak_any definitions correctly, or we might change the - // program semantics, since the linker will pick the first weak_any - // definition and importing would change the order they are seen by the - // linker. The module linking caller needs to enforce this. + // Can't import linkonce_any/weak_any definitions correctly, or we might + // change the program semantics, since the linker will pick the first + // linkonce_any/weak_any definition and importing would change the order + // they are seen by the linker. The module linking caller needs to enforce + // this. assert(!doImportAsDefinition(SGV)); // If imported as a declaration, it becomes external_weak. return SGV->getLinkage(); @@ -202,10 +203,26 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV, void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { - // Check the summaries to see if the symbol gets resolved to a known local - // definition. + ValueInfo VI; if (GV.hasName()) { - ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID()); + VI = ImportIndex.getValueInfo(GV.getGUID()); + // Set synthetic function entry counts. + if (VI && ImportIndex.hasSyntheticEntryCounts()) { + if (Function *F = dyn_cast(&GV)) { + if (!F->isDeclaration()) { + for (auto &S : VI.getSummaryList()) { + FunctionSummary *FS = dyn_cast(S->getBaseObject()); + if (FS->modulePath() == M.getModuleIdentifier()) { + F->setEntryCount(Function::ProfileCount(FS->entryCount(), + Function::PCT_Synthetic)); + break; + } + } + } + } + } + // Check the summaries to see if the symbol gets resolved to a known local + // definition. if (VI && VI.isDSOLocal()) { GV.setDSOLocal(true); if (GV.hasDLLImportStorageClass()) @@ -213,6 +230,22 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { } } + // Mark read-only variables which can be imported with specific attribute. + // We can't internalize them now because IRMover will fail to link variable + // definitions to their external declarations during ThinLTO import. We'll + // internalize read-only variables later, after import is finished. + // See internalizeImmutableGVs. + // + // If global value dead stripping is not enabled in summary then + // propagateConstants hasn't been run. We can't internalize GV + // in such case. + if (!GV.isDeclaration() && VI && ImportIndex.withGlobalValueDeadStripping()) { + const auto &SL = VI.getSummaryList(); + auto *GVS = SL.empty() ? nullptr : dyn_cast(SL[0].get()); + if (GVS && GVS->isReadOnly()) + cast(&GV)->addAttribute("thinlto-internalize"); + } + bool DoPromote = false; if (GV.hasLocalLinkage() && ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) { @@ -230,7 +263,7 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { // Remove functions imported as available externally defs from comdats, // as this is a declaration for the linker, and will be dropped eventually. // It is illegal for comdats to contain declarations. - auto *GO = dyn_cast_or_null(&GV); + auto *GO = dyn_cast(&GV); if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) { // The IRMover should not have placed any imported declarations in // a comdat, so the only declaration that should be in a comdat diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 0dcd7371210d75a475719a6fde4c19b37f5d63a1..79343a90a0e3af394cefead8ab3d0ccd821e4a3b 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -476,6 +476,17 @@ void llvm::RecursivelyDeleteTriviallyDeadInstructions( } } +bool llvm::replaceDbgUsesWithUndef(Instruction *I) { + SmallVector DbgUsers; + findDbgUsers(DbgUsers, I); + for (auto *DII : DbgUsers) { + Value *Undef = UndefValue::get(I->getType()); + DII->setOperand(0, MetadataAsValue::get(DII->getContext(), + ValueAsMetadata::get(Undef))); + } + return !DbgUsers.empty(); +} + /// areAllUsesEqual - Check whether the uses of a value are all the same. /// This is similar to Instruction::hasOneUse() except this will also return /// true when there are no uses or multiple uses that all refer to the same @@ -682,10 +693,9 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, // DTU updates: Collect all the edges that enter // PredBB. These dominator edges will be redirected to DestBB. - std::vector Updates; + SmallVector Updates; if (DTU) { - Updates.reserve(1 + (2 * pred_size(PredBB))); Updates.push_back({DominatorTree::Delete, PredBB, DestBB}); for (auto I = pred_begin(PredBB), E = pred_end(PredBB); I != E; ++I) { Updates.push_back({DominatorTree::Delete, *I, PredBB}); @@ -989,9 +999,8 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, LLVM_DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB); - std::vector Updates; + SmallVector Updates; if (DTU) { - Updates.reserve(1 + (2 * pred_size(BB))); Updates.push_back({DominatorTree::Delete, BB, Succ}); // All predecessors of BB will be moved to Succ. for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) { @@ -1288,33 +1297,6 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, return; } - // If an argument is zero extended then use argument directly. The ZExt - // may be zapped by an optimization pass in future. - Argument *ExtendedArg = nullptr; - if (ZExtInst *ZExt = dyn_cast(SI->getOperand(0))) - ExtendedArg = dyn_cast(ZExt->getOperand(0)); - if (SExtInst *SExt = dyn_cast(SI->getOperand(0))) - ExtendedArg = dyn_cast(SExt->getOperand(0)); - if (ExtendedArg) { - // If this DII was already describing only a fragment of a variable, ensure - // that fragment is appropriately narrowed here. - // But if a fragment wasn't used, describe the value as the original - // argument (rather than the zext or sext) so that it remains described even - // if the sext/zext is optimized away. This widens the variable description, - // leaving it up to the consumer to know how the smaller value may be - // represented in a larger register. - if (auto Fragment = DIExpr->getFragmentInfo()) { - unsigned FragmentOffset = Fragment->OffsetInBits; - SmallVector Ops(DIExpr->elements_begin(), - DIExpr->elements_end() - 3); - Ops.push_back(dwarf::DW_OP_LLVM_fragment); - Ops.push_back(FragmentOffset); - const DataLayout &DL = DII->getModule()->getDataLayout(); - Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); - DIExpr = Builder.createExpression(Ops); - } - DV = ExtendedArg; - } if (!LdStHasDebugValue(DIVar, DIExpr, SI)) Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, DII->getDebugLoc(), SI); @@ -1961,6 +1943,7 @@ static void changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr) { NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); NewCall->setDebugLoc(II->getDebugLoc()); + NewCall->copyMetadata(*II); II->replaceAllUsesWith(NewCall); // Follow the call by a branch to the normal destination. diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index 877e0e4dcf900a54e6a409421910ae855c9a4eee..efd8b92e814d0d7cd2651235c90d2fa3293414bc 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -329,12 +329,15 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, /// /// This utility preserves LoopInfo. It will also preserve ScalarEvolution and /// DominatorTree if they are non-null. +/// +/// If RemainderLoop is non-null, it will receive the remainder loop (if +/// required and not fully unrolled). LoopUnrollResult llvm::UnrollLoop( Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime, bool AllowExpensiveTripCount, bool PreserveCondBr, bool PreserveOnlyFirst, unsigned TripMultiple, unsigned PeelCount, bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, bool PreserveLCSSA) { + OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { @@ -468,7 +471,7 @@ LoopUnrollResult llvm::UnrollLoop( if (RuntimeTripCount && TripMultiple % Count != 0 && !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount, EpilogProfitability, UnrollRemainder, LI, SE, - DT, AC, PreserveLCSSA)) { + DT, AC, PreserveLCSSA, RemainderLoop)) { if (Force) RuntimeTripCount = false; else { diff --git a/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/lib/Transforms/Utils/LoopUnrollAndJam.cpp index 8949c603a841e79d3c02d3f517c90911410642ce..b5d80f669fbf669eae07d12f0d3fb598f105b467 100644 --- a/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -167,12 +167,14 @@ static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, isSafeToUnrollAndJam should be used prior to calling this to make sure the unrolling will be valid. Checking profitablility is also advisable. + + If EpilogueLoop is non-null, it receives the epilogue loop (if it was + necessary to create one and not fully unrolled). */ -LoopUnrollResult -llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, - unsigned TripMultiple, bool UnrollRemainder, - LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, - AssumptionCache *AC, OptimizationRemarkEmitter *ORE) { +LoopUnrollResult llvm::UnrollAndJamLoop( + Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, + bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) { // When we enter here we should have already checked that it is safe BasicBlock *Header = L->getHeader(); @@ -196,7 +198,8 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, if (TripMultiple == 1 || TripMultiple % Count != 0) { if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false, /*UseEpilogRemainder*/ true, - UnrollRemainder, LI, SE, DT, AC, true)) { + UnrollRemainder, LI, SE, DT, AC, true, + EpilogueLoop)) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be " "generated when assuming runtime trip count\n"); return LoopUnrollResult::Unmodified; diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 3361883acd0c088b7e8d709bc35bf9c85c30b336..3606ec4b9fcaa9be8edaaf2748f8381dcf0a4cc4 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -380,6 +380,7 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop, } if (CreateRemainderLoop) { Loop *NewLoop = NewLoops[L]; + MDNode *LoopID = NewLoop->getLoopID(); assert(NewLoop && "L should have been cloned"); // Only add loop metadata if the loop is not going to be completely @@ -387,6 +388,16 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop, if (UnrollRemainder) return NewLoop; + Optional NewLoopID = makeFollowupLoopID( + LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); + if (NewLoopID.hasValue()) { + NewLoop->setLoopID(NewLoopID.getValue()); + + // Do not setLoopAlreadyUnrolled if loop attributes have been defined + // explicitly. + return NewLoop; + } + // Add unroll disable metadata to disable future unrolling for this loop. NewLoop->setLoopAlreadyUnrolled(); return NewLoop; @@ -525,10 +536,10 @@ static bool canProfitablyUnrollMultiExitLoop( bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, bool AllowExpensiveTripCount, bool UseEpilogRemainder, - bool UnrollRemainder, - LoopInfo *LI, ScalarEvolution *SE, - DominatorTree *DT, AssumptionCache *AC, - bool PreserveLCSSA) { + bool UnrollRemainder, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, + AssumptionCache *AC, bool PreserveLCSSA, + Loop **ResultLoop) { LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); LLVM_DEBUG(L->dump()); LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n" @@ -911,16 +922,20 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA); } + auto UnrollResult = LoopUnrollResult::Unmodified; if (remainderLoop && UnrollRemainder) { LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n"); - UnrollLoop(remainderLoop, /*Count*/ Count - 1, /*TripCount*/ Count - 1, - /*Force*/ false, /*AllowRuntime*/ false, - /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true, - /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1, - /*PeelCount*/ 0, /*UnrollRemainder*/ false, LI, SE, DT, AC, - /*ORE*/ nullptr, PreserveLCSSA); + UnrollResult = + UnrollLoop(remainderLoop, /*Count*/ Count - 1, /*TripCount*/ Count - 1, + /*Force*/ false, /*AllowRuntime*/ false, + /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true, + /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1, + /*PeelCount*/ 0, /*UnrollRemainder*/ false, LI, SE, DT, AC, + /*ORE*/ nullptr, PreserveLCSSA); } + if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled) + *ResultLoop = remainderLoop; NumRuntimeUnrolled++; return true; } diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index 249869e1bdeecd27f26fbae41a8c4e2177e7e919..1d4f07ff1b9a9548899fe3f88238664786b288e0 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -26,9 +26,11 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DomTreeUpdater.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" @@ -42,6 +44,8 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "loop-utils" +static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced"; + bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) { bool Changed = false; @@ -183,14 +187,8 @@ void llvm::initializeLoopPassPass(PassRegistry &Registry) { INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) } -/// Find string metadata for loop -/// -/// If it has a value (e.g. {"llvm.distribute", 1} return the value as an -/// operand or null otherwise. If the string metadata is not found return -/// Optional's not-a-value. -Optional llvm::findStringMetadataForLoop(Loop *TheLoop, - StringRef Name) { - MDNode *LoopID = TheLoop->getLoopID(); +static Optional findOptionMDForLoopID(MDNode *LoopID, + StringRef Name) { // Return none if LoopID is false. if (!LoopID) return None; @@ -209,18 +207,253 @@ Optional llvm::findStringMetadataForLoop(Loop *TheLoop, continue; // Return true if MDString holds expected MetaData. if (Name.equals(S->getString())) - switch (MD->getNumOperands()) { - case 1: - return nullptr; - case 2: - return &MD->getOperand(1); - default: - llvm_unreachable("loop metadata has 0 or 1 operand"); - } + return MD; } return None; } +static Optional findOptionMDForLoop(const Loop *TheLoop, + StringRef Name) { + return findOptionMDForLoopID(TheLoop->getLoopID(), Name); +} + +/// Find string metadata for loop +/// +/// If it has a value (e.g. {"llvm.distribute", 1} return the value as an +/// operand or null otherwise. If the string metadata is not found return +/// Optional's not-a-value. +Optional llvm::findStringMetadataForLoop(Loop *TheLoop, + StringRef Name) { + auto MD = findOptionMDForLoop(TheLoop, Name).getValueOr(nullptr); + if (!MD) + return None; + switch (MD->getNumOperands()) { + case 1: + return nullptr; + case 2: + return &MD->getOperand(1); + default: + llvm_unreachable("loop metadata has 0 or 1 operand"); + } +} + +static Optional getOptionalBoolLoopAttribute(const Loop *TheLoop, + StringRef Name) { + Optional MD = findOptionMDForLoop(TheLoop, Name); + if (!MD.hasValue()) + return None; + MDNode *OptionNode = MD.getValue(); + if (OptionNode == nullptr) + return None; + switch (OptionNode->getNumOperands()) { + case 1: + // When the value is absent it is interpreted as 'attribute set'. + return true; + case 2: + return mdconst::extract_or_null( + OptionNode->getOperand(1).get()); + } + llvm_unreachable("unexpected number of options"); +} + +static bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name) { + return getOptionalBoolLoopAttribute(TheLoop, Name).getValueOr(false); +} + +llvm::Optional llvm::getOptionalIntLoopAttribute(Loop *TheLoop, + StringRef Name) { + const MDOperand *AttrMD = + findStringMetadataForLoop(TheLoop, Name).getValueOr(nullptr); + if (!AttrMD) + return None; + + ConstantInt *IntMD = mdconst::extract_or_null(AttrMD->get()); + if (!IntMD) + return None; + + return IntMD->getSExtValue(); +} + +Optional llvm::makeFollowupLoopID( + MDNode *OrigLoopID, ArrayRef FollowupOptions, + const char *InheritOptionsExceptPrefix, bool AlwaysNew) { + if (!OrigLoopID) { + if (AlwaysNew) + return nullptr; + return None; + } + + assert(OrigLoopID->getOperand(0) == OrigLoopID); + + bool InheritAllAttrs = !InheritOptionsExceptPrefix; + bool InheritSomeAttrs = + InheritOptionsExceptPrefix && InheritOptionsExceptPrefix[0] != '\0'; + SmallVector MDs; + MDs.push_back(nullptr); + + bool Changed = false; + if (InheritAllAttrs || InheritSomeAttrs) { + for (const MDOperand &Existing : drop_begin(OrigLoopID->operands(), 1)) { + MDNode *Op = cast(Existing.get()); + + auto InheritThisAttribute = [InheritSomeAttrs, + InheritOptionsExceptPrefix](MDNode *Op) { + if (!InheritSomeAttrs) + return false; + + // Skip malformatted attribute metadata nodes. + if (Op->getNumOperands() == 0) + return true; + Metadata *NameMD = Op->getOperand(0).get(); + if (!isa(NameMD)) + return true; + StringRef AttrName = cast(NameMD)->getString(); + + // Do not inherit excluded attributes. + return !AttrName.startswith(InheritOptionsExceptPrefix); + }; + + if (InheritThisAttribute(Op)) + MDs.push_back(Op); + else + Changed = true; + } + } else { + // Modified if we dropped at least one attribute. + Changed = OrigLoopID->getNumOperands() > 1; + } + + bool HasAnyFollowup = false; + for (StringRef OptionName : FollowupOptions) { + MDNode *FollowupNode = + findOptionMDForLoopID(OrigLoopID, OptionName).getValueOr(nullptr); + if (!FollowupNode) + continue; + + HasAnyFollowup = true; + for (const MDOperand &Option : drop_begin(FollowupNode->operands(), 1)) { + MDs.push_back(Option.get()); + Changed = true; + } + } + + // Attributes of the followup loop not specified explicity, so signal to the + // transformation pass to add suitable attributes. + if (!AlwaysNew && !HasAnyFollowup) + return None; + + // If no attributes were added or remove, the previous loop Id can be reused. + if (!AlwaysNew && !Changed) + return OrigLoopID; + + // No attributes is equivalent to having no !llvm.loop metadata at all. + if (MDs.size() == 1) + return nullptr; + + // Build the new loop ID. + MDTuple *FollowupLoopID = MDNode::get(OrigLoopID->getContext(), MDs); + FollowupLoopID->replaceOperandWith(0, FollowupLoopID); + return FollowupLoopID; +} + +bool llvm::hasDisableAllTransformsHint(const Loop *L) { + return getBooleanLoopAttribute(L, LLVMLoopDisableNonforced); +} + +TransformationMode llvm::hasUnrollTransformation(Loop *L) { + if (getBooleanLoopAttribute(L, "llvm.loop.unroll.disable")) + return TM_SuppressedByUser; + + Optional Count = + getOptionalIntLoopAttribute(L, "llvm.loop.unroll.count"); + if (Count.hasValue()) + return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser; + + if (getBooleanLoopAttribute(L, "llvm.loop.unroll.enable")) + return TM_ForcedByUser; + + if (getBooleanLoopAttribute(L, "llvm.loop.unroll.full")) + return TM_ForcedByUser; + + if (hasDisableAllTransformsHint(L)) + return TM_Disable; + + return TM_Unspecified; +} + +TransformationMode llvm::hasUnrollAndJamTransformation(Loop *L) { + if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.disable")) + return TM_SuppressedByUser; + + Optional Count = + getOptionalIntLoopAttribute(L, "llvm.loop.unroll_and_jam.count"); + if (Count.hasValue()) + return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser; + + if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.enable")) + return TM_ForcedByUser; + + if (hasDisableAllTransformsHint(L)) + return TM_Disable; + + return TM_Unspecified; +} + +TransformationMode llvm::hasVectorizeTransformation(Loop *L) { + Optional Enable = + getOptionalBoolLoopAttribute(L, "llvm.loop.vectorize.enable"); + + if (Enable == false) + return TM_SuppressedByUser; + + Optional VectorizeWidth = + getOptionalIntLoopAttribute(L, "llvm.loop.vectorize.width"); + Optional InterleaveCount = + getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count"); + + if (Enable == true) { + // 'Forcing' vector width and interleave count to one effectively disables + // this tranformation. + if (VectorizeWidth == 1 && InterleaveCount == 1) + return TM_SuppressedByUser; + return TM_ForcedByUser; + } + + if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) + return TM_Disable; + + if (VectorizeWidth == 1 && InterleaveCount == 1) + return TM_Disable; + + if (VectorizeWidth > 1 || InterleaveCount > 1) + return TM_Enable; + + if (hasDisableAllTransformsHint(L)) + return TM_Disable; + + return TM_Unspecified; +} + +TransformationMode llvm::hasDistributeTransformation(Loop *L) { + if (getBooleanLoopAttribute(L, "llvm.loop.distribute.enable")) + return TM_ForcedByUser; + + if (hasDisableAllTransformsHint(L)) + return TM_Disable; + + return TM_Unspecified; +} + +TransformationMode llvm::hasLICMVersioningTransformation(Loop *L) { + if (getBooleanLoopAttribute(L, "llvm.loop.licm_versioning.disable")) + return TM_SuppressedByUser; + + if (hasDisableAllTransformsHint(L)) + return TM_Disable; + + return TM_Unspecified; +} + /// Does a BFS from a given node to all of its children inside a given loop. /// The returned vector of nodes includes the starting point. SmallVector @@ -336,6 +569,10 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, DTU.deleteEdge(Preheader, L->getHeader()); } + // Use a map to unique and a vector to guarantee deterministic ordering. + llvm::SmallDenseSet, 4> DeadDebugSet; + llvm::SmallVector DeadDebugInst; + // Given LCSSA form is satisfied, we should not have users of instructions // within the dead loop outside of the loop. However, LCSSA doesn't take // unreachable uses into account. We handle them here. @@ -360,8 +597,27 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, "Unexpected user in reachable block"); U.set(Undef); } + auto *DVI = dyn_cast(&I); + if (!DVI) + continue; + auto Key = DeadDebugSet.find({DVI->getVariable(), DVI->getExpression()}); + if (Key != DeadDebugSet.end()) + continue; + DeadDebugSet.insert({DVI->getVariable(), DVI->getExpression()}); + DeadDebugInst.push_back(DVI); } + // After the loop has been deleted all the values defined and modified + // inside the loop are going to be unavailable. + // Since debug values in the loop have been deleted, inserting an undef + // dbg.value truncates the range of any dbg.value before the loop where the + // loop used to be. This is particularly important for constant values. + DIBuilder DIB(*ExitBlock->getModule()); + for (auto *DVI : DeadDebugInst) + DIB.insertDbgValueIntrinsic( + UndefValue::get(Builder.getInt32Ty()), DVI->getVariable(), + DVI->getExpression(), DVI->getDebugLoc(), ExitBlock->getFirstNonPHI()); + // Remove the block from the reference counting scheme, so that we can // delete it freely later. for (auto *Block : L->blocks()) diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 6773ad7bfc70ad0236d3d3d9f41db16b8ac23937..a53083c31e3f12afa3b8fd7ed98f2020a26b52f3 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -751,14 +751,18 @@ void PromoteMem2Reg::run() { // Ok, now we know that all of the PHI nodes are missing entries for some // basic blocks. Start by sorting the incoming predecessors for efficient // access. - llvm::sort(Preds); + auto CompareBBNumbers = [this](BasicBlock *A, BasicBlock *B) { + return BBNumbers.lookup(A) < BBNumbers.lookup(B); + }; + llvm::sort(Preds, CompareBBNumbers); // Now we loop through all BB's which have entries in SomePHI and remove // them from the Preds list. for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) { // Do a log(n) search of the Preds list for the entry we want. SmallVectorImpl::iterator EntIt = std::lower_bound( - Preds.begin(), Preds.end(), SomePHI->getIncomingBlock(i)); + Preds.begin(), Preds.end(), SomePHI->getIncomingBlock(i), + CompareBBNumbers); assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i) && "PHI node has entry for a block which is not a predecessor!"); diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 849f9ee1d19dde4749c66a67148f7e5f1c308bb1..b98f2fff65eaeb96fd0daad9c6561533ceeed348 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -693,7 +693,7 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) { if (SwitchInst *SI = dyn_cast(TI)) { // Do not permit merging of large switch instructions into their // predecessors unless there is only one predecessor. - if (SI->getNumSuccessors() * pred_size(SI->getParent()) <= 128) + if (!SI->getParent()->hasNPredecessorsOrMore(128 / SI->getNumSuccessors())) CV = SI->getCondition(); } else if (BranchInst *BI = dyn_cast(TI)) if (BI->isConditional() && BI->getCondition()->hasOneUse()) @@ -1372,6 +1372,14 @@ HoistTerminator: } } + // As the parent basic block terminator is a branch instruction which is + // removed at the end of the current transformation, use its previous + // non-debug instruction, as the reference insertion point, which will + // provide the debug location for generated select instructions. For BBs + // with only debug instructions, use an empty debug location. + Instruction *InsertPt = + BIParent->getTerminator()->getPrevNonDebugInstruction(); + // Okay, it is safe to hoist the terminator. Instruction *NT = I1->clone(); BIParent->getInstList().insert(BI->getIterator(), NT); @@ -1381,7 +1389,16 @@ HoistTerminator: NT->takeName(I1); } + // Ensure terminator gets a debug location, even an unknown one, in case + // it involves inlinable calls. + NT->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()); + IRBuilder Builder(NT); + // If an earlier instruction in this BB had a location, adopt it, otherwise + // clear debug locations. + Builder.SetCurrentDebugLocation(InsertPt ? InsertPt->getDebugLoc() + : DebugLoc()); + // Hoisting one of the terminators from our successor is a great thing. // Unfortunately, the successors of the if/else blocks may have PHI nodes in // them. If they do, all PHI entries for BB1/BB2 must agree for all PHI @@ -2874,7 +2891,7 @@ static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB, if (!AlternativeV) break; - assert(pred_size(Succ) == 2); + assert(Succ->hasNPredecessors(2)); auto PredI = pred_begin(Succ); BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI; if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV) @@ -5758,7 +5775,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, // backedge, so we can eliminate BB. bool NeedCanonicalLoop = Options.NeedCanonicalLoop && - (LoopHeaders && pred_size(BB) > 1 && + (LoopHeaders && BB->hasNPredecessorsOrMore(2) && (LoopHeaders->count(BB) || LoopHeaders->count(Succ))); BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && @@ -5837,28 +5854,17 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (SimplifyBranchOnICmpChain(BI, Builder, DL)) return true; - // If this basic block has a single dominating predecessor block and the - // dominating block's condition implies BI's condition, we know the direction - // of the BI branch. - if (BasicBlock *Dom = BB->getSinglePredecessor()) { - auto *PBI = dyn_cast_or_null(Dom->getTerminator()); - if (PBI && PBI->isConditional() && - PBI->getSuccessor(0) != PBI->getSuccessor(1)) { - assert(PBI->getSuccessor(0) == BB || PBI->getSuccessor(1) == BB); - bool CondIsTrue = PBI->getSuccessor(0) == BB; - Optional Implication = isImpliedCondition( - PBI->getCondition(), BI->getCondition(), DL, CondIsTrue); - if (Implication) { - // Turn this into a branch on constant. - auto *OldCond = BI->getCondition(); - ConstantInt *CI = *Implication - ? ConstantInt::getTrue(BB->getContext()) - : ConstantInt::getFalse(BB->getContext()); - BI->setCondition(CI); - RecursivelyDeleteTriviallyDeadInstructions(OldCond); - return requestResimplify(); - } - } + // If this basic block has dominating predecessor blocks and the dominating + // blocks' conditions imply BI's condition, we know the direction of BI. + Optional Imp = isImpliedByDomCondition(BI->getCondition(), BI, DL); + if (Imp) { + // Turn this into a branch on constant. + auto *OldCond = BI->getCondition(); + ConstantInt *TorF = *Imp ? ConstantInt::getTrue(BB->getContext()) + : ConstantInt::getFalse(BB->getContext()); + BI->setCondition(TorF); + RecursivelyDeleteTriviallyDeadInstructions(OldCond); + return requestResimplify(); } // If this basic block is ONLY a compare and a branch, and if a predecessor diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 27a4d241b320080bb52964385e3f979edee3c619..06eaadf58c3fce1f6b9c3531335d23d6ff06d370 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -7,6 +7,7 @@ add_llvm_library(LLVMVectorize VPlan.cpp VPlanHCFGBuilder.cpp VPlanHCFGTransforms.cpp + VPlanSLP.cpp VPlanVerifier.cpp ADDITIONAL_HEADER_DIRS diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index f2344fec3cd3bb3d8611b2d7f89060531b8e18a0..9ff18328c219d4eba51b0f342e5868076ba5caea 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -79,6 +79,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include #include #include @@ -205,12 +206,12 @@ private: unsigned Alignment); }; -class LoadStoreVectorizer : public FunctionPass { +class LoadStoreVectorizerLegacyPass : public FunctionPass { public: static char ID; - LoadStoreVectorizer() : FunctionPass(ID) { - initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry()); + LoadStoreVectorizerLegacyPass() : FunctionPass(ID) { + initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -230,30 +231,23 @@ public: } // end anonymous namespace -char LoadStoreVectorizer::ID = 0; +char LoadStoreVectorizerLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, "Vectorize load and Store instructions", false, false) INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE, +INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, "Vectorize load and store instructions", false, false) Pass *llvm::createLoadStoreVectorizerPass() { - return new LoadStoreVectorizer(); + return new LoadStoreVectorizerLegacyPass(); } -// The real propagateMetadata expects a SmallVector, but we deal in -// vectors of Instructions. -static void propagateMetadata(Instruction *I, ArrayRef IL) { - SmallVector VL(IL.begin(), IL.end()); - propagateMetadata(I, VL); -} - -bool LoadStoreVectorizer::runOnFunction(Function &F) { +bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) { // Don't vectorize when the attribute NoImplicitFloat is used. if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat)) return false; @@ -268,6 +262,30 @@ bool LoadStoreVectorizer::runOnFunction(Function &F) { return V.run(); } +PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { + // Don't vectorize when the attribute NoImplicitFloat is used. + if (F.hasFnAttribute(Attribute::NoImplicitFloat)) + return PreservedAnalyses::all(); + + AliasAnalysis &AA = AM.getResult(F); + DominatorTree &DT = AM.getResult(F); + ScalarEvolution &SE = AM.getResult(F); + TargetTransformInfo &TTI = AM.getResult(F); + + Vectorizer V(F, AA, DT, SE, TTI); + bool Changed = V.run(); + PreservedAnalyses PA; + PA.preserveSet(); + return Changed ? PA : PreservedAnalyses::all(); +} + +// The real propagateMetadata expects a SmallVector, but we deal in +// vectors of Instructions. +static void propagateMetadata(Instruction *I, ArrayRef IL) { + SmallVector VL(IL.begin(), IL.end()); + propagateMetadata(I, VL); +} + // Vectorizer Implementation bool Vectorizer::run() { bool Changed = false; diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 755ad32a7bf5079b871a0b09c53a9f756d508bfe..ac989dd66f72d228d974df9f7b045aab38940dec 100644 --- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -80,10 +80,11 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { return false; } -LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool DisableInterleaving, +LoopVectorizeHints::LoopVectorizeHints(const Loop *L, + bool InterleaveOnlyWhenForced, OptimizationRemarkEmitter &ORE) : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), - Interleave("interleave.count", DisableInterleaving, HK_UNROLL), + Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. @@ -98,19 +99,19 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool DisableInterleaving, // consider the loop to have been already vectorized because there's // nothing more that we can do. IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1; - LLVM_DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() + LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); } -bool LoopVectorizeHints::allowVectorization(Function *F, Loop *L, - bool AlwaysVectorize) const { +bool LoopVectorizeHints::allowVectorization( + Function *F, Loop *L, bool VectorizeOnlyWhenForced) const { if (getForce() == LoopVectorizeHints::FK_Disabled) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); emitRemarkWithHints(); return false; } - if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { + if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); emitRemarkWithHints(); return false; @@ -817,15 +818,14 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (!LAI->canVectorizeMemory()) return false; - if (LAI->hasMultipleStoresToLoopInvariantAddress()) { + if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress") - << "multiple writes to a loop invariant address could not " + << "write to a loop invariant address could not " "be vectorized"); LLVM_DEBUG( - dbgs() << "LV: We don't allow multiple stores to a uniform address\n"); + dbgs() << "LV: Non vectorizable stores to a uniform address\n"); return false; } - Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); PSE.addPredicate(LAI->getPSE().getUnionPredicate()); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c9c70b5c5364a0d9522cae79ebb7bd763a251f87..c74352cf7039b49e70066eac51a10bd250e4b21c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -152,6 +152,16 @@ using namespace llvm; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME +/// @{ +/// Metadata attribute names +static const char *const LLVMLoopVectorizeFollowupAll = + "llvm.loop.vectorize.followup_all"; +static const char *const LLVMLoopVectorizeFollowupVectorized = + "llvm.loop.vectorize.followup_vectorized"; +static const char *const LLVMLoopVectorizeFollowupEpilogue = + "llvm.loop.vectorize.followup_epilogue"; +/// @} + STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); @@ -796,27 +806,6 @@ void InnerLoopVectorizer::addMetadata(ArrayRef To, } } -static void emitMissedWarning(Function *F, Loop *L, - const LoopVectorizeHints &LH, - OptimizationRemarkEmitter *ORE) { - LH.emitRemarkWithHints(); - - if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { - if (LH.getWidth() != 1) - ORE->emit(DiagnosticInfoOptimizationFailure( - DEBUG_TYPE, "FailedRequestedVectorization", - L->getStartLoc(), L->getHeader()) - << "loop not vectorized: " - << "failed explicitly specified loop vectorization"); - else if (LH.getInterleave() != 1) - ORE->emit(DiagnosticInfoOptimizationFailure( - DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(), - L->getHeader()) - << "loop not interleaved: " - << "failed explicitly specified loop interleaving"); - } -} - namespace llvm { /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -976,7 +965,7 @@ public: /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. - void setWideningDecision(const InterleaveGroup *Grp, unsigned VF, + void setWideningDecision(const InterleaveGroup *Grp, unsigned VF, InstWidening W, unsigned Cost) { assert(VF >= 2 && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. @@ -1131,7 +1120,8 @@ public: } /// Get the interleaved access group that \p Instr belongs to. - const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { + const InterleaveGroup * + getInterleavedAccessGroup(Instruction *Instr) { return InterleaveInfo.getInterleaveGroup(Instr); } @@ -1369,14 +1359,15 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp, return false; Function *Fn = OuterLp->getHeader()->getParent(); - if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) { + if (!Hints.allowVectorization(Fn, OuterLp, + true /*VectorizeOnlyWhenForced*/)) { LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); return false; } if (!Hints.getWidth()) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n"); - emitMissedWarning(Fn, OuterLp, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -1384,7 +1375,7 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp, // TODO: Interleave support is future work. LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " "outer loops.\n"); - emitMissedWarning(Fn, OuterLp, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -1425,10 +1416,11 @@ struct LoopVectorize : public FunctionPass { LoopVectorizePass Impl; - explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true) + explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, + bool VectorizeOnlyWhenForced = false) : FunctionPass(ID) { - Impl.DisableUnrolling = NoUnrolling; - Impl.AlwaysVectorize = AlwaysVectorize; + Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; + Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } @@ -1994,7 +1986,8 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, VectorParts *BlockInMask) { - const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr); + const InterleaveGroup *Group = + Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); // Skip if current instruction is not the insert position. @@ -2737,6 +2730,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { BasicBlock *OldBasicBlock = OrigLoop->getHeader(); BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); + MDNode *OrigLoopID = OrigLoop->getLoopID(); assert(VectorPH && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); @@ -2880,6 +2874,17 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; + Optional VectorizedLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized}); + if (VectorizedLoopID.hasValue()) { + Lp->setLoopID(VectorizedLoopID.getValue()); + + // Do not setAlreadyVectorized if loop attributes have been defined + // explicitly. + return LoopVectorPreHeader; + } + // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). if (MDNode *LID = OrigLoop->getLoopID()) @@ -5826,9 +5831,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, auto *Phi = cast(I); // First-order recurrences are replaced by vector shuffles inside the loop. + // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - VectorTy, VF - 1, ToVectorTy(RetTy, 1)); + VectorTy, VF - 1, VectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi @@ -6018,8 +6024,9 @@ INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { -Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { - return new LoopVectorize(NoUnrolling, AlwaysVectorize); +Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, + bool VectorizeOnlyWhenForced) { + return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); } } // end namespace llvm @@ -6376,7 +6383,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan) { - const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I); + const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I); if (!IG) return nullptr; @@ -6792,7 +6799,8 @@ LoopVectorizationPlanner::buildVPlanWithVPRecipes( // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct // member of the IG, do not construct any Recipe for it. - const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr); + const InterleaveGroup *IG = + CM.getInterleavedAccessGroup(Instr); if (IG && Instr != IG->getInsertPos() && Range.Start >= 2 && // Query is illegal for VF == 1 CM.getWideningDecision(Instr, Range.Start) == @@ -7136,7 +7144,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"); - LoopVectorizeHints Hints(L, DisableUnrolling, *ORE); + LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); LLVM_DEBUG( dbgs() << "LV: Loop hints:" @@ -7160,7 +7168,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // less verbose reporting vectorized loops and unvectorized loops that may // benefit from vectorization, respectively. - if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { + if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); return false; } @@ -7173,7 +7181,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { &Requirements, &Hints, DB, AC); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); - emitMissedWarning(F, L, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -7246,7 +7254,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "NoImplicitFloat", L) << "loop not vectorized due to NoImplicitFloat attribute"); - emitMissedWarning(F, L, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -7261,7 +7269,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { ORE->emit( createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) << "loop not vectorized due to unsafe FP support."); - emitMissedWarning(F, L, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -7303,7 +7311,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (Requirements.doesNotMeet(F, L, Hints)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " "requirements.\n"); - emitMissedWarning(F, L, Hints, ORE); + Hints.emitRemarkWithHints(); return false; } @@ -7380,6 +7388,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { LVP.setBestPlan(VF.Width, IC); using namespace ore; + bool DisableRuntimeUnroll = false; + MDNode *OrigLoopID = L->getLoopID(); if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); @@ -7406,7 +7416,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // no runtime checks about strides and memory. A scalar loop that is // rarely used is not worth unrolling. if (!LB.areSafetyChecksAdded()) - AddRuntimeUnrollDisableMetaData(L); + DisableRuntimeUnroll = true; // Report the vectorization decision. ORE->emit([&]() { @@ -7418,8 +7428,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { }); } - // Mark the loop as already vectorized to avoid vectorizing again. - Hints.setAlreadyVectorized(); + Optional RemainderLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupEpilogue}); + if (RemainderLoopID.hasValue()) { + L->setLoopID(RemainderLoopID.getValue()); + } else { + if (DisableRuntimeUnroll) + AddRuntimeUnrollDisableMetaData(L); + + // Mark the loop as already vectorized to avoid vectorizing again. + Hints.setAlreadyVectorized(); + } LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 3592df3ede3d13f187717c122af372fa98694a69..87a861970d10a0e33c9d2ef427c6b50792db24a1 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3643,6 +3643,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { auto &Locs = ExternallyUsedValues[Scalar]; ExternallyUsedValues.insert({Ex, Locs}); ExternallyUsedValues.erase(Scalar); + // Required to update internally referenced instructions. + Scalar->replaceAllUsesWith(Ex); continue; } @@ -5453,7 +5455,7 @@ class HorizontalReduction { } }; - Instruction *ReductionRoot = nullptr; + WeakTrackingVH ReductionRoot; /// The operation data of the reduction operation. OperationData ReductionData; @@ -5738,7 +5740,7 @@ public: unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); Value *VectorizedTree = nullptr; - IRBuilder<> Builder(ReductionRoot); + IRBuilder<> Builder(cast(ReductionRoot)); FastMathFlags Unsafe; Unsafe.setFast(); Builder.setFastMathFlags(Unsafe); @@ -5747,8 +5749,13 @@ public: BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; // The same extra argument may be used several time, so log each attempt // to use it. - for (auto &Pair : ExtraArgs) + for (auto &Pair : ExtraArgs) { + assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); + } + // The reduction root is used as the insertion point for new instructions, + // so set it as externally used to prevent it from being deleted. + ExternallyUsedValues[ReductionRoot]; SmallVector IgnoreList; for (auto &V : ReductionOps) IgnoreList.append(V.begin(), V.end()); @@ -5800,6 +5807,7 @@ public: Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); // Emit a reduction. + Builder.SetInsertPoint(cast(ReductionRoot)); Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); if (VectorizedTree) { @@ -5826,8 +5834,6 @@ public: VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps); } for (auto &Pair : ExternallyUsedValues) { - assert(!Pair.second.empty() && - "At least one DebugLoc must be inserted"); // Add each externally used value to the final reduction. for (auto *I : Pair.second) { Builder.SetCurrentDebugLocation(I->getDebugLoc()); diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp index a3c15a36b05d77d8ba344d07fe7880942cbc1109..05a5400beb4e69e49e26fe6ff988150229c33a4e 100644 --- a/lib/Transforms/Vectorize/VPlan.cpp +++ b/lib/Transforms/Vectorize/VPlan.cpp @@ -338,6 +338,12 @@ void VPInstruction::print(raw_ostream &O) const { case VPInstruction::ICmpULE: O << "icmp ule"; break; + case VPInstruction::SLPLoad: + O << "combined load"; + break; + case VPInstruction::SLPStore: + O << "combined store"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -462,7 +468,7 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, "One successor of a basic block does not lead to the other."); assert(InterimSucc->getSinglePredecessor() && "Interim successor has more than one predecessor."); - assert(pred_size(PostDomSucc) == 2 && + assert(PostDomSucc->hasNPredecessors(2) && "PostDom successor has more than two predecessors."); DT->addNewBlock(InterimSucc, BB); DT->addNewBlock(PostDomSucc, BB); @@ -680,3 +686,55 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, } template void DomTreeBuilder::Calculate(VPDominatorTree &DT); + +void VPValue::replaceAllUsesWith(VPValue *New) { + for (VPUser *User : users()) + for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) + if (User->getOperand(I) == this) + User->setOperand(I, New); +} + +void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, + Old2NewTy &Old2New, + InterleavedAccessInfo &IAI) { + ReversePostOrderTraversal RPOT(Region->getEntry()); + for (VPBlockBase *Base : RPOT) { + visitBlock(Base, Old2New, IAI); + } +} + +void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, + InterleavedAccessInfo &IAI) { + if (VPBasicBlock *VPBB = dyn_cast(Block)) { + for (VPRecipeBase &VPI : *VPBB) { + assert(isa(&VPI) && "Can only handle VPInstructions"); + auto *VPInst = cast(&VPI); + auto *Inst = cast(VPInst->getUnderlyingValue()); + auto *IG = IAI.getInterleaveGroup(Inst); + if (!IG) + continue; + + auto NewIGIter = Old2New.find(IG); + if (NewIGIter == Old2New.end()) + Old2New[IG] = new InterleaveGroup( + IG->getFactor(), IG->isReverse(), IG->getAlignment()); + + if (Inst == IG->getInsertPos()) + Old2New[IG]->setInsertPos(VPInst); + + InterleaveGroupMap[VPInst] = Old2New[IG]; + InterleaveGroupMap[VPInst]->insertMember( + VPInst, IG->getIndex(Inst), + IG->isReverse() ? (-1) * int(IG->getFactor()) : IG->getFactor()); + } + } else if (VPRegionBlock *Region = dyn_cast(Block)) + visitRegion(Region, Old2New, IAI); + else + llvm_unreachable("Unsupported kind of VPBlock."); +} + +VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, + InterleavedAccessInfo &IAI) { + Old2NewTy Old2New; + visitRegion(cast(Plan.getEntry()), Old2New, IAI); +} diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h index 9daaea1acdedd082396cc69d92742f38f5797b3e..5c1b4a83c30e10991dce1adba2ceac4decba0741 100644 --- a/lib/Transforms/Vectorize/VPlan.h +++ b/lib/Transforms/Vectorize/VPlan.h @@ -38,6 +38,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IRBuilder.h" #include #include @@ -52,12 +53,14 @@ class LoopVectorizationCostModel; class BasicBlock; class DominatorTree; class InnerLoopVectorizer; -class InterleaveGroup; +template class InterleaveGroup; +class LoopInfo; class raw_ostream; class Value; class VPBasicBlock; class VPRegionBlock; class VPlan; +class VPlanSlp; /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. The range includes start and excludes end, e.g.,: @@ -607,10 +610,16 @@ public: /// the VPInstruction is also a single def-use vertex. class VPInstruction : public VPUser, public VPRecipeBase { friend class VPlanHCFGTransforms; + friend class VPlanSlp; public: /// VPlan opcodes, extending LLVM IR with idiomatics instructions. - enum { Not = Instruction::OtherOpsEnd + 1, ICmpULE }; + enum { + Not = Instruction::OtherOpsEnd + 1, + ICmpULE, + SLPLoad, + SLPStore, + }; private: typedef unsigned char OpcodeTy; @@ -620,6 +629,13 @@ private: /// modeled instruction. void generateInstruction(VPTransformState &State, unsigned Part); +protected: + Instruction *getUnderlyingInstr() { + return cast_or_null(getUnderlyingValue()); + } + + void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } + public: VPInstruction(unsigned Opcode, ArrayRef Operands) : VPUser(VPValue::VPInstructionSC, Operands), @@ -633,6 +649,11 @@ public: return V->getVPValueID() == VPValue::VPInstructionSC; } + VPInstruction *clone() const { + SmallVector Operands(operands()); + return new VPInstruction(Opcode, Operands); + } + /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *R) { return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC; @@ -650,6 +671,14 @@ public: /// Print the VPInstruction. void print(raw_ostream &O) const; + + /// Return true if this instruction may modify memory. + bool mayWriteToMemory() const { + // TODO: we can use attributes of the called function to rule out memory + // modifications. + return Opcode == Instruction::Store || Opcode == Instruction::Call || + Opcode == Instruction::Invoke || Opcode == SLPStore; + } }; /// VPWidenRecipe is a recipe for producing a copy of vector type for each @@ -771,11 +800,11 @@ public: /// or stores into one wide load/store and shuffles. class VPInterleaveRecipe : public VPRecipeBase { private: - const InterleaveGroup *IG; + const InterleaveGroup *IG; std::unique_ptr User; public: - VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask) + VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask) : VPRecipeBase(VPInterleaveSC), IG(IG) { if (Mask) // Create a VPInstruction to register as a user of the mask. User.reset(new VPUser({Mask})); @@ -793,7 +822,7 @@ public: /// Print the recipe. void print(raw_ostream &O, const Twine &Indent) const override; - const InterleaveGroup *getInterleaveGroup() { return IG; } + const InterleaveGroup *getInterleaveGroup() { return IG; } }; /// VPReplicateRecipe replicates a given instruction producing multiple scalar @@ -1464,6 +1493,144 @@ public: } }; +class VPInterleavedAccessInfo { +private: + DenseMap *> + InterleaveGroupMap; + + /// Type for mapping of instruction based interleave groups to VPInstruction + /// interleave groups + using Old2NewTy = DenseMap *, + InterleaveGroup *>; + + /// Recursively \p Region and populate VPlan based interleave groups based on + /// \p IAI. + void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New, + InterleavedAccessInfo &IAI); + /// Recursively traverse \p Block and populate VPlan based interleave groups + /// based on \p IAI. + void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, + InterleavedAccessInfo &IAI); + +public: + VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI); + + ~VPInterleavedAccessInfo() { + SmallPtrSet *, 4> DelSet; + // Avoid releasing a pointer twice. + for (auto &I : InterleaveGroupMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + + /// Get the interleave group that \p Instr belongs to. + /// + /// \returns nullptr if doesn't have such group. + InterleaveGroup * + getInterleaveGroup(VPInstruction *Instr) const { + if (InterleaveGroupMap.count(Instr)) + return InterleaveGroupMap.find(Instr)->second; + return nullptr; + } +}; + +/// Class that maps (parts of) an existing VPlan to trees of combined +/// VPInstructions. +class VPlanSlp { +private: + enum class OpMode { Failed, Load, Opcode }; + + /// A DenseMapInfo implementation for using SmallVector as + /// DenseMap keys. + struct BundleDenseMapInfo { + static SmallVector getEmptyKey() { + return {reinterpret_cast(-1)}; + } + + static SmallVector getTombstoneKey() { + return {reinterpret_cast(-2)}; + } + + static unsigned getHashValue(const SmallVector &V) { + return static_cast(hash_combine_range(V.begin(), V.end())); + } + + static bool isEqual(const SmallVector &LHS, + const SmallVector &RHS) { + return LHS == RHS; + } + }; + + /// Mapping of values in the original VPlan to a combined VPInstruction. + DenseMap, VPInstruction *, BundleDenseMapInfo> + BundleToCombined; + + VPInterleavedAccessInfo &IAI; + + /// Basic block to operate on. For now, only instructions in a single BB are + /// considered. + const VPBasicBlock &BB; + + /// Indicates whether we managed to combine all visited instructions or not. + bool CompletelySLP = true; + + /// Width of the widest combined bundle in bits. + unsigned WidestBundleBits = 0; + + using MultiNodeOpTy = + typename std::pair>; + + // Input operand bundles for the current multi node. Each multi node operand + // bundle contains values not matching the multi node's opcode. They will + // be reordered in reorderMultiNodeOps, once we completed building a + // multi node. + SmallVector MultiNodeOps; + + /// Indicates whether we are building a multi node currently. + bool MultiNodeActive = false; + + /// Check if we can vectorize Operands together. + bool areVectorizable(ArrayRef Operands) const; + + /// Add combined instruction \p New for the bundle \p Operands. + void addCombined(ArrayRef Operands, VPInstruction *New); + + /// Indicate we hit a bundle we failed to combine. Returns nullptr for now. + VPInstruction *markFailed(); + + /// Reorder operands in the multi node to maximize sequential memory access + /// and commutative operations. + SmallVector reorderMultiNodeOps(); + + /// Choose the best candidate to use for the lane after \p Last. The set of + /// candidates to choose from are values with an opcode matching \p Last's + /// or loads consecutive to \p Last. + std::pair getBest(OpMode Mode, VPValue *Last, + SmallPtrSetImpl &Candidates, + VPInterleavedAccessInfo &IAI); + + /// Print bundle \p Values to dbgs(). + void dumpBundle(ArrayRef Values); + +public: + VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {} + + ~VPlanSlp() { + for (auto &KV : BundleToCombined) + delete KV.second; + } + + /// Tries to build an SLP tree rooted at \p Operands and returns a + /// VPInstruction combining \p Operands, if they can be combined. + VPInstruction *buildGraph(ArrayRef Operands); + + /// Return the width of the widest combined bundle in bits. + unsigned getWidestBundleBits() const { return WidestBundleBits; } + + /// Return true if all visited instruction can be combined. + bool isCompletelySLP() const { return CompletelySLP; } +}; } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H diff --git a/lib/Transforms/Vectorize/VPlanSLP.cpp b/lib/Transforms/Vectorize/VPlanSLP.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ad3a85a6f760fca5cc6e8dae88f10fe73018a512 --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -0,0 +1,468 @@ +//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// This file implements SLP analysis based on VPlan. The analysis is based on +/// the ideas described in +/// +/// Look-ahead SLP: auto-vectorization in the presence of commutative +/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, +/// Luís F. W. Góes +/// +//===----------------------------------------------------------------------===// + +#include "VPlan.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "vplan-slp" + +// Number of levels to look ahead when re-ordering multi node operands. +static unsigned LookaheadMaxDepth = 5; + +VPInstruction *VPlanSlp::markFailed() { + // FIXME: Currently this is used to signal we hit instructions we cannot + // trivially SLP'ize. + CompletelySLP = false; + return nullptr; +} + +void VPlanSlp::addCombined(ArrayRef Operands, VPInstruction *New) { + if (all_of(Operands, [](VPValue *V) { + return cast(V)->getUnderlyingInstr(); + })) { + unsigned BundleSize = 0; + for (VPValue *V : Operands) { + Type *T = cast(V)->getUnderlyingInstr()->getType(); + assert(!T->isVectorTy() && "Only scalar types supported for now"); + BundleSize += T->getScalarSizeInBits(); + } + WidestBundleBits = std::max(WidestBundleBits, BundleSize); + } + + auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New); + assert(Res.second && + "Already created a combined instruction for the operand bundle"); + (void)Res; +} + +bool VPlanSlp::areVectorizable(ArrayRef Operands) const { + // Currently we only support VPInstructions. + if (!all_of(Operands, [](VPValue *Op) { + return Op && isa(Op) && + cast(Op)->getUnderlyingInstr(); + })) { + LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n"); + return false; + } + + // Check if opcodes and type width agree for all instructions in the bundle. + // FIXME: Differing widths/opcodes can be handled by inserting additional + // instructions. + // FIXME: Deal with non-primitive types. + const Instruction *OriginalInstr = + cast(Operands[0])->getUnderlyingInstr(); + unsigned Opcode = OriginalInstr->getOpcode(); + unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits(); + if (!all_of(Operands, [Opcode, Width](VPValue *Op) { + const Instruction *I = cast(Op)->getUnderlyingInstr(); + return I->getOpcode() == Opcode && + I->getType()->getPrimitiveSizeInBits() == Width; + })) { + LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n"); + return false; + } + + // For now, all operands must be defined in the same BB. + if (any_of(Operands, [this](VPValue *Op) { + return cast(Op)->getParent() != &this->BB; + })) { + LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n"); + return false; + } + + if (any_of(Operands, + [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) { + LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n"); + return false; + } + + // For loads, check that there are no instructions writing to memory in + // between them. + // TODO: we only have to forbid instructions writing to memory that could + // interfere with any of the loads in the bundle + if (Opcode == Instruction::Load) { + unsigned LoadsSeen = 0; + VPBasicBlock *Parent = cast(Operands[0])->getParent(); + for (auto &I : *Parent) { + auto *VPI = cast(&I); + if (VPI->getOpcode() == Instruction::Load && + std::find(Operands.begin(), Operands.end(), VPI) != Operands.end()) + LoadsSeen++; + + if (LoadsSeen == Operands.size()) + break; + if (LoadsSeen > 0 && VPI->mayWriteToMemory()) { + LLVM_DEBUG( + dbgs() << "VPSLP: instruction modifying memory between loads\n"); + return false; + } + } + + if (!all_of(Operands, [](VPValue *Op) { + return cast(cast(Op)->getUnderlyingInstr()) + ->isSimple(); + })) { + LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n"); + return false; + } + } + + if (Opcode == Instruction::Store) + if (!all_of(Operands, [](VPValue *Op) { + return cast(cast(Op)->getUnderlyingInstr()) + ->isSimple(); + })) { + LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n"); + return false; + } + + return true; +} + +static SmallVector getOperands(ArrayRef Values, + unsigned OperandIndex) { + SmallVector Operands; + for (VPValue *V : Values) { + auto *U = cast(V); + Operands.push_back(U->getOperand(OperandIndex)); + } + return Operands; +} + +static bool areCommutative(ArrayRef Values) { + return Instruction::isCommutative( + cast(Values[0])->getOpcode()); +} + +static SmallVector, 4> +getOperands(ArrayRef Values) { + SmallVector, 4> Result; + auto *VPI = cast(Values[0]); + + switch (VPI->getOpcode()) { + case Instruction::Load: + llvm_unreachable("Loads terminate a tree, no need to get operands"); + case Instruction::Store: + Result.push_back(getOperands(Values, 0)); + break; + default: + for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I) + Result.push_back(getOperands(Values, I)); + break; + } + + return Result; +} + +/// Returns the opcode of Values or ~0 if they do not all agree. +static Optional getOpcode(ArrayRef Values) { + unsigned Opcode = cast(Values[0])->getOpcode(); + if (any_of(Values, [Opcode](VPValue *V) { + return cast(V)->getOpcode() != Opcode; + })) + return None; + return {Opcode}; +} + +/// Returns true if A and B access sequential memory if they are loads or +/// stores or if they have identical opcodes otherwise. +static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B, + VPInterleavedAccessInfo &IAI) { + if (A->getOpcode() != B->getOpcode()) + return false; + + if (A->getOpcode() != Instruction::Load && + A->getOpcode() != Instruction::Store) + return true; + auto *GA = IAI.getInterleaveGroup(A); + auto *GB = IAI.getInterleaveGroup(B); + + return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B); +} + +/// Implements getLAScore from Listing 7 in the paper. +/// Traverses and compares operands of V1 and V2 to MaxLevel. +static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, + VPInterleavedAccessInfo &IAI) { + if (!isa(V1) || !isa(V2)) + return 0; + + if (MaxLevel == 0) + return (unsigned)areConsecutiveOrMatch(cast(V1), + cast(V2), IAI); + + unsigned Score = 0; + for (unsigned I = 0, EV1 = cast(V1)->getNumOperands(); I < EV1; ++I) + for (unsigned J = 0, EV2 = cast(V2)->getNumOperands(); J < EV2; ++J) + Score += getLAScore(cast(V1)->getOperand(I), + cast(V2)->getOperand(J), MaxLevel - 1, IAI); + return Score; +} + +std::pair +VPlanSlp::getBest(OpMode Mode, VPValue *Last, + SmallPtrSetImpl &Candidates, + VPInterleavedAccessInfo &IAI) { + assert((Mode == OpMode::Load || Mode == OpMode::Opcode) && + "Currently we only handle load and commutative opcodes"); + LLVM_DEBUG(dbgs() << " getBest\n"); + + SmallVector BestCandidates; + LLVM_DEBUG(dbgs() << " Candidates for " + << *cast(Last)->getUnderlyingInstr() << " "); + for (auto *Candidate : Candidates) { + auto *LastI = cast(Last); + auto *CandidateI = cast(Candidate); + if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) { + LLVM_DEBUG(dbgs() << *cast(Candidate)->getUnderlyingInstr() + << " "); + BestCandidates.push_back(Candidate); + } + } + LLVM_DEBUG(dbgs() << "\n"); + + if (BestCandidates.empty()) + return {OpMode::Failed, nullptr}; + + if (BestCandidates.size() == 1) + return {Mode, BestCandidates[0]}; + + VPValue *Best = nullptr; + unsigned BestScore = 0; + for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) { + unsigned PrevScore = ~0u; + bool AllSame = true; + + // FIXME: Avoid visiting the same operands multiple times. + for (auto *Candidate : BestCandidates) { + unsigned Score = getLAScore(Last, Candidate, Depth, IAI); + if (PrevScore == ~0u) + PrevScore = Score; + if (PrevScore != Score) + AllSame = false; + PrevScore = Score; + + if (Score > BestScore) { + BestScore = Score; + Best = Candidate; + } + } + if (!AllSame) + break; + } + LLVM_DEBUG(dbgs() << "Found best " + << *cast(Best)->getUnderlyingInstr() + << "\n"); + Candidates.erase(Best); + + return {Mode, Best}; +} + +SmallVector VPlanSlp::reorderMultiNodeOps() { + SmallVector FinalOrder; + SmallVector Mode; + FinalOrder.reserve(MultiNodeOps.size()); + Mode.reserve(MultiNodeOps.size()); + + LLVM_DEBUG(dbgs() << "Reordering multinode\n"); + + for (auto &Operands : MultiNodeOps) { + FinalOrder.push_back({Operands.first, {Operands.second[0]}}); + if (cast(Operands.second[0])->getOpcode() == + Instruction::Load) + Mode.push_back(OpMode::Load); + else + Mode.push_back(OpMode::Opcode); + } + + for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) { + LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n"); + SmallPtrSet Candidates; + LLVM_DEBUG(dbgs() << " Candidates "); + for (auto Ops : MultiNodeOps) { + LLVM_DEBUG( + dbgs() << *cast(Ops.second[Lane])->getUnderlyingInstr() + << " "); + Candidates.insert(Ops.second[Lane]); + } + LLVM_DEBUG(dbgs() << "\n"); + + for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) { + LLVM_DEBUG(dbgs() << " Checking " << Op << "\n"); + if (Mode[Op] == OpMode::Failed) + continue; + + VPValue *Last = FinalOrder[Op].second[Lane - 1]; + std::pair Res = + getBest(Mode[Op], Last, Candidates, IAI); + if (Res.second) + FinalOrder[Op].second.push_back(Res.second); + else + // TODO: handle this case + FinalOrder[Op].second.push_back(markFailed()); + } + } + + return FinalOrder; +} + +void VPlanSlp::dumpBundle(ArrayRef Values) { + dbgs() << " Ops: "; + for (auto Op : Values) + if (auto *Instr = cast_or_null(Op)->getUnderlyingInstr()) + dbgs() << *Instr << " | "; + else + dbgs() << " nullptr | "; + dbgs() << "\n"; +} + +VPInstruction *VPlanSlp::buildGraph(ArrayRef Values) { + assert(!Values.empty() && "Need some operands!"); + + // If we already visited this instruction bundle, re-use the existing node + auto I = BundleToCombined.find(to_vector<4>(Values)); + if (I != BundleToCombined.end()) { +#ifndef NDEBUG + // Check that the resulting graph is a tree. If we re-use a node, this means + // its values have multiple users. We only allow this, if all users of each + // value are the same instruction. + for (auto *V : Values) { + auto UI = V->user_begin(); + auto *FirstUser = *UI++; + while (UI != V->user_end()) { + assert(*UI == FirstUser && "Currently we only support SLP trees."); + UI++; + } + } +#endif + return I->second; + } + + // Dump inputs + LLVM_DEBUG({ + dbgs() << "buildGraph: "; + dumpBundle(Values); + }); + + if (!areVectorizable(Values)) + return markFailed(); + + assert(getOpcode(Values) && "Opcodes for all values must match"); + unsigned ValuesOpcode = getOpcode(Values).getValue(); + + SmallVector CombinedOperands; + if (areCommutative(Values)) { + bool MultiNodeRoot = !MultiNodeActive; + MultiNodeActive = true; + for (auto &Operands : getOperands(Values)) { + LLVM_DEBUG({ + dbgs() << " Visiting Commutative"; + dumpBundle(Operands); + }); + + auto OperandsOpcode = getOpcode(Operands); + if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) { + LLVM_DEBUG(dbgs() << " Same opcode, continue building\n"); + CombinedOperands.push_back(buildGraph(Operands)); + } else { + LLVM_DEBUG(dbgs() << " Adding multinode Ops\n"); + // Create dummy VPInstruction, which will we replace later by the + // re-ordered operand. + VPInstruction *Op = new VPInstruction(0, {}); + CombinedOperands.push_back(Op); + MultiNodeOps.emplace_back(Op, Operands); + } + } + + if (MultiNodeRoot) { + LLVM_DEBUG(dbgs() << "Reorder \n"); + MultiNodeActive = false; + + auto FinalOrder = reorderMultiNodeOps(); + + MultiNodeOps.clear(); + for (auto &Ops : FinalOrder) { + VPInstruction *NewOp = buildGraph(Ops.second); + Ops.first->replaceAllUsesWith(NewOp); + for (unsigned i = 0; i < CombinedOperands.size(); i++) + if (CombinedOperands[i] == Ops.first) + CombinedOperands[i] = NewOp; + delete Ops.first; + Ops.first = NewOp; + } + LLVM_DEBUG(dbgs() << "Found final order\n"); + } + } else { + LLVM_DEBUG(dbgs() << " NonCommuntative\n"); + if (ValuesOpcode == Instruction::Load) + for (VPValue *V : Values) + CombinedOperands.push_back(cast(V)->getOperand(0)); + else + for (auto &Operands : getOperands(Values)) + CombinedOperands.push_back(buildGraph(Operands)); + } + + unsigned Opcode; + switch (ValuesOpcode) { + case Instruction::Load: + Opcode = VPInstruction::SLPLoad; + break; + case Instruction::Store: + Opcode = VPInstruction::SLPStore; + break; + default: + Opcode = ValuesOpcode; + break; + } + + if (!CompletelySLP) + return markFailed(); + + assert(CombinedOperands.size() > 0 && "Need more some operands"); + auto *VPI = new VPInstruction(Opcode, CombinedOperands); + VPI->setUnderlyingInstr(cast(Values[0])->getUnderlyingInstr()); + + LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs()); + cast(Values[0])->print(dbgs()); dbgs() << "\n"); + addCombined(Values, VPI); + return VPI; +} diff --git a/lib/Transforms/Vectorize/VPlanValue.h b/lib/Transforms/Vectorize/VPlanValue.h index a81044a5e1b8d5e66eee1b1673c3eb504d377991..b473579b699fb751b595bc3f0de79a1fd9da5aed 100644 --- a/lib/Transforms/Vectorize/VPlanValue.h +++ b/lib/Transforms/Vectorize/VPlanValue.h @@ -40,6 +40,7 @@ class VPValue { friend class VPBuilder; friend class VPlanHCFGTransforms; friend class VPBasicBlock; + friend class VPInterleavedAccessInfo; private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -105,6 +106,20 @@ public: const_user_range users() const { return const_user_range(user_begin(), user_end()); } + + /// Returns true if the value has more than one unique user. + bool hasMoreThanOneUniqueUser() { + if (getNumUsers() == 0) + return false; + + // Check if all users match the first user. + auto Current = std::next(user_begin()); + while (Current != user_end() && *user_begin() == *Current) + Current++; + return Current != user_end(); + } + + void replaceAllUsesWith(VPValue *New); }; typedef DenseMap Value2VPValueTy; @@ -150,6 +165,8 @@ public: return Operands[N]; } + void setOperand(unsigned I, VPValue *New) { Operands[I] = New; } + typedef SmallVectorImpl::iterator operand_iterator; typedef SmallVectorImpl::const_iterator const_operand_iterator; typedef iterator_range operand_range; diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index f62a885583285e25f1a00525e66000033386d09a..559ab1968844e4b7af5c9b85a6f1368add3a5ef5 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -27,7 +27,7 @@ using namespace llvm; void llvm::initializeVectorization(PassRegistry &Registry) { initializeLoopVectorizePass(Registry); initializeSLPVectorizerPass(Registry); - initializeLoadStoreVectorizerPass(Registry); + initializeLoadStoreVectorizerLegacyPassPass(Registry); } void LLVMInitializeVectorization(LLVMPassRegistryRef R) { diff --git a/lib/XRay/FDRRecordProducer.cpp b/lib/XRay/FDRRecordProducer.cpp index 122578010c435acbed71546374a45224f6c7630a..25b3ee8af219353a86901ec489a737e87f228417 100644 --- a/lib/XRay/FDRRecordProducer.cpp +++ b/lib/XRay/FDRRecordProducer.cpp @@ -9,29 +9,32 @@ #include "llvm/XRay/FDRRecordProducer.h" #include "llvm/Support/DataExtractor.h" +#include + namespace llvm { namespace xray { namespace { +// Keep this in sync with the values written in the XRay FDR mode runtime in +// compiler-rt. +enum MetadataRecordKinds : uint8_t { + NewBufferKind, + EndOfBufferKind, + NewCPUIdKind, + TSCWrapKind, + WalltimeMarkerKind, + CustomEventMarkerKind, + CallArgumentKind, + BufferExtentsKind, + TypedEventMarkerKind, + PidKind, + // This is an end marker, used to identify the upper bound for this enum. + EnumEndMarker, +}; + Expected> metadataRecordType(const XRayFileHeader &Header, uint8_t T) { - // Keep this in sync with the values written in the XRay FDR mode runtime in - // compiler-rt. - enum MetadataRecordKinds : uint8_t { - NewBufferKind, - EndOfBufferKind, - NewCPUIdKind, - TSCWrapKind, - WalltimeMarkerKind, - CustomEventMarkerKind, - CallArgumentKind, - BufferExtentsKind, - TypedEventMarkerKind, - PidKind, - // This is an end marker, used to identify the upper bound for this enum. - EnumEndMarker, - }; if (T >= static_cast(MetadataRecordKinds::EnumEndMarker)) return createStringError(std::make_error_code(std::errc::invalid_argument), @@ -70,9 +73,70 @@ metadataRecordType(const XRayFileHeader &Header, uint8_t T) { llvm_unreachable("Unhandled MetadataRecordKinds enum value"); } +constexpr bool isMetadataIntroducer(uint8_t FirstByte) { + return FirstByte & 0x01u; +} + } // namespace +Expected> +FileBasedRecordProducer::findNextBufferExtent() { + // We seek one byte at a time until we find a suitable buffer extents metadata + // record introducer. + std::unique_ptr R; + while (!R) { + auto PreReadOffset = OffsetPtr; + uint8_t FirstByte = E.getU8(&OffsetPtr); + if (OffsetPtr == PreReadOffset) + return createStringError( + std::make_error_code(std::errc::executable_format_error), + "Failed reading one byte from offset %d.", OffsetPtr); + + if (isMetadataIntroducer(FirstByte)) { + auto LoadedType = FirstByte >> 1; + if (LoadedType == MetadataRecordKinds::BufferExtentsKind) { + auto MetadataRecordOrErr = metadataRecordType(Header, LoadedType); + if (!MetadataRecordOrErr) + return MetadataRecordOrErr.takeError(); + + R = std::move(MetadataRecordOrErr.get()); + RecordInitializer RI(E, OffsetPtr); + if (auto Err = R->apply(RI)) + return std::move(Err); + return std::move(R); + } + } + } + llvm_unreachable("Must always terminate with either an error or a record."); +} + Expected> FileBasedRecordProducer::produce() { + // First, we set up our result record. + std::unique_ptr R; + + // Before we do any further reading, we should check whether we're at the end + // of the current buffer we're been consuming. In FDR logs version >= 3, we + // rely on the buffer extents record to determine how many bytes we should be + // considering as valid records. + if (Header.Version >= 3 && CurrentBufferBytes == 0) { + // Find the next buffer extents record. + auto BufferExtentsOrError = findNextBufferExtent(); + if (!BufferExtentsOrError) + return joinErrors( + BufferExtentsOrError.takeError(), + createStringError( + std::make_error_code(std::errc::executable_format_error), + "Failed to find the next BufferExtents record.")); + + R = std::move(BufferExtentsOrError.get()); + assert(R != nullptr); + assert(isa(R.get())); + auto BE = dyn_cast(R.get()); + CurrentBufferBytes = BE->size(); + return std::move(R); + } + + // // At the top level, we read one byte to determine the type of the record to // create. This byte will comprise of the following bits: // @@ -90,11 +154,8 @@ Expected> FileBasedRecordProducer::produce() { std::make_error_code(std::errc::executable_format_error), "Failed reading one byte from offset %d.", OffsetPtr); - // Set up our result record. - std::unique_ptr R; - // For metadata records, handle especially here. - if (FirstByte & 0x01) { + if (isMetadataIntroducer(FirstByte)) { auto LoadedType = FirstByte >> 1; auto MetadataRecordOrErr = metadataRecordType(Header, LoadedType); if (!MetadataRecordOrErr) @@ -113,6 +174,22 @@ Expected> FileBasedRecordProducer::produce() { if (auto Err = R->apply(RI)) return std::move(Err); + // If we encountered a BufferExtents record, we should record the remaining + // bytes for the current buffer, to determine when we should start ignoring + // potentially malformed data and looking for buffer extents records. + if (auto BE = dyn_cast(R.get())) { + CurrentBufferBytes = BE->size(); + } else if (Header.Version >= 3) { + if (OffsetPtr - PreReadOffset > CurrentBufferBytes) + return createStringError( + std::make_error_code(std::errc::executable_format_error), + "Buffer over-read at offset %d (over-read by %d bytes); Record Type " + "= %s.", + OffsetPtr, (OffsetPtr - PreReadOffset) - CurrentBufferBytes, + Record::kindToString(R->getRecordType()).data()); + + CurrentBufferBytes -= OffsetPtr - PreReadOffset; + } assert(R != nullptr); return std::move(R); } diff --git a/lib/XRay/FDRRecords.cpp b/lib/XRay/FDRRecords.cpp index 2b68a73686fbf95713cb71e2e96922e633c2f3ea..2a40d5e0622937d6a42208452a167b6a4f01629b 100644 --- a/lib/XRay/FDRRecords.cpp +++ b/lib/XRay/FDRRecords.cpp @@ -29,5 +29,39 @@ Error FunctionRecord::apply(RecordVisitor &V) { return V.visit(*this); } Error CustomEventRecordV5::apply(RecordVisitor &V) { return V.visit(*this); } Error TypedEventRecord::apply(RecordVisitor &V) { return V.visit(*this); } +StringRef Record::kindToString(RecordKind K) { + switch (K) { + case RecordKind::RK_Metadata: + return "Metadata"; + case RecordKind::RK_Metadata_BufferExtents: + return "Metadata:BufferExtents"; + case RecordKind::RK_Metadata_WallClockTime: + return "Metadata:WallClockTime"; + case RecordKind::RK_Metadata_NewCPUId: + return "Metadata:NewCPUId"; + case RecordKind::RK_Metadata_TSCWrap: + return "Metadata:TSCWrap"; + case RecordKind::RK_Metadata_CustomEvent: + return "Metadata:CustomEvent"; + case RecordKind::RK_Metadata_CustomEventV5: + return "Metadata:CustomEventV5"; + case RecordKind::RK_Metadata_CallArg: + return "Metadata:CallArg"; + case RecordKind::RK_Metadata_PIDEntry: + return "Metadata:PIDEntry"; + case RecordKind::RK_Metadata_NewBuffer: + return "Metadata:NewBuffer"; + case RecordKind::RK_Metadata_EndOfBuffer: + return "Metadata:EndOfBuffer"; + case RecordKind::RK_Metadata_TypedEvent: + return "Metadata:TypedEvent"; + case RecordKind::RK_Metadata_LastMetadata: + return "Metadata:LastMetadata"; + case RecordKind::RK_Function: + return "Function"; + } + return "Unknown"; +} + } // namespace xray } // namespace llvm diff --git a/lib/XRay/FDRTraceWriter.cpp b/lib/XRay/FDRTraceWriter.cpp index d5f9697998638396c53aa04d1f2daf55635503f0..c5224f4be094d30c7fa551ea08418d884de51457 100644 --- a/lib/XRay/FDRTraceWriter.cpp +++ b/lib/XRay/FDRTraceWriter.cpp @@ -43,7 +43,9 @@ template struct IndexedWriter { template Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) { - uint8_t FirstByte = (Kind << 1) | uint8_t{0x01}; + // The first bit in the first byte of metadata records is always set to 1, so + // we ensure this is the case when we write out the first byte of the record. + uint8_t FirstByte = (static_cast(Kind) << 1) | uint8_t{0x01u}; auto T = std::make_tuple(std::forward(std::move(Ds))...); // Write in field order. OS.write(FirstByte); @@ -112,7 +114,7 @@ Error FDRTraceWriter::visit(CustomEventRecordV5 &R) { } Error FDRTraceWriter::visit(TypedEventRecord &R) { - if (auto E = writeMetadata<7u>(OS, R.size(), R.delta(), R.eventType())) + if (auto E = writeMetadata<8u>(OS, R.size(), R.delta(), R.eventType())) return E; auto D = R.data(); ArrayRef Bytes(D.data(), D.size()); diff --git a/lib/XRay/InstrumentationMap.cpp b/lib/XRay/InstrumentationMap.cpp index 84317708d3f41aa422b3982fef68af52140ceebd..9f2b179486f07dd1b023947c50d7cb751a41c9d7 100644 --- a/lib/XRay/InstrumentationMap.cpp +++ b/lib/XRay/InstrumentationMap.cpp @@ -12,12 +12,14 @@ //===----------------------------------------------------------------------===// #include "llvm/XRay/InstrumentationMap.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Object/Binary.h" +#include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Error.h" @@ -46,6 +48,8 @@ Optional InstrumentationMap::getFunctionAddr(int32_t FuncId) const { return None; } +using RelocMap = DenseMap; + static Error loadObj(StringRef Filename, object::OwningBinary &ObjFile, InstrumentationMap::SledContainer &Sleds, @@ -79,6 +83,31 @@ loadObj(StringRef Filename, object::OwningBinary &ObjFile, return errorCodeToError( std::make_error_code(std::errc::executable_format_error)); + RelocMap Relocs; + if (ObjFile.getBinary()->isELF()) { + uint32_t RelativeRelocation = [](object::ObjectFile *ObjFile) { + if (const auto *ELFObj = dyn_cast(ObjFile)) + return ELFObj->getELFFile()->getRelativeRelocationType(); + else if (const auto *ELFObj = dyn_cast(ObjFile)) + return ELFObj->getELFFile()->getRelativeRelocationType(); + else if (const auto *ELFObj = dyn_cast(ObjFile)) + return ELFObj->getELFFile()->getRelativeRelocationType(); + else if (const auto *ELFObj = dyn_cast(ObjFile)) + return ELFObj->getELFFile()->getRelativeRelocationType(); + else + return static_cast(0); + }(ObjFile.getBinary()); + + for (const object::SectionRef &Section : Sections) { + for (const object::RelocationRef &Reloc : Section.relocations()) { + if (Reloc.getType() != RelativeRelocation) + continue; + if (auto AddendOrErr = object::ELFRelocationRef(Reloc).getAddend()) + Relocs.insert({Reloc.getOffset(), *AddendOrErr}); + } + } + } + // Copy the instrumentation map data into the Sleds data structure. auto C = Contents.bytes_begin(); static constexpr size_t ELF64SledEntrySize = 32; @@ -89,6 +118,16 @@ loadObj(StringRef Filename, object::OwningBinary &ObjFile, "an XRay sled entry in ELF64."), std::make_error_code(std::errc::executable_format_error)); + auto RelocateOrElse = [&](uint32_t Offset, uint64_t Address) { + if (!Address) { + uint64_t A = I->getAddress() + C - Contents.bytes_begin() + Offset; + RelocMap::const_iterator R = Relocs.find(A); + if (R != Relocs.end()) + return R->second; + } + return Address; + }; + int32_t FuncId = 1; uint64_t CurFn = 0; for (; C != Contents.bytes_end(); C += ELF64SledEntrySize) { @@ -98,8 +137,10 @@ loadObj(StringRef Filename, object::OwningBinary &ObjFile, Sleds.push_back({}); auto &Entry = Sleds.back(); uint32_t OffsetPtr = 0; - Entry.Address = Extractor.getU64(&OffsetPtr); - Entry.Function = Extractor.getU64(&OffsetPtr); + uint32_t AddrOff = OffsetPtr; + Entry.Address = RelocateOrElse(AddrOff, Extractor.getU64(&OffsetPtr)); + uint32_t FuncOff = OffsetPtr; + Entry.Function = RelocateOrElse(FuncOff, Extractor.getU64(&OffsetPtr)); auto Kind = Extractor.getU8(&OffsetPtr); static constexpr SledEntry::FunctionKinds Kinds[] = { SledEntry::FunctionKinds::ENTRY, SledEntry::FunctionKinds::EXIT, diff --git a/lib/XRay/RecordInitializer.cpp b/lib/XRay/RecordInitializer.cpp index cc9dd4609498c28e47c8a174d0bae0c249b3226e..f136a1e456b77381c53b765c3ed555aac54d38a9 100644 --- a/lib/XRay/RecordInitializer.cpp +++ b/lib/XRay/RecordInitializer.cpp @@ -111,6 +111,12 @@ Error RecordInitializer::visit(CustomEventRecord &R) { std::make_error_code(std::errc::invalid_argument), "Cannot read a custom event record size field offset %d.", OffsetPtr); + if (R.Size <= 0) + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid size for custom event (size = %d) at offset %d.", R.Size, + OffsetPtr); + PreReadOffset = OffsetPtr; R.TSC = E.getU64(&OffsetPtr); if (PreReadOffset == OffsetPtr) @@ -142,11 +148,21 @@ Error RecordInitializer::visit(CustomEventRecord &R) { std::vector Buffer; Buffer.resize(R.Size); + PreReadOffset = OffsetPtr; if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data()) return createStringError( std::make_error_code(std::errc::invalid_argument), "Failed reading data into buffer of size %d at offset %d.", R.Size, OffsetPtr); + + assert(OffsetPtr >= PreReadOffset); + if (OffsetPtr - PreReadOffset != static_cast(R.Size)) + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "Failed reading enough bytes for the custom event payload -- read %d " + "expecting %d bytes at offset %d.", + OffsetPtr - PreReadOffset, R.Size, PreReadOffset); + R.Data.assign(Buffer.begin(), Buffer.end()); return Error::success(); } @@ -167,6 +183,12 @@ Error RecordInitializer::visit(CustomEventRecordV5 &R) { std::make_error_code(std::errc::invalid_argument), "Cannot read a custom event record size field offset %d.", OffsetPtr); + if (R.Size <= 0) + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid size for custom event (size = %d) at offset %d.", R.Size, + OffsetPtr); + PreReadOffset = OffsetPtr; R.Delta = E.getSigned(&OffsetPtr, sizeof(int32_t)); if (PreReadOffset == OffsetPtr) @@ -188,11 +210,21 @@ Error RecordInitializer::visit(CustomEventRecordV5 &R) { std::vector Buffer; Buffer.resize(R.Size); + PreReadOffset = OffsetPtr; if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data()) return createStringError( std::make_error_code(std::errc::invalid_argument), "Failed reading data into buffer of size %d at offset %d.", R.Size, OffsetPtr); + + assert(OffsetPtr >= PreReadOffset); + if (OffsetPtr - PreReadOffset != static_cast(R.Size)) + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "Failed reading enough bytes for the custom event payload -- read %d " + "expecting %d bytes at offset %d.", + OffsetPtr - PreReadOffset, R.Size, PreReadOffset); + R.Data.assign(Buffer.begin(), Buffer.end()); return Error::success(); } @@ -213,6 +245,12 @@ Error RecordInitializer::visit(TypedEventRecord &R) { std::make_error_code(std::errc::invalid_argument), "Cannot read a typed event record size field offset %d.", OffsetPtr); + if (R.Size <= 0) + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid size for typed event (size = %d) at offset %d.", R.Size, + OffsetPtr); + PreReadOffset = OffsetPtr; R.Delta = E.getSigned(&OffsetPtr, sizeof(int32_t)); if (PreReadOffset == OffsetPtr) @@ -241,11 +279,21 @@ Error RecordInitializer::visit(TypedEventRecord &R) { std::vector Buffer; Buffer.resize(R.Size); + PreReadOffset = OffsetPtr; if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data()) return createStringError( std::make_error_code(std::errc::invalid_argument), "Failed reading data into buffer of size %d at offset %d.", R.Size, OffsetPtr); + + assert(OffsetPtr >= PreReadOffset); + if (OffsetPtr - PreReadOffset != static_cast(R.Size)) + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "Failed reading enough bytes for the typed event payload -- read %d " + "expecting %d bytes at offset %d.", + OffsetPtr - PreReadOffset, R.Size, PreReadOffset); + R.Data.assign(Buffer.begin(), Buffer.end()); return Error::success(); } @@ -337,7 +385,11 @@ Error RecordInitializer::visit(FunctionRecord &R) { return createStringError(std::make_error_code(std::errc::bad_address), "Cannot read function id field from offset %d.", OffsetPtr); - unsigned FunctionType = (Buffer >> 1) & 0x07; + + // To get the function record type, we shift the buffer one to the right + // (truncating the function record indicator) then take the three bits + // (0b0111) to get the record type as an unsigned value. + unsigned FunctionType = (Buffer >> 1) & 0x07u; switch (FunctionType) { case static_cast(RecordTypes::ENTER): case static_cast(RecordTypes::ENTER_ARG): diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt index 32617fd4ba62979a59379bfb18cc9c90022c1266..9afc30c8928e3cbd33fda03e6102cae3ea2e2eb7 100644 --- a/projects/CMakeLists.txt +++ b/projects/CMakeLists.txt @@ -13,7 +13,8 @@ foreach(entry ${entries}) (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/parallel-libs) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/openmp) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/debuginfo-tests)) - add_subdirectory(${entry}) + get_filename_component(entry_name "${entry}" NAME) + add_llvm_external_project(${entry_name}) endif() endif() endforeach(entry) diff --git a/test/Analysis/ConstantFolding/func-and-folding.ll b/test/Analysis/ConstantFolding/func-and-folding.ll new file mode 100644 index 0000000000000000000000000000000000000000..2dbbe673601d574b9f01b2768cde9f3d5dbe71c0 --- /dev/null +++ b/test/Analysis/ConstantFolding/func-and-folding.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -constprop -S -o - | FileCheck %s + +; Function Attrs: minsize norecurse nounwind optsize readnone +define void @foo1() #0 { +entry: + ret void +} + +; Function Attrs: minsize norecurse nounwind optsize readnone +define void @foo2() align 4 { +entry: + ret void +} + +; Function Attrs: minsize nounwind optsize +define i32 @main() local_unnamed_addr #1 { +entry: +; CHECK: ptrtoint + %call = tail call i32 bitcast (i32 (...)* @process to i32 (i32)*)(i32 and (i32 ptrtoint (void ()* @foo1 to i32), i32 2)) #3 +; CHECK-NEXT: ptrtoint + %call2 = tail call i32 bitcast (i32 (...)* @process to i32 (i32)*)(i32 and (i32 ptrtoint (void ()* @foo2 to i32), i32 2)) #3 + ret i32 0 +} + +; Function Attrs: minsize optsize +declare i32 @process(...) local_unnamed_addr #2 + diff --git a/test/Analysis/ConstantFolding/saturating-add-sub.ll b/test/Analysis/ConstantFolding/saturating-add-sub.ll new file mode 100644 index 0000000000000000000000000000000000000000..14c6a9fabfade7ad69911da348bfa6ca81221252 --- /dev/null +++ b/test/Analysis/ConstantFolding/saturating-add-sub.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -constprop -S | FileCheck %s + +declare void @dummy(i8) +declare void @dummy_vec(<2 x i8>) + +declare i8 @llvm.uadd.sat.i8(i8, i8) +declare i8 @llvm.sadd.sat.i8(i8, i8) +declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>) +declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i8 @llvm.usub.sat.i8(i8, i8) +declare i8 @llvm.ssub.sat.i8(i8, i8) +declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) +declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) + +define void @test_add_scalar() { +; CHECK-LABEL: @test_add_scalar( +; CHECK-NEXT: call void @dummy(i8 30) +; CHECK-NEXT: call void @dummy(i8 -1) +; CHECK-NEXT: call void @dummy(i8 -10) +; CHECK-NEXT: call void @dummy(i8 127) +; CHECK-NEXT: call void @dummy(i8 -128) +; CHECK-NEXT: ret void +; + %x1 = call i8 @llvm.uadd.sat.i8(i8 10, i8 20) + call void @dummy(i8 %x1) + %x2 = call i8 @llvm.uadd.sat.i8(i8 250, i8 100) + call void @dummy(i8 %x2) + + %y1 = call i8 @llvm.sadd.sat.i8(i8 10, i8 -20) + call void @dummy(i8 %y1) + %y2 = call i8 @llvm.sadd.sat.i8(i8 120, i8 10) + call void @dummy(i8 %y2) + %y3 = call i8 @llvm.sadd.sat.i8(i8 -120, i8 -10) + call void @dummy(i8 %y3) + + ret void +} + +define void @test_add_vector(<2 x i8> %a) { +; CHECK-LABEL: @test_add_vector( +; CHECK-NEXT: call void @dummy_vec(<2 x i8> ) +; CHECK-NEXT: call void @dummy_vec(<2 x i8> ) +; CHECK-NEXT: call void @dummy_vec(<2 x i8> ) +; CHECK-NEXT: call void @dummy_vec(<2 x i8> ) +; CHECK-NEXT: call void @dummy_vec(<2 x i8> ) +; CHECK-NEXT: ret void +; + %x1 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> , <2 x i8> ) + call void @dummy_vec(<2 x i8> %x1) + %x2 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> , <2 x i8> ) + call void @dummy_vec(<2 x i8> %x2) + + %y1 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> , <2 x i8> ) + call void @dummy_vec(<2 x i8> %y1) + %y2 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> , <2 x i8> ) + call void @dummy_vec(<2 x i8> %y2) + %y3 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> , <2 x i8> ) + call void @dummy_vec(<2 x i8> %y3) + + ret void +} + +define void @test_usub_ssub_scalar() { +; CHECK-LABEL: @test_usub_ssub_scalar( +; CHECK-NEXT: call void @dummy(i8 10) +; CHECK-NEXT: call void @dummy(i8 0) +; CHECK-NEXT: call void @dummy(i8 -30) +; CHECK-NEXT: call void @dummy(i8 127) +; CHECK-NEXT: call void @dummy(i8 -128) +; CHECK-NEXT: ret void +; + %x1 = call i8 @llvm.usub.sat.i8(i8 20, i8 10) + call void @dummy(i8 %x1) + %x2 = call i8 @llvm.usub.sat.i8(i8 200, i8 250) + call void @dummy(i8 %x2) + + %y1 = call i8 @llvm.ssub.sat.i8(i8 -10, i8 20) + call void @dummy(i8 %y1) + %y2 = call i8 @llvm.ssub.sat.i8(i8 120, i8 -10) + call void @dummy(i8 %y2) + %y3 = call i8 @llvm.ssub.sat.i8(i8 -120, i8 10) + call void @dummy(i8 %y3) + + ret void +} + +define void @test_sub_vector(<2 x i8> %a) { +; CHECK-LABEL: @test_sub_vector( +; CHECK-NEXT: call void @dummy_vec(<2 x i8> ) +; CHECK-NEXT: call void @dummy_vec(<2 x i8> zeroinitializer) +; CHECK-NEXT: call void @dummy_vec(<2 x i8> ) +; CHECK-NEXT: call void @dummy_vec(<2 x i8> ) +; CHECK-NEXT: call void @dummy_vec(<2 x i8> ) +; CHECK-NEXT: ret void +; + %x1 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> , <2 x i8> ) + call void @dummy_vec(<2 x i8> %x1) + %x2 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> , <2 x i8> ) + call void @dummy_vec(<2 x i8> %x2) + + %y1 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> , <2 x i8> ) + call void @dummy_vec(<2 x i8> %y1) + %y2 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> , <2 x i8> ) + call void @dummy_vec(<2 x i8> %y2) + %y3 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> , <2 x i8> ) + call void @dummy_vec(<2 x i8> %y3) + + ret void +} diff --git a/test/Analysis/CostModel/AArch64/vector-reduce.ll b/test/Analysis/CostModel/AArch64/vector-reduce.ll index c268a18e7f8c6d0c439569c976530cd788c67048..8cc6eea4ec4c13d67ecface8fb06d1fb6630f6c0 100644 --- a/test/Analysis/CostModel/AArch64/vector-reduce.ll +++ b/test/Analysis/CostModel/AArch64/vector-reduce.ll @@ -47,7 +47,7 @@ define i32 @add.i32.v4i32(<4 x i32> %v) { } ; COST-LABEL: umin.i8.v8i8 -; COST: Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> %v) ; CODE-LABEL: umin.i8.v8i8 ; CODE: uminv b0, v0.8b define i8 @umin.i8.v8i8(<8 x i8> %v) { @@ -56,7 +56,7 @@ define i8 @umin.i8.v8i8(<8 x i8> %v) { } ; COST-LABEL: umin.i8.v16i8 -; COST: Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %v) ; CODE-LABEL: umin.i8.v16i8 ; CODE: uminv b0, v0.16b define i8 @umin.i8.v16i8(<16 x i8> %v) { @@ -65,7 +65,7 @@ define i8 @umin.i8.v16i8(<16 x i8> %v) { } ; COST-LABEL: umin.i16.v4i16 -; COST: Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> %v) ; CODE-LABEL: umin.i16.v4i16 ; CODE: uminv h0, v0.4h define i16 @umin.i16.v4i16(<4 x i16> %v) { @@ -74,7 +74,7 @@ define i16 @umin.i16.v4i16(<4 x i16> %v) { } ; COST-LABEL: umin.i16.v8i16 -; COST: Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %v) ; CODE-LABEL: umin.i16.v8i16 ; CODE: uminv h0, v0.8h define i16 @umin.i16.v8i16(<8 x i16> %v) { @@ -83,7 +83,7 @@ define i16 @umin.i16.v8i16(<8 x i16> %v) { } ; COST-LABEL: umin.i32.v4i32 -; COST: Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %v) ; CODE-LABEL: umin.i32.v4i32 ; CODE: uminv s0, v0.4s define i32 @umin.i32.v4i32(<4 x i32> %v) { @@ -92,7 +92,7 @@ define i32 @umin.i32.v4i32(<4 x i32> %v) { } ; COST-LABEL: umax.i8.v8i8 -; COST: Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> %v) ; CODE-LABEL: umax.i8.v8i8 ; CODE: umaxv b0, v0.8b define i8 @umax.i8.v8i8(<8 x i8> %v) { @@ -101,7 +101,7 @@ define i8 @umax.i8.v8i8(<8 x i8> %v) { } ; COST-LABEL: umax.i8.v16i8 -; COST: Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %v) ; CODE-LABEL: umax.i8.v16i8 ; CODE: umaxv b0, v0.16b define i8 @umax.i8.v16i8(<16 x i8> %v) { @@ -110,7 +110,7 @@ define i8 @umax.i8.v16i8(<16 x i8> %v) { } ; COST-LABEL: umax.i16.v4i16 -; COST: Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> %v) ; CODE-LABEL: umax.i16.v4i16 ; CODE: umaxv h0, v0.4h define i16 @umax.i16.v4i16(<4 x i16> %v) { @@ -119,7 +119,7 @@ define i16 @umax.i16.v4i16(<4 x i16> %v) { } ; COST-LABEL: umax.i16.v8i16 -; COST: Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %v) ; CODE-LABEL: umax.i16.v8i16 ; CODE: umaxv h0, v0.8h define i16 @umax.i16.v8i16(<8 x i16> %v) { @@ -128,7 +128,7 @@ define i16 @umax.i16.v8i16(<8 x i16> %v) { } ; COST-LABEL: umax.i32.v4i32 -; COST: Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %v) ; CODE-LABEL: umax.i32.v4i32 ; CODE: umaxv s0, v0.4s define i32 @umax.i32.v4i32(<4 x i32> %v) { @@ -137,7 +137,7 @@ define i32 @umax.i32.v4i32(<4 x i32> %v) { } ; COST-LABEL: smin.i8.v8i8 -; COST: Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> %v) ; CODE-LABEL: smin.i8.v8i8 ; CODE: sminv b0, v0.8b define i8 @smin.i8.v8i8(<8 x i8> %v) { @@ -146,7 +146,7 @@ define i8 @smin.i8.v8i8(<8 x i8> %v) { } ; COST-LABEL: smin.i8.v16i8 -; COST: Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %v) ; CODE-LABEL: smin.i8.v16i8 ; CODE: sminv b0, v0.16b define i8 @smin.i8.v16i8(<16 x i8> %v) { @@ -155,7 +155,7 @@ define i8 @smin.i8.v16i8(<16 x i8> %v) { } ; COST-LABEL: smin.i16.v4i16 -; COST: Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> %v) ; CODE-LABEL: smin.i16.v4i16 ; CODE: sminv h0, v0.4h define i16 @smin.i16.v4i16(<4 x i16> %v) { @@ -164,7 +164,7 @@ define i16 @smin.i16.v4i16(<4 x i16> %v) { } ; COST-LABEL: smin.i16.v8i16 -; COST: Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %v) ; CODE-LABEL: smin.i16.v8i16 ; CODE: sminv h0, v0.8h define i16 @smin.i16.v8i16(<8 x i16> %v) { @@ -173,7 +173,7 @@ define i16 @smin.i16.v8i16(<8 x i16> %v) { } ; COST-LABEL: smin.i32.v4i32 -; COST: Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %v) ; CODE-LABEL: smin.i32.v4i32 ; CODE: sminv s0, v0.4s define i32 @smin.i32.v4i32(<4 x i32> %v) { @@ -182,7 +182,7 @@ define i32 @smin.i32.v4i32(<4 x i32> %v) { } ; COST-LABEL: smax.i8.v8i8 -; COST: Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> %v) ; CODE-LABEL: smax.i8.v8i8 ; CODE: smaxv b0, v0.8b define i8 @smax.i8.v8i8(<8 x i8> %v) { @@ -191,7 +191,7 @@ define i8 @smax.i8.v8i8(<8 x i8> %v) { } ; COST-LABEL: smax.i8.v16i8 -; COST: Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %v) ; CODE-LABEL: smax.i8.v16i8 ; CODE: smaxv b0, v0.16b define i8 @smax.i8.v16i8(<16 x i8> %v) { @@ -200,7 +200,7 @@ define i8 @smax.i8.v16i8(<16 x i8> %v) { } ; COST-LABEL: smax.i16.v4i16 -; COST: Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> %v) ; CODE-LABEL: smax.i16.v4i16 ; CODE: smaxv h0, v0.4h define i16 @smax.i16.v4i16(<4 x i16> %v) { @@ -209,7 +209,7 @@ define i16 @smax.i16.v4i16(<4 x i16> %v) { } ; COST-LABEL: smax.i16.v8i16 -; COST: Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %v) ; CODE-LABEL: smax.i16.v8i16 ; CODE: smaxv h0, v0.8h define i16 @smax.i16.v8i16(<8 x i16> %v) { @@ -218,7 +218,7 @@ define i16 @smax.i16.v8i16(<8 x i16> %v) { } ; COST-LABEL: smax.i32.v4i32 -; COST: Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %v) ; CODE-LABEL: smax.i32.v4i32 ; CODE: smaxv s0, v0.4s define i32 @smax.i32.v4i32(<4 x i32> %v) { @@ -227,7 +227,7 @@ define i32 @smax.i32.v4i32(<4 x i32> %v) { } ; COST-LABEL: fmin.f32.v4f32 -; COST: Found an estimated cost of 62 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %v) ; CODE-LABEL: fmin.f32.v4f32 ; CODE: fminnmv s0, v0.4s define float @fmin.f32.v4f32(<4 x float> %v) { @@ -236,7 +236,7 @@ define float @fmin.f32.v4f32(<4 x float> %v) { } ; COST-LABEL: fmax.f32.v4f32 -; COST: Found an estimated cost of 62 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %v) ; CODE-LABEL: fmax.f32.v4f32 ; CODE: fmaxnmv s0, v0.4s define float @fmax.f32.v4f32(<4 x float> %v) { diff --git a/test/Analysis/CostModel/SystemZ/cmp-mem.ll b/test/Analysis/CostModel/SystemZ/cmp-mem.ll new file mode 100644 index 0000000000000000000000000000000000000000..4f92d5b191a92b971d22f1d096a2570e6604ee9a --- /dev/null +++ b/test/Analysis/CostModel/SystemZ/cmp-mem.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s +; +; Test costs for i8 and i16 comparisons against memory with a small immediate. + +define i32 @fun0(i8* %Src, i8* %Dst, i8 %Val) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'fun0': +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Ld = load i8, i8* %Src +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Cmp = icmp eq i8 %Ld, 123 +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %Ret = zext i1 %Cmp to i32 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %Ret + %Ld = load i8, i8* %Src + %Cmp = icmp eq i8 %Ld, 123 + %Ret = zext i1 %Cmp to i32 + ret i32 %Ret +} + +define i32 @fun1(i16* %Src, i16* %Dst, i16 %Val) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'fun1': +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Ld = load i16, i16* %Src +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Cmp = icmp eq i16 +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %Ret = zext i1 %Cmp to i32 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %Ret + %Ld = load i16, i16* %Src + %Cmp = icmp eq i16 %Ld, 1234 + %Ret = zext i1 %Cmp to i32 + ret i32 %Ret +} diff --git a/test/Analysis/CostModel/SystemZ/divrem-reg.ll b/test/Analysis/CostModel/SystemZ/divrem-reg.ll index 0cb1293cf3bf75d1c28274f9cd4ace36ed15268c..669d3e9064dd5b8fc882053e3661bef86306fee6 100644 --- a/test/Analysis/CostModel/SystemZ/divrem-reg.ll +++ b/test/Analysis/CostModel/SystemZ/divrem-reg.ll @@ -16,19 +16,19 @@ define i64 @fun0(i64 %a, i64 %b) { define i32 @fun1(i32 %a, i32 %b) { %r = sdiv i32 %a, %b ret i32 %r -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %r = sdiv i32 %a, %b +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv i32 %a, %b } define i16 @fun2(i16 %a, i16 %b) { %r = sdiv i16 %a, %b ret i16 %r -; CHECK: Cost Model: Found an estimated cost of 23 for instruction: %r = sdiv i16 %a, %b +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv i16 %a, %b } define i8 @fun3(i8 %a, i8 %b) { %r = sdiv i8 %a, %b ret i8 %r -; CHECK: Cost Model: Found an estimated cost of 23 for instruction: %r = sdiv i8 %a, %b +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = sdiv i8 %a, %b } ; Vector sdiv @@ -42,13 +42,13 @@ define <2 x i64> @fun4(<2 x i64> %a, <2 x i64> %b) { define <4 x i32> @fun5(<4 x i32> %a, <4 x i32> %b) { %r = sdiv <4 x i32> %a, %b ret <4 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 98 for instruction: %r = sdiv <4 x i32> +; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = sdiv <4 x i32> } define <2 x i32> @fun6(<2 x i32> %a, <2 x i32> %b) { %r = sdiv <2 x i32> %a, %b ret <2 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 50 for instruction: %r = sdiv <2 x i32> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %r = sdiv <2 x i32> } define <8 x i16> @fun7(<8 x i16> %a, <8 x i16> %b) { @@ -60,7 +60,7 @@ define <8 x i16> @fun7(<8 x i16> %a, <8 x i16> %b) { define <4 x i16> @fun8(<4 x i16> %a, <4 x i16> %b) { %r = sdiv <4 x i16> %a, %b ret <4 x i16> %r -; CHECK: Cost Model: Found an estimated cost of 106 for instruction: %r = sdiv <4 x i16> +; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = sdiv <4 x i16> } define <16 x i8> @fun9(<16 x i8> %a, <16 x i8> %b) { @@ -80,25 +80,25 @@ define <8 x i8> @fun10(<8 x i8> %a, <8 x i8> %b) { define i64 @fun11(i64 %a, i64 %b) { %r = udiv i64 %a, %b ret i64 %r -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %r = udiv i64 %a, %b +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i64 %a, %b } define i32 @fun12(i32 %a, i32 %b) { %r = udiv i32 %a, %b ret i32 %r -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %r = udiv i32 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i32 } define i16 @fun13(i16 %a, i16 %b) { %r = udiv i16 %a, %b ret i16 %r -; CHECK: Cost Model: Found an estimated cost of 23 for instruction: %r = udiv i16 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i16 } define i8 @fun14(i8 %a, i8 %b) { %r = udiv i8 %a, %b ret i8 %r -; CHECK: Cost Model: Found an estimated cost of 23 for instruction: %r = udiv i8 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = udiv i8 } ; Vector udiv @@ -106,19 +106,19 @@ define i8 @fun14(i8 %a, i8 %b) { define <2 x i64> @fun15(<2 x i64> %a, <2 x i64> %b) { %r = udiv <2 x i64> %a, %b ret <2 x i64> %r -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %r = udiv <2 x i64> +; CHECK: Cost Model: Found an estimated cost of 47 for instruction: %r = udiv <2 x i64> } define <4 x i32> @fun16(<4 x i32> %a, <4 x i32> %b) { %r = udiv <4 x i32> %a, %b ret <4 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 98 for instruction: %r = udiv <4 x i32> +; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = udiv <4 x i32> } define <2 x i32> @fun17(<2 x i32> %a, <2 x i32> %b) { %r = udiv <2 x i32> %a, %b ret <2 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 50 for instruction: %r = udiv <2 x i32> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %r = udiv <2 x i32> } define <8 x i16> @fun18(<8 x i16> %a, <8 x i16> %b) { @@ -130,7 +130,7 @@ define <8 x i16> @fun18(<8 x i16> %a, <8 x i16> %b) { define <4 x i16> @fun19(<4 x i16> %a, <4 x i16> %b) { %r = udiv <4 x i16> %a, %b ret <4 x i16> %r -; CHECK: Cost Model: Found an estimated cost of 106 for instruction: %r = udiv <4 x i16> +; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = udiv <4 x i16> } define <16 x i8> @fun20(<16 x i8> %a, <16 x i8> %b) { @@ -156,19 +156,19 @@ define i64 @fun22(i64 %a, i64 %b) { define i32 @fun23(i32 %a, i32 %b) { %r = srem i32 %a, %b ret i32 %r -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %r = srem i32 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = srem i32 } define i16 @fun24(i16 %a, i16 %b) { %r = srem i16 %a, %b ret i16 %r -; CHECK: Cost Model: Found an estimated cost of 23 for instruction: %r = srem i16 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = srem i16 } define i8 @fun25(i8 %a, i8 %b) { %r = srem i8 %a, %b ret i8 %r -; CHECK: Cost Model: Found an estimated cost of 23 for instruction: %r = srem i8 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = srem i8 } ; Vector srem @@ -182,13 +182,13 @@ define <2 x i64> @fun26(<2 x i64> %a, <2 x i64> %b) { define <4 x i32> @fun27(<4 x i32> %a, <4 x i32> %b) { %r = srem <4 x i32> %a, %b ret <4 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 98 for instruction: %r = srem <4 x i32> +; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = srem <4 x i32> } define <2 x i32> @fun28(<2 x i32> %a, <2 x i32> %b) { %r = srem <2 x i32> %a, %b ret <2 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 50 for instruction: %r = srem <2 x i32> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %r = srem <2 x i32> } define <8 x i16> @fun29(<8 x i16> %a, <8 x i16> %b) { @@ -200,7 +200,7 @@ define <8 x i16> @fun29(<8 x i16> %a, <8 x i16> %b) { define <4 x i16> @fun30(<4 x i16> %a, <4 x i16> %b) { %r = srem <4 x i16> %a, %b ret <4 x i16> %r -; CHECK: Cost Model: Found an estimated cost of 106 for instruction: %r = srem <4 x i16> +; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = srem <4 x i16> } define <16 x i8> @fun31(<16 x i8> %a, <16 x i8> %b) { @@ -220,25 +220,25 @@ define <8 x i8> @fun32(<8 x i8> %a, <8 x i8> %b) { define i64 @fun33(i64 %a, i64 %b) { %r = urem i64 %a, %b ret i64 %r -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %r = urem i64 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = urem i64 } define i32 @fun34(i32 %a, i32 %b) { %r = urem i32 %a, %b ret i32 %r -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %r = urem i32 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = urem i32 } define i16 @fun35(i16 %a, i16 %b) { %r = urem i16 %a, %b ret i16 %r -; CHECK: Cost Model: Found an estimated cost of 23 for instruction: %r = urem i16 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = urem i16 } define i8 @fun36(i8 %a, i8 %b) { %r = urem i8 %a, %b ret i8 %r -; CHECK: Cost Model: Found an estimated cost of 23 for instruction: %r = urem i8 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %r = urem i8 } ; Vector urem @@ -246,19 +246,19 @@ define i8 @fun36(i8 %a, i8 %b) { define <2 x i64> @fun37(<2 x i64> %a, <2 x i64> %b) { %r = urem <2 x i64> %a, %b ret <2 x i64> %r -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %r = urem <2 x i64> +; CHECK: Cost Model: Found an estimated cost of 47 for instruction: %r = urem <2 x i64> } define <4 x i32> @fun38(<4 x i32> %a, <4 x i32> %b) { %r = urem <4 x i32> %a, %b ret <4 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 98 for instruction: %r = urem <4 x i32> +; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = urem <4 x i32> } define <2 x i32> @fun39(<2 x i32> %a, <2 x i32> %b) { %r = urem <2 x i32> %a, %b ret <2 x i32> %r -; CHECK: Cost Model: Found an estimated cost of 50 for instruction: %r = urem <2 x i32> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %r = urem <2 x i32> } define <8 x i16> @fun40(<8 x i16> %a, <8 x i16> %b) { @@ -270,7 +270,7 @@ define <8 x i16> @fun40(<8 x i16> %a, <8 x i16> %b) { define <4 x i16> @fun41(<4 x i16> %a, <4 x i16> %b) { %r = urem <4 x i16> %a, %b ret <4 x i16> %r -; CHECK: Cost Model: Found an estimated cost of 106 for instruction: %r = urem <4 x i16> +; CHECK: Cost Model: Found an estimated cost of 94 for instruction: %r = urem <4 x i16> } define <16 x i8> @fun42(<16 x i8> %a, <16 x i8> %b) { diff --git a/test/Analysis/CostModel/SystemZ/fp-cast.ll b/test/Analysis/CostModel/SystemZ/fp-cast.ll index 20feefb8025946412f37a1648adf8d55f76dd92f..cbad825e486469d75048aa757132fca9bec9c05a 100644 --- a/test/Analysis/CostModel/SystemZ/fp-cast.ll +++ b/test/Analysis/CostModel/SystemZ/fp-cast.ll @@ -110,45 +110,45 @@ define void @fptosi() { ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v14 = fptosi <2 x fp128> undef to <2 x i16> ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v15 = fptosi <2 x fp128> undef to <2 x i8> ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %v16 = fptosi <2 x double> undef to <2 x i64> -; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v17 = fptosi <2 x double> undef to <2 x i32> -; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v18 = fptosi <2 x double> undef to <2 x i16> -; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v19 = fptosi <2 x double> undef to <2 x i8> -; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v20 = fptosi <2 x float> undef to <2 x i64> -; CHECK: Cost Model: Found an estimated cost of 14 for instruction: %v21 = fptosi <2 x float> undef to <2 x i32> -; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v22 = fptosi <2 x float> undef to <2 x i16> -; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v23 = fptosi <2 x float> undef to <2 x i8> +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v17 = fptosi <2 x double> undef to <2 x i32> +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v18 = fptosi <2 x double> undef to <2 x i16> +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v19 = fptosi <2 x double> undef to <2 x i8> +; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %v20 = fptosi <2 x float> undef to <2 x i64> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v21 = fptosi <2 x float> undef to <2 x i32> +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v22 = fptosi <2 x float> undef to <2 x i16> +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v23 = fptosi <2 x float> undef to <2 x i8> ; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v24 = fptosi <4 x fp128> undef to <4 x i64> ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v25 = fptosi <4 x fp128> undef to <4 x i32> ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v26 = fptosi <4 x fp128> undef to <4 x i16> ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v27 = fptosi <4 x fp128> undef to <4 x i8> ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %v28 = fptosi <4 x double> undef to <4 x i64> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v29 = fptosi <4 x double> undef to <4 x i32> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v30 = fptosi <4 x double> undef to <4 x i16> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v31 = fptosi <4 x double> undef to <4 x i8> -; CHECK: Cost Model: Found an estimated cost of 11 for instruction: %v32 = fptosi <4 x float> undef to <4 x i64> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v33 = fptosi <4 x float> undef to <4 x i32> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v34 = fptosi <4 x float> undef to <4 x i16> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v35 = fptosi <4 x float> undef to <4 x i8> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v29 = fptosi <4 x double> undef to <4 x i32> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v30 = fptosi <4 x double> undef to <4 x i16> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v31 = fptosi <4 x double> undef to <4 x i8> +; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v32 = fptosi <4 x float> undef to <4 x i64> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v33 = fptosi <4 x float> undef to <4 x i32> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v34 = fptosi <4 x float> undef to <4 x i16> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v35 = fptosi <4 x float> undef to <4 x i8> ; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v36 = fptosi <8 x fp128> undef to <8 x i64> ; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v37 = fptosi <8 x fp128> undef to <8 x i32> ; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v38 = fptosi <8 x fp128> undef to <8 x i16> ; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v39 = fptosi <8 x fp128> undef to <8 x i8> ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v40 = fptosi <8 x double> undef to <8 x i64> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v41 = fptosi <8 x double> undef to <8 x i32> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v42 = fptosi <8 x double> undef to <8 x i16> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v43 = fptosi <8 x double> undef to <8 x i8> -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %v44 = fptosi <8 x float> undef to <8 x i64> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v45 = fptosi <8 x float> undef to <8 x i32> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v46 = fptosi <8 x float> undef to <8 x i16> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v47 = fptosi <8 x float> undef to <8 x i8> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v41 = fptosi <8 x double> undef to <8 x i32> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v42 = fptosi <8 x double> undef to <8 x i16> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v43 = fptosi <8 x double> undef to <8 x i8> +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %v44 = fptosi <8 x float> undef to <8 x i64> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v45 = fptosi <8 x float> undef to <8 x i32> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v46 = fptosi <8 x float> undef to <8 x i16> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v47 = fptosi <8 x float> undef to <8 x i8> ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v48 = fptosi <16 x double> undef to <16 x i64> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v49 = fptosi <16 x double> undef to <16 x i32> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v50 = fptosi <16 x double> undef to <16 x i16> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v51 = fptosi <16 x double> undef to <16 x i8> -; CHECK: Cost Model: Found an estimated cost of 41 for instruction: %v52 = fptosi <16 x float> undef to <16 x i64> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v53 = fptosi <16 x float> undef to <16 x i32> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v54 = fptosi <16 x float> undef to <16 x i16> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v55 = fptosi <16 x float> undef to <16 x i8> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v49 = fptosi <16 x double> undef to <16 x i32> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v50 = fptosi <16 x double> undef to <16 x i16> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v51 = fptosi <16 x double> undef to <16 x i8> +; CHECK: Cost Model: Found an estimated cost of 40 for instruction: %v52 = fptosi <16 x float> undef to <16 x i64> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v53 = fptosi <16 x float> undef to <16 x i32> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v54 = fptosi <16 x float> undef to <16 x i16> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v55 = fptosi <16 x float> undef to <16 x i8> ret void; } @@ -229,45 +229,45 @@ define void @fptoui() { ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v14 = fptoui <2 x fp128> undef to <2 x i16> ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v15 = fptoui <2 x fp128> undef to <2 x i8> ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %v16 = fptoui <2 x double> undef to <2 x i64> -; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v17 = fptoui <2 x double> undef to <2 x i32> -; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v18 = fptoui <2 x double> undef to <2 x i16> -; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v19 = fptoui <2 x double> undef to <2 x i8> -; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v20 = fptoui <2 x float> undef to <2 x i64> -; CHECK: Cost Model: Found an estimated cost of 14 for instruction: %v21 = fptoui <2 x float> undef to <2 x i32> -; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v22 = fptoui <2 x float> undef to <2 x i16> -; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v23 = fptoui <2 x float> undef to <2 x i8> +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v17 = fptoui <2 x double> undef to <2 x i32> +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v18 = fptoui <2 x double> undef to <2 x i16> +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v19 = fptoui <2 x double> undef to <2 x i8> +; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %v20 = fptoui <2 x float> undef to <2 x i64> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v21 = fptoui <2 x float> undef to <2 x i32> +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v22 = fptoui <2 x float> undef to <2 x i16> +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v23 = fptoui <2 x float> undef to <2 x i8> ; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v24 = fptoui <4 x fp128> undef to <4 x i64> ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v25 = fptoui <4 x fp128> undef to <4 x i32> ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v26 = fptoui <4 x fp128> undef to <4 x i16> ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v27 = fptoui <4 x fp128> undef to <4 x i8> ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %v28 = fptoui <4 x double> undef to <4 x i64> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v29 = fptoui <4 x double> undef to <4 x i32> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v30 = fptoui <4 x double> undef to <4 x i16> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v31 = fptoui <4 x double> undef to <4 x i8> -; CHECK: Cost Model: Found an estimated cost of 11 for instruction: %v32 = fptoui <4 x float> undef to <4 x i64> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v33 = fptoui <4 x float> undef to <4 x i32> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v34 = fptoui <4 x float> undef to <4 x i16> -; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v35 = fptoui <4 x float> undef to <4 x i8> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v29 = fptoui <4 x double> undef to <4 x i32> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v30 = fptoui <4 x double> undef to <4 x i16> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v31 = fptoui <4 x double> undef to <4 x i8> +; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v32 = fptoui <4 x float> undef to <4 x i64> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v33 = fptoui <4 x float> undef to <4 x i32> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v34 = fptoui <4 x float> undef to <4 x i16> +; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v35 = fptoui <4 x float> undef to <4 x i8> ; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v36 = fptoui <8 x fp128> undef to <8 x i64> ; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v37 = fptoui <8 x fp128> undef to <8 x i32> ; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v38 = fptoui <8 x fp128> undef to <8 x i16> ; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v39 = fptoui <8 x fp128> undef to <8 x i8> ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v40 = fptoui <8 x double> undef to <8 x i64> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v41 = fptoui <8 x double> undef to <8 x i32> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v42 = fptoui <8 x double> undef to <8 x i16> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v43 = fptoui <8 x double> undef to <8 x i8> -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %v44 = fptoui <8 x float> undef to <8 x i64> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v45 = fptoui <8 x float> undef to <8 x i32> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v46 = fptoui <8 x float> undef to <8 x i16> -; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v47 = fptoui <8 x float> undef to <8 x i8> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v41 = fptoui <8 x double> undef to <8 x i32> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v42 = fptoui <8 x double> undef to <8 x i16> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v43 = fptoui <8 x double> undef to <8 x i8> +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %v44 = fptoui <8 x float> undef to <8 x i64> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v45 = fptoui <8 x float> undef to <8 x i32> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v46 = fptoui <8 x float> undef to <8 x i16> +; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v47 = fptoui <8 x float> undef to <8 x i8> ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v48 = fptoui <16 x double> undef to <16 x i64> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v49 = fptoui <16 x double> undef to <16 x i32> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v50 = fptoui <16 x double> undef to <16 x i16> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v51 = fptoui <16 x double> undef to <16 x i8> -; CHECK: Cost Model: Found an estimated cost of 41 for instruction: %v52 = fptoui <16 x float> undef to <16 x i64> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v53 = fptoui <16 x float> undef to <16 x i32> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v54 = fptoui <16 x float> undef to <16 x i16> -; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v55 = fptoui <16 x float> undef to <16 x i8> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v49 = fptoui <16 x double> undef to <16 x i32> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v50 = fptoui <16 x double> undef to <16 x i16> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v51 = fptoui <16 x double> undef to <16 x i8> +; CHECK: Cost Model: Found an estimated cost of 40 for instruction: %v52 = fptoui <16 x float> undef to <16 x i64> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v53 = fptoui <16 x float> undef to <16 x i32> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v54 = fptoui <16 x float> undef to <16 x i16> +; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v55 = fptoui <16 x float> undef to <16 x i8> ret void; } @@ -374,50 +374,50 @@ define void @sitofp() { ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %v9 = sitofp i8 undef to fp128 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %v10 = sitofp i8 undef to double ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %v11 = sitofp i8 undef to float -; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v12 = sitofp <2 x i64> undef to <2 x fp128> +; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %v12 = sitofp <2 x i64> undef to <2 x fp128> ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %v13 = sitofp <2 x i64> undef to <2 x double> -; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v14 = sitofp <2 x i64> undef to <2 x float> -; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v15 = sitofp <2 x i32> undef to <2 x fp128> -; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v16 = sitofp <2 x i32> undef to <2 x double> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v17 = sitofp <2 x i32> undef to <2 x float> -; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v18 = sitofp <2 x i16> undef to <2 x fp128> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v19 = sitofp <2 x i16> undef to <2 x double> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v20 = sitofp <2 x i16> undef to <2 x float> -; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v21 = sitofp <2 x i8> undef to <2 x fp128> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v22 = sitofp <2 x i8> undef to <2 x double> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v23 = sitofp <2 x i8> undef to <2 x float> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v24 = sitofp <4 x i64> undef to <4 x fp128> +; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v14 = sitofp <2 x i64> undef to <2 x float> +; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %v15 = sitofp <2 x i32> undef to <2 x fp128> +; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v16 = sitofp <2 x i32> undef to <2 x double> +; CHECK: Cost Model: Found an estimated cost of 14 for instruction: %v17 = sitofp <2 x i32> undef to <2 x float> +; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v18 = sitofp <2 x i16> undef to <2 x fp128> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v19 = sitofp <2 x i16> undef to <2 x double> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v20 = sitofp <2 x i16> undef to <2 x float> +; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v21 = sitofp <2 x i8> undef to <2 x fp128> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v22 = sitofp <2 x i8> undef to <2 x double> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v23 = sitofp <2 x i8> undef to <2 x float> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v24 = sitofp <4 x i64> undef to <4 x fp128> ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %v25 = sitofp <4 x i64> undef to <4 x double> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v26 = sitofp <4 x i64> undef to <4 x float> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v27 = sitofp <4 x i32> undef to <4 x fp128> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v28 = sitofp <4 x i32> undef to <4 x double> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v29 = sitofp <4 x i32> undef to <4 x float> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v30 = sitofp <4 x i16> undef to <4 x fp128> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v31 = sitofp <4 x i16> undef to <4 x double> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v32 = sitofp <4 x i16> undef to <4 x float> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v33 = sitofp <4 x i8> undef to <4 x fp128> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v34 = sitofp <4 x i8> undef to <4 x double> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v35 = sitofp <4 x i8> undef to <4 x float> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v36 = sitofp <8 x i64> undef to <8 x fp128> +; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v26 = sitofp <4 x i64> undef to <4 x float> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v27 = sitofp <4 x i32> undef to <4 x fp128> +; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v28 = sitofp <4 x i32> undef to <4 x double> +; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v29 = sitofp <4 x i32> undef to <4 x float> +; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v30 = sitofp <4 x i16> undef to <4 x fp128> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v31 = sitofp <4 x i16> undef to <4 x double> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v32 = sitofp <4 x i16> undef to <4 x float> +; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v33 = sitofp <4 x i8> undef to <4 x fp128> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v34 = sitofp <4 x i8> undef to <4 x double> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v35 = sitofp <4 x i8> undef to <4 x float> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v36 = sitofp <8 x i64> undef to <8 x fp128> ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v37 = sitofp <8 x i64> undef to <8 x double> -; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v38 = sitofp <8 x i64> undef to <8 x float> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v39 = sitofp <8 x i32> undef to <8 x fp128> -; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v40 = sitofp <8 x i32> undef to <8 x double> -; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v41 = sitofp <8 x i32> undef to <8 x float> -; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v42 = sitofp <8 x i16> undef to <8 x fp128> -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %v43 = sitofp <8 x i16> undef to <8 x double> -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %v44 = sitofp <8 x i16> undef to <8 x float> -; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v45 = sitofp <8 x i8> undef to <8 x fp128> -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %v46 = sitofp <8 x i8> undef to <8 x double> -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %v47 = sitofp <8 x i8> undef to <8 x float> +; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v38 = sitofp <8 x i64> undef to <8 x float> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v39 = sitofp <8 x i32> undef to <8 x fp128> +; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v40 = sitofp <8 x i32> undef to <8 x double> +; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v41 = sitofp <8 x i32> undef to <8 x float> +; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v42 = sitofp <8 x i16> undef to <8 x fp128> +; CHECK: Cost Model: Found an estimated cost of 33 for instruction: %v43 = sitofp <8 x i16> undef to <8 x double> +; CHECK: Cost Model: Found an estimated cost of 33 for instruction: %v44 = sitofp <8 x i16> undef to <8 x float> +; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v45 = sitofp <8 x i8> undef to <8 x fp128> +; CHECK: Cost Model: Found an estimated cost of 33 for instruction: %v46 = sitofp <8 x i8> undef to <8 x double> +; CHECK: Cost Model: Found an estimated cost of 33 for instruction: %v47 = sitofp <8 x i8> undef to <8 x float> ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v48 = sitofp <16 x i64> undef to <16 x double> -; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v49 = sitofp <16 x i64> undef to <16 x float> -; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v50 = sitofp <16 x i32> undef to <16 x double> -; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v51 = sitofp <16 x i32> undef to <16 x float> -; CHECK: Cost Model: Found an estimated cost of 64 for instruction: %v52 = sitofp <16 x i16> undef to <16 x double> -; CHECK: Cost Model: Found an estimated cost of 64 for instruction: %v53 = sitofp <16 x i16> undef to <16 x float> -; CHECK: Cost Model: Found an estimated cost of 64 for instruction: %v54 = sitofp <16 x i8> undef to <16 x double> -; CHECK: Cost Model: Found an estimated cost of 64 for instruction: %v55 = sitofp <16 x i8> undef to <16 x float> +; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v49 = sitofp <16 x i64> undef to <16 x float> +; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v50 = sitofp <16 x i32> undef to <16 x double> +; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v51 = sitofp <16 x i32> undef to <16 x float> +; CHECK: Cost Model: Found an estimated cost of 65 for instruction: %v52 = sitofp <16 x i16> undef to <16 x double> +; CHECK: Cost Model: Found an estimated cost of 65 for instruction: %v53 = sitofp <16 x i16> undef to <16 x float> +; CHECK: Cost Model: Found an estimated cost of 65 for instruction: %v54 = sitofp <16 x i8> undef to <16 x double> +; CHECK: Cost Model: Found an estimated cost of 65 for instruction: %v55 = sitofp <16 x i8> undef to <16 x float> ret void; } @@ -492,50 +492,50 @@ define void @uitofp() { ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %v9 = uitofp i8 undef to fp128 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %v10 = uitofp i8 undef to double ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %v11 = uitofp i8 undef to float -; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v12 = uitofp <2 x i64> undef to <2 x fp128> +; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %v12 = uitofp <2 x i64> undef to <2 x fp128> ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %v13 = uitofp <2 x i64> undef to <2 x double> -; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v14 = uitofp <2 x i64> undef to <2 x float> -; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v15 = uitofp <2 x i32> undef to <2 x fp128> -; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v16 = uitofp <2 x i32> undef to <2 x double> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v17 = uitofp <2 x i32> undef to <2 x float> -; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v18 = uitofp <2 x i16> undef to <2 x fp128> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v19 = uitofp <2 x i16> undef to <2 x double> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v20 = uitofp <2 x i16> undef to <2 x float> -; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %v21 = uitofp <2 x i8> undef to <2 x fp128> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v22 = uitofp <2 x i8> undef to <2 x double> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v23 = uitofp <2 x i8> undef to <2 x float> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v24 = uitofp <4 x i64> undef to <4 x fp128> +; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v14 = uitofp <2 x i64> undef to <2 x float> +; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %v15 = uitofp <2 x i32> undef to <2 x fp128> +; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v16 = uitofp <2 x i32> undef to <2 x double> +; CHECK: Cost Model: Found an estimated cost of 14 for instruction: %v17 = uitofp <2 x i32> undef to <2 x float> +; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v18 = uitofp <2 x i16> undef to <2 x fp128> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v19 = uitofp <2 x i16> undef to <2 x double> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v20 = uitofp <2 x i16> undef to <2 x float> +; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %v21 = uitofp <2 x i8> undef to <2 x fp128> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v22 = uitofp <2 x i8> undef to <2 x double> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v23 = uitofp <2 x i8> undef to <2 x float> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v24 = uitofp <4 x i64> undef to <4 x fp128> ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %v25 = uitofp <4 x i64> undef to <4 x double> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v26 = uitofp <4 x i64> undef to <4 x float> -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v27 = uitofp <4 x i32> undef to <4 x fp128> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v28 = uitofp <4 x i32> undef to <4 x double> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v29 = uitofp <4 x i32> undef to <4 x float> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v30 = uitofp <4 x i16> undef to <4 x fp128> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v31 = uitofp <4 x i16> undef to <4 x double> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v32 = uitofp <4 x i16> undef to <4 x float> -; CHECK: Cost Model: Found an estimated cost of 12 for instruction: %v33 = uitofp <4 x i8> undef to <4 x fp128> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v34 = uitofp <4 x i8> undef to <4 x double> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v35 = uitofp <4 x i8> undef to <4 x float> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v36 = uitofp <8 x i64> undef to <8 x fp128> +; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v26 = uitofp <4 x i64> undef to <4 x float> +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %v27 = uitofp <4 x i32> undef to <4 x fp128> +; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v28 = uitofp <4 x i32> undef to <4 x double> +; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v29 = uitofp <4 x i32> undef to <4 x float> +; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v30 = uitofp <4 x i16> undef to <4 x fp128> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v31 = uitofp <4 x i16> undef to <4 x double> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v32 = uitofp <4 x i16> undef to <4 x float> +; CHECK: Cost Model: Found an estimated cost of 13 for instruction: %v33 = uitofp <4 x i8> undef to <4 x fp128> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v34 = uitofp <4 x i8> undef to <4 x double> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v35 = uitofp <4 x i8> undef to <4 x float> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v36 = uitofp <8 x i64> undef to <8 x fp128> ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %v37 = uitofp <8 x i64> undef to <8 x double> -; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v38 = uitofp <8 x i64> undef to <8 x float> -; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %v39 = uitofp <8 x i32> undef to <8 x fp128> -; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v40 = uitofp <8 x i32> undef to <8 x double> -; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v41 = uitofp <8 x i32> undef to <8 x float> -; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v42 = uitofp <8 x i16> undef to <8 x fp128> -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %v43 = uitofp <8 x i16> undef to <8 x double> -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %v44 = uitofp <8 x i16> undef to <8 x float> -; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v45 = uitofp <8 x i8> undef to <8 x fp128> -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %v46 = uitofp <8 x i8> undef to <8 x double> -; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %v47 = uitofp <8 x i8> undef to <8 x float> +; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v38 = uitofp <8 x i64> undef to <8 x float> +; CHECK: Cost Model: Found an estimated cost of 17 for instruction: %v39 = uitofp <8 x i32> undef to <8 x fp128> +; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v40 = uitofp <8 x i32> undef to <8 x double> +; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v41 = uitofp <8 x i32> undef to <8 x float> +; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v42 = uitofp <8 x i16> undef to <8 x fp128> +; CHECK: Cost Model: Found an estimated cost of 33 for instruction: %v43 = uitofp <8 x i16> undef to <8 x double> +; CHECK: Cost Model: Found an estimated cost of 33 for instruction: %v44 = uitofp <8 x i16> undef to <8 x float> +; CHECK: Cost Model: Found an estimated cost of 25 for instruction: %v45 = uitofp <8 x i8> undef to <8 x fp128> +; CHECK: Cost Model: Found an estimated cost of 33 for instruction: %v46 = uitofp <8 x i8> undef to <8 x double> +; CHECK: Cost Model: Found an estimated cost of 33 for instruction: %v47 = uitofp <8 x i8> undef to <8 x float> ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %v48 = uitofp <16 x i64> undef to <16 x double> -; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v49 = uitofp <16 x i64> undef to <16 x float> -; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v50 = uitofp <16 x i32> undef to <16 x double> -; CHECK: Cost Model: Found an estimated cost of 48 for instruction: %v51 = uitofp <16 x i32> undef to <16 x float> -; CHECK: Cost Model: Found an estimated cost of 64 for instruction: %v52 = uitofp <16 x i16> undef to <16 x double> -; CHECK: Cost Model: Found an estimated cost of 64 for instruction: %v53 = uitofp <16 x i16> undef to <16 x float> -; CHECK: Cost Model: Found an estimated cost of 64 for instruction: %v54 = uitofp <16 x i8> undef to <16 x double> -; CHECK: Cost Model: Found an estimated cost of 64 for instruction: %v55 = uitofp <16 x i8> undef to <16 x float> +; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v49 = uitofp <16 x i64> undef to <16 x float> +; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v50 = uitofp <16 x i32> undef to <16 x double> +; CHECK: Cost Model: Found an estimated cost of 49 for instruction: %v51 = uitofp <16 x i32> undef to <16 x float> +; CHECK: Cost Model: Found an estimated cost of 65 for instruction: %v52 = uitofp <16 x i16> undef to <16 x double> +; CHECK: Cost Model: Found an estimated cost of 65 for instruction: %v53 = uitofp <16 x i16> undef to <16 x float> +; CHECK: Cost Model: Found an estimated cost of 65 for instruction: %v54 = uitofp <16 x i8> undef to <16 x double> +; CHECK: Cost Model: Found an estimated cost of 65 for instruction: %v55 = uitofp <16 x i8> undef to <16 x float> ret void; } diff --git a/test/Analysis/CostModel/SystemZ/int-operands-extcost.ll b/test/Analysis/CostModel/SystemZ/int-operands-extcost.ll new file mode 100644 index 0000000000000000000000000000000000000000..138f48f42c78f79ceb8c6b92c2311885955034cf --- /dev/null +++ b/test/Analysis/CostModel/SystemZ/int-operands-extcost.ll @@ -0,0 +1,45 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 \ +; RUN: | FileCheck %s +; +; Test that i8/i16 operands get extra costs for extensions to i32 only in +; cases where this is needed. + +define void @icmp() { + %li8_0 = load i8, i8* undef + %li8_1 = load i8, i8* undef + icmp slt i8 %li8_0, %li8_1 + + %a0 = add i8 %li8_0, 1 + %a1 = add i8 %li8_1, 1 + icmp slt i8 %a0, %a1 + + icmp slt i8 %a0, 123 + + %li16_0 = load i16, i16* undef + %li16_1 = load i16, i16* undef + icmp slt i16 %li16_0, %li16_1 + + %a2 = add i16 %li16_0, 1 + %a3 = add i16 %li16_1, 1 + icmp slt i16 %a2, %a3 + + icmp slt i16 %a2, 123 + + ret void; +; CHECK: Printing analysis 'Cost Model Analysis' for function 'icmp': +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li8_0 = load i8, i8* undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li8_1 = load i8, i8* undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt i8 %li8_0, %li8_1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %a0 = add i8 %li8_0, 1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %a1 = add i8 %li8_1, 1 +; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %2 = icmp slt i8 %a0, %a1 +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %3 = icmp slt i8 %a0, 123 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li16_0 = load i16, i16* undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li16_1 = load i16, i16* undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt i16 %li16_0, %li16_1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %a2 = add i16 %li16_0, 1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %a3 = add i16 %li16_1, 1 +; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %5 = icmp slt i16 %a2, %a3 +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %6 = icmp slt i16 %a2, 123 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: ret void +} diff --git a/test/Analysis/CostModel/SystemZ/intrinsics.ll b/test/Analysis/CostModel/SystemZ/intrinsics.ll new file mode 100644 index 0000000000000000000000000000000000000000..f83cf5a7c3d4d39d411b77c6fd9113fb6f893aba --- /dev/null +++ b/test/Analysis/CostModel/SystemZ/intrinsics.ll @@ -0,0 +1,124 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s + +define void @bswap_i64(i64 %arg, <2 x i64> %arg2) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i64': +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i64 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call <2 x i64> +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %swp4 = tail call <4 x i64> + %swp1 = tail call i64 @llvm.bswap.i64(i64 %arg) + %swp2 = tail call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %arg2) + %swp4 = tail call <4 x i64> @llvm.bswap.v4i64(<4 x i64> undef) + ret void +} + +define void @bswap_i32(i32 %arg, <2 x i32> %arg2, <4 x i32> %arg4) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i32': +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i32 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call <2 x i32> +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp4 = tail call <4 x i32> +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %swp8 = tail call <8 x i32> + %swp1 = tail call i32 @llvm.bswap.i32(i32 %arg) + %swp2 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %arg2) + %swp4 = tail call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %arg4) + %swp8 = tail call <8 x i32> @llvm.bswap.v8i32(<8 x i32> undef) + ret void +} + +define void @bswap_i16(i16 %arg, <2 x i16> %arg2, <4 x i16> %arg4, + <8 x i16> %arg8) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i16': +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i16 @llvm.bswap.i16(i16 %arg) +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %arg2) +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp4 = tail call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %arg4) +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp8 = tail call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %arg8) +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %swp16 = tail call <16 x i16> @llvm.bswap.v16i16(<16 x i16> undef) + %swp1 = tail call i16 @llvm.bswap.i16(i16 %arg) + %swp2 = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %arg2) + %swp4 = tail call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %arg4) + %swp8 = tail call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %arg8) + %swp16 = tail call <16 x i16> @llvm.bswap.v16i16(<16 x i16> undef) + ret void +} + +; Test that store/load reversed is reflected in costs. +define void @bswap_i64_mem(i64* %src, i64 %arg, i64* %dst) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i64_mem': +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Ld1 = load i64, i64* %src +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i64 @llvm.bswap.i64(i64 %Ld1) +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call i64 @llvm.bswap.i64(i64 %arg) +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i64 %swp2, i64* %dst +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Ld2 = load i64, i64* %src +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp3 = tail call i64 @llvm.bswap.i64(i64 %Ld2) +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i64 %swp3, i64* %dst + %Ld1 = load i64, i64* %src + %swp1 = tail call i64 @llvm.bswap.i64(i64 %Ld1) + + %swp2 = tail call i64 @llvm.bswap.i64(i64 %arg) + store i64 %swp2, i64* %dst + + %Ld2 = load i64, i64* %src + %swp3 = tail call i64 @llvm.bswap.i64(i64 %Ld2) + store i64 %swp3, i64* %dst + + ret void +} + +define void @bswap_i32_mem(i32* %src, i32 %arg, i32* %dst) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i32_mem': +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Ld1 = load i32, i32* %src +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i32 @llvm.bswap.i32(i32 %Ld1) +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call i32 @llvm.bswap.i32(i32 %arg) +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i32 %swp2, i32* %dst +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Ld2 = load i32, i32* %src +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp3 = tail call i32 @llvm.bswap.i32(i32 %Ld2) +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i32 %swp3, i32* %dst + %Ld1 = load i32, i32* %src + %swp1 = tail call i32 @llvm.bswap.i32(i32 %Ld1) + + %swp2 = tail call i32 @llvm.bswap.i32(i32 %arg) + store i32 %swp2, i32* %dst + + %Ld2 = load i32, i32* %src + %swp3 = tail call i32 @llvm.bswap.i32(i32 %Ld2) + store i32 %swp3, i32* %dst + + ret void +} + +define void @bswap_i16_mem(i16* %src, i16 %arg, i16* %dst) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i16_mem': +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Ld1 = load i16, i16* %src +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i16 @llvm.bswap.i16(i16 %Ld1) +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call i16 @llvm.bswap.i16(i16 %arg) +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i16 %swp2, i16* %dst +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Ld2 = load i16, i16* %src +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp3 = tail call i16 @llvm.bswap.i16(i16 %Ld2) +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i16 %swp3, i16* %dst + %Ld1 = load i16, i16* %src + %swp1 = tail call i16 @llvm.bswap.i16(i16 %Ld1) + + %swp2 = tail call i16 @llvm.bswap.i16(i16 %arg) + store i16 %swp2, i16* %dst + + %Ld2 = load i16, i16* %src + %swp3 = tail call i16 @llvm.bswap.i16(i16 %Ld2) + store i16 %swp3, i16* %dst + + ret void +} + + +declare i64 @llvm.bswap.i64(i64) +declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) +declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) + +declare i32 @llvm.bswap.i32(i32) +declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) +declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) +declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) + +declare i16 @llvm.bswap.i16(i16) +declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) +declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>) +declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>) +declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>) diff --git a/test/Analysis/CostModel/SystemZ/load-and-test.ll b/test/Analysis/CostModel/SystemZ/load-and-test.ll new file mode 100644 index 0000000000000000000000000000000000000000..f0695688889e3abfe57502c6da102f9d9506c2aa --- /dev/null +++ b/test/Analysis/CostModel/SystemZ/load-and-test.ll @@ -0,0 +1,25 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s +; +; Test that load and test results in 0 cost for the compare. + +define i64 @fun0(i64* %Src, i64 %Arg) { + %Ld1 = load i64, i64* %Src + %Cmp = icmp eq i64 %Ld1, 0 + %S = select i1 %Cmp, i64 %Arg, i64 %Ld1 + ret i64 %S +; CHECK: Printing analysis 'Cost Model Analysis' for function 'fun0': +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Ld1 = load i64, i64* %Src +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Cmp = icmp eq i64 %Ld1, 0 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %S = select +} + +define i32 @fun1(i32* %Src, i32 %Arg) { + %Ld1 = load i32, i32* %Src + %Cmp = icmp eq i32 %Ld1, 0 + %S = select i1 %Cmp, i32 %Arg, i32 %Ld1 + ret i32 %S +; CHECK: Printing analysis 'Cost Model Analysis' for function 'fun1': +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Ld1 = load i32, i32* %Src +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Cmp = icmp eq i32 %Ld1, 0 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %S = select +} diff --git a/test/Analysis/CostModel/SystemZ/logical.ll b/test/Analysis/CostModel/SystemZ/logical.ll index 41984e0a29c4206107a5981b729a2e7135ac508e..c4d1fc2b92d44e056266e74dbf6bbfa304873101 100644 --- a/test/Analysis/CostModel/SystemZ/logical.ll +++ b/test/Analysis/CostModel/SystemZ/logical.ll @@ -68,8 +68,8 @@ define void @ashr() { %res18 = ashr <16 x i32> undef, undef %res19 = ashr <16 x i64> undef, undef -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res0 = ashr i8 undef, undef -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res1 = ashr i16 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = ashr i8 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = ashr i16 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = ashr i32 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = ashr i64 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = ashr <2 x i8> undef, undef @@ -114,8 +114,8 @@ define void @lshr() { %res18 = lshr <16 x i32> undef, undef %res19 = lshr <16 x i64> undef, undef -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res0 = lshr i8 undef, undef -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res1 = lshr i16 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = lshr i8 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = lshr i16 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = lshr i32 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = lshr i64 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = lshr <2 x i8> undef, undef diff --git a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll index d5c097ced6254bd86033b4012db8d58271afc720..59999e3bc9e07499579116a09da81362e4dcf342 100644 --- a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll +++ b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll @@ -85,6 +85,37 @@ define void @add() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %10 = add i32 %sext_3, undef } +define void @add_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) { + %L1 = load i16, i16* %Src1 + %S0 = add i16 %L1, %Arg + store volatile i16 %S0, i16* %Dst + + %L2 = load i16, i16* %Src1 + %L3 = load i16, i16* %Src2 + %S1 = add i16 %L2, %L3 + store volatile i16 %S1, i16* %Dst + + ; Truncated load + %L32 = load i32, i32* %Src32 + %tr = trunc i32 %L32 to i16 + %S2 = add i16 %tr, %Arg + store volatile i16 %S2, i16* %Dst + + ret void +; CHECK: Printing analysis 'Cost Model Analysis' for function 'add_i16_mem16': +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L1 = load i16, i16* %Src1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %S0 = add i16 %L1, %Arg +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %S0, i16* %Dst +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L2 = load i16, i16* %Src1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %L3 = load i16, i16* %Src2 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %S1 = add i16 %L2, %L3 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %S1, i16* %Dst +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L32 = load i32, i32* %Src32 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr = trunc i32 %L32 to i16 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %S2 = add i16 %tr, %Arg +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %S2, i16* %Dst +} + define void @sub_lhs_mem() { %li32 = load i32, i32* undef sub i32 %li32, undef @@ -228,6 +259,37 @@ define void @sub_rhs_mem() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %8 = sub i32 undef, %sext_3 } +define void @sub_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) { + %L1 = load i16, i16* %Src1 + %D0 = sub i16 %Arg, %L1 + store volatile i16 %D0, i16* %Dst + + %L2 = load i16, i16* %Src1 + %L3 = load i16, i16* %Src2 + %D1 = sub i16 %L2, %L3 + store volatile i16 %D1, i16* %Dst + + ; Truncated load + %L32 = load i32, i32* %Src32 + %tr = trunc i32 %L32 to i16 + %D2 = sub i16 %Arg, %tr + store volatile i16 %D2, i16* %Dst + + ret void +; CHECK: Printing analysis 'Cost Model Analysis' for function 'sub_i16_mem16': +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L1 = load i16, i16* %Src1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %D0 = sub i16 %Arg, %L1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %D0, i16* %Dst +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %L2 = load i16, i16* %Src1 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L3 = load i16, i16* %Src2 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %D1 = sub i16 %L2, %L3 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %D1, i16* %Dst +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L32 = load i32, i32* %Src32 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr = trunc i32 %L32 to i16 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %D2 = sub i16 %Arg, %tr +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %D2, i16* %Dst +} + define void @mul() { %li32 = load i32, i32* undef mul i32 %li32, undef @@ -305,6 +367,37 @@ define void @mul() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %10 = mul i32 %sext_3, undef } +define void @mul_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) { + %L1 = load i16, i16* %Src1 + %P0 = mul i16 %Arg, %L1 + store volatile i16 %P0, i16* %Dst + + %L2 = load i16, i16* %Src1 + %L3 = load i16, i16* %Src2 + %P1 = mul i16 %L2, %L3 + store volatile i16 %P1, i16* %Dst + + ; Truncated load + %L32 = load i32, i32* %Src32 + %tr = trunc i32 %L32 to i16 + %P2 = mul i16 %Arg, %tr + store volatile i16 %P2, i16* %Dst + + ret void +; CHECK: Printing analysis 'Cost Model Analysis' for function 'mul_i16_mem16': +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L1 = load i16, i16* %Src1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %P0 = mul i16 %Arg, %L1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %P0, i16* %Dst +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L2 = load i16, i16* %Src1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %L3 = load i16, i16* %Src2 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %P1 = mul i16 %L2, %L3 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %P1, i16* %Dst +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %L32 = load i32, i32* %Src32 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr = trunc i32 %L32 to i16 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %P2 = mul i16 %Arg, %tr +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store volatile i16 %P2, i16* %Dst +} + define void @sdiv_lhs(i32 %arg32, i64 %arg64) { %li32 = load i32, i32* undef sdiv i32 %li32, %arg32 @@ -340,10 +433,10 @@ define void @sdiv_lhs(i32 %arg32, i64 %arg64) { ; An sdiv loaded dividend (lhs) operand is *not* foldable. ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li32 = load i32, i32* undef -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %1 = sdiv i32 %li32, %arg32 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %1 = sdiv i32 %li32, %arg32 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li32_0 = load i32, i32* undef ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li32_1 = load i32, i32* undef -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %2 = sdiv i32 %li32_0, %li32_1 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %2 = sdiv i32 %li32_0, %li32_1 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li64 = load i64, i64* undef ; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %3 = sdiv i64 %li64, %arg64 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li64_0 = load i64, i64* undef @@ -388,12 +481,12 @@ define void @sdiv_rhs(i32 %arg32, i64 %arg64) { ; An sdiv loaded divisor (rhs) operand is foldable. ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li32 = load i32, i32* undef -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %1 = sdiv i32 %arg32, %li32 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %1 = sdiv i32 %arg32, %li32 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li64 = load i64, i64* undef ; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %2 = sdiv i64 %arg64, %li64 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li64_2 = load i64, i64* undef ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr = trunc i64 %li64_2 to i32 -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %3 = sdiv i32 undef, %tr +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %3 = sdiv i32 undef, %tr ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li32_2 = load i32, i32* undef ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext_0 = sext i32 %li32_2 to i64 ; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %4 = sdiv i64 undef, %sext_0 @@ -432,15 +525,15 @@ define void @udiv_lhs(i32 %arg32, i64 %arg64) { ; An udiv loaded dividend (lhs) operand is *not* foldable. ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li32 = load i32, i32* undef -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %1 = udiv i32 %li32, %arg32 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %1 = udiv i32 %li32, %arg32 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li32_0 = load i32, i32* undef ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li32_1 = load i32, i32* undef -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %2 = udiv i32 %li32_0, %li32_1 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %2 = udiv i32 %li32_0, %li32_1 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li64 = load i64, i64* undef -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %3 = udiv i64 %li64, %arg64 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %3 = udiv i64 %li64, %arg64 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li64_0 = load i64, i64* undef ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li64_1 = load i64, i64* undef -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %4 = udiv i64 %li64_0, %li64_1 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %4 = udiv i64 %li64_0, %li64_1 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li64_2 = load i64, i64* undef ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr_0 = trunc i64 %li64_2 to i32 ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %5 = udiv i32 %tr_0, undef @@ -470,15 +563,15 @@ define void @udiv_rhs(i32 %arg32, i64 %arg64) { ; An udiv loaded divisor (rhs) operand is foldable. ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li32 = load i32, i32* undef -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %1 = udiv i32 %arg32, %li32 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %1 = udiv i32 %arg32, %li32 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li64 = load i64, i64* undef -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %2 = udiv i64 %arg64, %li64 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %2 = udiv i64 %arg64, %li64 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li64_2 = load i64, i64* undef ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr_0 = trunc i64 %li64_2 to i32 -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %3 = udiv i32 undef, %tr_0 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %3 = udiv i32 undef, %tr_0 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li64_3 = load i64, i64* undef ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr_1 = trunc i64 %li64_3 to i32 -; CHECK: Cost Model: Found an estimated cost of 21 for instruction: %4 = udiv i64 undef, %li64_3 +; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %4 = udiv i64 undef, %li64_3 } define void @and() { @@ -633,6 +726,16 @@ define void @icmp() { %tr_0 = trunc i64 %li64_2 to i32 icmp eq i32 %tr_0, undef + ; Sign-extended load + %li32_2 = load i32, i32* undef + %sext = sext i32 %li32_2 to i64 + icmp eq i64 %sext, undef + + ; Zero-extended load + %li32_3 = load i32, i32* undef + %zext = zext i32 %li32_3 to i64 + icmp eq i64 %zext, undef + ; Loads with multiple uses are *not* folded %li64_3 = load i64, i64* undef %tr_1 = trunc i64 %li64_3 to i32 @@ -652,7 +755,13 @@ define void @icmp() { ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li64_2 = load i64, i64* undef ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr_0 = trunc i64 %li64_2 to i32 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp eq i32 %tr_0, undef +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li32_2 = load i32, i32* undef +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %sext = sext i32 %li32_2 to i64 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %6 = icmp eq i64 %sext, undef +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %li32_3 = load i32, i32* undef +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %zext = zext i32 %li32_3 to i64 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %7 = icmp eq i64 %zext, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %li64_3 = load i64, i64* undef ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %tr_1 = trunc i64 %li64_3 to i32 -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %6 = icmp eq i64 %li64_3, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %8 = icmp eq i64 %li64_3, undef } diff --git a/test/Analysis/CostModel/X86/cast-widen.ll b/test/Analysis/CostModel/X86/cast-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..05304f4927f4b82fb8e33949025a7d976a7f0faf --- /dev/null +++ b/test/Analysis/CostModel/X86/cast-widen.ll @@ -0,0 +1,496 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ + +define i32 @add(i32 %arg) { +; SSE-LABEL: 'add' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = zext <4 x i1> undef to <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B = sext <4 x i1> undef to <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %D = zext <8 x i1> undef to <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %E = sext <8 x i1> undef to <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F = trunc <8 x i32> undef to <8 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32 +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1 +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'add' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = zext <4 x i1> undef to <4 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B = sext <4 x i1> undef to <4 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <8 x i1> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %E = sext <8 x i1> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1 +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'add' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = zext <4 x i1> undef to <4 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B = sext <4 x i1> undef to <4 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %D = zext <8 x i1> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %E = sext <8 x i1> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1 +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'add' +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %A = zext <4 x i1> undef to <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %B = sext <4 x i1> undef to <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %D = zext <8 x i1> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %E = sext <8 x i1> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + ; -- Same size registeres -- + %A = zext <4 x i1> undef to <4 x i32> + %B = sext <4 x i1> undef to <4 x i32> + %C = trunc <4 x i32> undef to <4 x i1> + + ; -- Different size registers -- + %D = zext <8 x i1> undef to <8 x i32> + %E = sext <8 x i1> undef to <8 x i32> + %F = trunc <8 x i32> undef to <8 x i1> + + ; -- scalars -- + %G = zext i1 undef to i32 + %H = trunc i32 undef to i1 + + ret i32 undef +} + +define i32 @zext_sext(<8 x i1> %in) { +; SSE2-LABEL: 'zext_sext' +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %S = sext <8 x i1> %in to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A1 = zext <16 x i8> undef to <16 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %A2 = sext <16 x i8> undef to <16 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %A = sext <8 x i16> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B = zext <8 x i16> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %C = sext <4 x i32> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %D = zext <4 x i32> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %D1 = zext <8 x i32> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %D2 = sext <8 x i32> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %D3 = zext <16 x i16> undef to <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %D4 = zext <16 x i8> undef to <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %D5 = zext <16 x i1> undef to <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %E = trunc <4 x i64> undef to <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %F = trunc <8 x i32> undef to <8 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %G = trunc <8 x i64> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'zext_sext' +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32> +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %S = sext <8 x i1> %in to <8 x i32> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A1 = zext <16 x i8> undef to <16 x i16> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A2 = sext <16 x i8> undef to <16 x i16> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A = sext <8 x i16> undef to <8 x i32> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B = zext <8 x i16> undef to <8 x i32> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C = sext <4 x i32> undef to <4 x i64> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64> +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %D = zext <4 x i32> undef to <4 x i64> +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %D1 = zext <8 x i32> undef to <8 x i64> +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %D2 = sext <8 x i32> undef to <8 x i64> +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D3 = zext <16 x i16> undef to <16 x i32> +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D4 = zext <16 x i8> undef to <16 x i32> +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %D5 = zext <16 x i1> undef to <16 x i32> +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %E = trunc <4 x i64> undef to <4 x i32> +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %F = trunc <8 x i32> undef to <8 x i16> +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8> +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8> +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8> +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %G = trunc <8 x i64> undef to <8 x i32> +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16> +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8> +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'zext_sext' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <8 x i1> %in to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %A1 = zext <16 x i8> undef to <16 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %A2 = sext <16 x i8> undef to <16 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %A = sext <8 x i16> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %B = zext <8 x i16> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C = sext <4 x i32> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <4 x i32> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %D1 = zext <8 x i32> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %D2 = sext <8 x i32> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D3 = zext <16 x i16> undef to <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D4 = zext <16 x i8> undef to <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %D5 = zext <16 x i1> undef to <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %E = trunc <4 x i64> undef to <4 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %F = trunc <8 x i32> undef to <8 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %G = trunc <8 x i64> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'zext_sext' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %S = sext <8 x i1> %in to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A1 = zext <16 x i8> undef to <16 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A2 = sext <16 x i8> undef to <16 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i16> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i16> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <4 x i32> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = zext <4 x i32> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %D1 = zext <8 x i32> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %D2 = sext <8 x i32> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D3 = zext <16 x i16> undef to <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D4 = zext <16 x i8> undef to <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %D5 = zext <16 x i1> undef to <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %E = trunc <4 x i64> undef to <4 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F = trunc <8 x i32> undef to <8 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %G = trunc <8 x i64> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'zext_sext' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %S = sext <8 x i1> %in to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A1 = zext <16 x i8> undef to <16 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A2 = sext <16 x i8> undef to <16 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i16> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i16> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <4 x i32> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = zext <4 x i32> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D1 = zext <8 x i32> undef to <8 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D2 = sext <8 x i32> undef to <8 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D3 = zext <16 x i16> undef to <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D4 = zext <16 x i8> undef to <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %D5 = zext <16 x i1> undef to <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %E = trunc <4 x i64> undef to <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F = trunc <8 x i32> undef to <8 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G = trunc <8 x i64> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %Z = zext <8 x i1> %in to <8 x i32> + %S = sext <8 x i1> %in to <8 x i32> + + %A1 = zext <16 x i8> undef to <16 x i16> + %A2 = sext <16 x i8> undef to <16 x i16> + %A = sext <8 x i16> undef to <8 x i32> + %B = zext <8 x i16> undef to <8 x i32> + %C = sext <4 x i32> undef to <4 x i64> + + %C.v8i8.z = zext <8 x i8> undef to <8 x i32> + %C.v8i8.s = sext <8 x i8> undef to <8 x i32> + %C.v4i16.z = zext <4 x i16> undef to <4 x i64> + %C.v4i16.s = sext <4 x i16> undef to <4 x i64> + + %C.v4i8.z = zext <4 x i8> undef to <4 x i64> + %C.v4i8.s = sext <4 x i8> undef to <4 x i64> + + %D = zext <4 x i32> undef to <4 x i64> + + %D1 = zext <8 x i32> undef to <8 x i64> + + %D2 = sext <8 x i32> undef to <8 x i64> + + %D3 = zext <16 x i16> undef to <16 x i32> + %D4 = zext <16 x i8> undef to <16 x i32> + %D5 = zext <16 x i1> undef to <16 x i32> + + %E = trunc <4 x i64> undef to <4 x i32> + %F = trunc <8 x i32> undef to <8 x i16> + %F1 = trunc <16 x i16> undef to <16 x i8> + %F2 = trunc <8 x i32> undef to <8 x i8> + %F3 = trunc <4 x i64> undef to <4 x i8> + + %G = trunc <8 x i64> undef to <8 x i32> + %G1 = trunc <16 x i32> undef to <16 x i16> + %G2 = trunc <16 x i32> undef to <16 x i8> + ret i32 undef +} + +define i32 @masks8(<8 x i1> %in) { +; SSE-LABEL: 'masks8' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %S = sext <8 x i1> %in to <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'masks8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <8 x i1> %in to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'masks8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %S = sext <8 x i1> %in to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'masks8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <8 x i1> %in to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %S = sext <8 x i1> %in to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %Z = zext <8 x i1> %in to <8 x i32> + %S = sext <8 x i1> %in to <8 x i32> + ret i32 undef +} + +define i32 @masks4(<4 x i1> %in) { +; SSE-LABEL: 'masks4' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <4 x i1> %in to <4 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %S = sext <4 x i1> %in to <4 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'masks4' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <4 x i1> %in to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %S = sext <4 x i1> %in to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'masks4' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <4 x i1> %in to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %S = sext <4 x i1> %in to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'masks4' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %Z = zext <4 x i1> %in to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %S = sext <4 x i1> %in to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %Z = zext <4 x i1> %in to <4 x i64> + %S = sext <4 x i1> %in to <4 x i64> + ret i32 undef +} + +define void @sitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { +; SSE-LABEL: 'sitofp4' +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'sitofp4' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'sitofp4' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %A1 = sitofp <4 x i1> %a to <4 x float> + %A2 = sitofp <4 x i1> %a to <4 x double> + %B1 = sitofp <4 x i8> %b to <4 x float> + %B2 = sitofp <4 x i8> %b to <4 x double> + %C1 = sitofp <4 x i16> %c to <4 x float> + %C2 = sitofp <4 x i16> %c to <4 x double> + %D1 = sitofp <4 x i32> %d to <4 x float> + %D2 = sitofp <4 x i32> %d to <4 x double> + ret void +} + +define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { +; SSE-LABEL: 'sitofp8' +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'sitofp8' +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'sitofp8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %A1 = sitofp <8 x i1> %a to <8 x float> + %B1 = sitofp <8 x i8> %b to <8 x float> + %C1 = sitofp <8 x i16> %c to <8 x float> + %D1 = sitofp <8 x i32> %d to <8 x float> + ret void +} + +define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { +; SSE-LABEL: 'uitofp4' +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'uitofp4' +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'uitofp4' +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %A1 = uitofp <4 x i1> %a to <4 x float> + %A2 = uitofp <4 x i1> %a to <4 x double> + %B1 = uitofp <4 x i8> %b to <4 x float> + %B2 = uitofp <4 x i8> %b to <4 x double> + %C1 = uitofp <4 x i16> %c to <4 x float> + %C2 = uitofp <4 x i16> %c to <4 x double> + %D1 = uitofp <4 x i32> %d to <4 x float> + %D2 = uitofp <4 x i32> %d to <4 x double> + ret void +} + +define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { +; SSE-LABEL: 'uitofp8' +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'uitofp8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'uitofp8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'uitofp8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %A1 = uitofp <8 x i1> %a to <8 x float> + %B1 = uitofp <8 x i8> %b to <8 x float> + %C1 = uitofp <8 x i16> %c to <8 x float> + %D1 = uitofp <8 x i32> %d to <8 x float> + ret void +} + +define void @fp_conv(<8 x float> %a, <16 x float>%b, <4 x float> %c) { +; SSE-LABEL: 'fp_conv' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A1 = fpext <4 x float> %c to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A2 = fpext <8 x float> %a to <8 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'fp_conv' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A1 = fpext <4 x float> %c to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A2 = fpext <8 x float> %a to <8 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'fp_conv' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A1 = fpext <4 x float> %c to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A2 = fpext <8 x float> %a to <8 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %A1 = fpext <4 x float> %c to <4 x double> + %A2 = fpext <8 x float> %a to <8 x double> + %A3 = fptrunc <4 x double> undef to <4 x float> + %A4 = fptrunc <8 x double> undef to <8 x float> + ret void +} diff --git a/test/Analysis/CostModel/X86/extend.ll b/test/Analysis/CostModel/X86/extend.ll new file mode 100644 index 0000000000000000000000000000000000000000..1de0b0b20e1bbb14253511f7e3ed9e64fcbba282 --- /dev/null +++ b/test/Analysis/CostModel/X86/extend.ll @@ -0,0 +1,860 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 + +define i32 @zext_vXi32() { +; SSE2-LABEL: 'zext_vXi32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'zext_vXi32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'zext_vXi32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'zext_vXi32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'zext_vXi32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'zext_vXi32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'zext_vXi32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2i64 = zext <2 x i32> undef to <2 x i64> + %V4i64 = zext <4 x i32> undef to <4 x i64> + %V8i64 = zext <8 x i32> undef to <8 x i64> + + ret i32 undef +} + +define i32 @zext_vXi16() { +; SSE2-LABEL: 'zext_vXi16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'zext_vXi16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'zext_vXi16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'zext_vXi16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'zext_vXi16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'zext_vXi16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'zext_vXi16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i16> undef to <16 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2i64 = zext <2 x i16> undef to <2 x i64> + %V4i64 = zext <4 x i16> undef to <4 x i64> + %V8i64 = zext <8 x i16> undef to <8 x i64> + + %V2i32 = zext <2 x i16> undef to <2 x i32> + %V4i32 = zext <4 x i16> undef to <4 x i32> + %V8i32 = zext <8 x i16> undef to <8 x i32> + %V16i32 = zext <16 x i16> undef to <16 x i32> + + ret i32 undef +} + +define i32 @zext_vXi8() { +; SSE2-LABEL: 'zext_vXi8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'zext_vXi8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'zext_vXi8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16> +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'zext_vXi8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'zext_vXi8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'zext_vXi8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'zext_vXi8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'zext_vXi8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2i64 = zext <2 x i8> undef to <2 x i64> + %V4i64 = zext <4 x i8> undef to <4 x i64> + %V8i64 = zext <8 x i8> undef to <8 x i64> + + %V2i32 = zext <2 x i8> undef to <2 x i32> + %V4i32 = zext <4 x i8> undef to <4 x i32> + %V8i32 = zext <8 x i8> undef to <8 x i32> + %V16i32 = zext <16 x i8> undef to <16 x i32> + + %V2i16 = zext <2 x i8> undef to <2 x i16> + %V4i16 = zext <4 x i8> undef to <4 x i16> + %V8i16 = zext <8 x i8> undef to <8 x i16> + %V16i16 = zext <16 x i8> undef to <16 x i16> + %V32i16 = zext <32 x i8> undef to <32 x i16> + + ret i32 undef +} + +define i32 @zext_vXi1() { +; SSE-LABEL: 'zext_vXi1' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'zext_vXi1' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'zext_vXi1' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'zext_vXi1' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'zext_vXi1' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'zext_vXi1' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = zext <8 x i1> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16i32 = zext <16 x i1> undef to <16 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2i64 = zext <2 x i1> undef to <2 x i64> + %V4i64 = zext <4 x i1> undef to <4 x i64> + %V8i64 = zext <8 x i1> undef to <8 x i64> + + %V2i32 = zext <2 x i1> undef to <2 x i32> + %V4i32 = zext <4 x i1> undef to <4 x i32> + %V8i32 = zext <8 x i1> undef to <8 x i32> + %V16i32 = zext <16 x i1> undef to <16 x i32> + + %V2i16 = zext <2 x i1> undef to <2 x i16> + %V4i16 = zext <4 x i1> undef to <4 x i16> + %V8i16 = zext <8 x i1> undef to <8 x i16> + %V16i16 = zext <16 x i1> undef to <16 x i16> + %V32i16 = zext <32 x i1> undef to <32 x i16> + + %V2i8 = zext <2 x i1> undef to <2 x i8> + %V4i8 = zext <4 x i1> undef to <4 x i8> + %V8i8 = zext <8 x i1> undef to <8 x i8> + %V16i8 = zext <16 x i1> undef to <16 x i8> + %V32i8 = zext <32 x i1> undef to <32 x i8> + %V64i8 = zext <64 x i1> undef to <64 x i8> + + ret i32 undef +} + +define i32 @sext_vXi32() { +; SSE2-LABEL: 'sext_vXi32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'sext_vXi32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'sext_vXi32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'sext_vXi32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'sext_vXi32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'sext_vXi32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sext_vXi32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2i64 = sext <2 x i32> undef to <2 x i64> + %V4i64 = sext <4 x i32> undef to <4 x i64> + %V8i64 = sext <8 x i32> undef to <8 x i64> + + ret i32 undef +} + +define i32 @sext_vXi16() { +; SSE2-LABEL: 'sext_vXi16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'sext_vXi16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'sext_vXi16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'sext_vXi16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'sext_vXi16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'sext_vXi16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sext_vXi16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i16> undef to <16 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2i64 = sext <2 x i16> undef to <2 x i64> + %V4i64 = sext <4 x i16> undef to <4 x i64> + %V8i64 = sext <8 x i16> undef to <8 x i64> + + %V2i32 = sext <2 x i16> undef to <2 x i32> + %V4i32 = sext <4 x i16> undef to <4 x i32> + %V8i32 = sext <8 x i16> undef to <8 x i32> + %V16i32 = sext <16 x i16> undef to <16 x i32> + + ret i32 undef +} + +define i32 @sext_vXi8() { +; SSE2-LABEL: 'sext_vXi8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'sext_vXi8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'sext_vXi8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16> +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'sext_vXi8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'sext_vXi8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'sext_vXi8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'sext_vXi8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sext_vXi8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2i64 = sext <2 x i8> undef to <2 x i64> + %V4i64 = sext <4 x i8> undef to <4 x i64> + %V8i64 = sext <8 x i8> undef to <8 x i64> + + %V2i32 = sext <2 x i8> undef to <2 x i32> + %V4i32 = sext <4 x i8> undef to <4 x i32> + %V8i32 = sext <8 x i8> undef to <8 x i32> + %V16i32 = sext <16 x i8> undef to <16 x i32> + + %V2i16 = sext <2 x i8> undef to <2 x i16> + %V4i16 = sext <4 x i8> undef to <4 x i16> + %V8i16 = sext <8 x i8> undef to <8 x i16> + %V16i16 = sext <16 x i8> undef to <16 x i16> + %V32i16 = sext <32 x i8> undef to <32 x i16> + + ret i32 undef +} + +define i32 @sext_vXi1() { +; SSE-LABEL: 'sext_vXi1' +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'sext_vXi1' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'sext_vXi1' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'sext_vXi1' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'sext_vXi1' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8> +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sext_vXi1' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2i64 = sext <2 x i1> undef to <2 x i64> + %V4i64 = sext <4 x i1> undef to <4 x i64> + %V8i64 = sext <8 x i1> undef to <8 x i64> + + %V2i32 = sext <2 x i1> undef to <2 x i32> + %V4i32 = sext <4 x i1> undef to <4 x i32> + %V8i32 = sext <8 x i1> undef to <8 x i32> + %V16i32 = sext <16 x i1> undef to <16 x i32> + + %V2i16 = sext <2 x i1> undef to <2 x i16> + %V4i16 = sext <4 x i1> undef to <4 x i16> + %V8i16 = sext <8 x i1> undef to <8 x i16> + %V16i16 = sext <16 x i1> undef to <16 x i16> + %V32i16 = sext <32 x i1> undef to <32 x i16> + + %V2i8 = sext <2 x i1> undef to <2 x i8> + %V4i8 = sext <4 x i1> undef to <4 x i8> + %V8i8 = sext <8 x i1> undef to <8 x i8> + %V16i8 = sext <16 x i1> undef to <16 x i8> + %V32i8 = sext <32 x i1> undef to <32 x i8> + %V64i8 = sext <64 x i1> undef to <64 x i8> + + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/fptosi-widen.ll b/test/Analysis/CostModel/X86/fptosi-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..2135fef504fe1e6345e3d99a01ed0264c24e0a7a --- /dev/null +++ b/test/Analysis/CostModel/X86/fptosi-widen.ll @@ -0,0 +1,305 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 + +define i32 @fptosi_double_i64(i32 %arg) { +; SSE-LABEL: 'fptosi_double_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptosi_double_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'fptosi_double_i64' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'fptosi_double_i64' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptosi_double_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = fptosi double undef to i64 + %V2I64 = fptosi <2 x double> undef to <2 x i64> + %V4I64 = fptosi <4 x double> undef to <4 x i64> + %V8I64 = fptosi <8 x double> undef to <8 x i64> + ret i32 undef +} + +define i32 @fptosi_double_i32(i32 %arg) { +; SSE-LABEL: 'fptosi_double_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32 +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptosi_double_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32 +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptosi_double_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptosi_double_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I32 = fptosi double undef to i32 + %V2I32 = fptosi <2 x double> undef to <2 x i32> + %V4I32 = fptosi <4 x double> undef to <4 x i32> + %V8I32 = fptosi <8 x double> undef to <8 x i32> + ret i32 undef +} + +define i32 @fptosi_double_i16(i32 %arg) { +; SSE-LABEL: 'fptosi_double_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptosi_double_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptosi_double_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptosi_double_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I16 = fptosi double undef to i16 + %V2I16 = fptosi <2 x double> undef to <2 x i16> + %V4I16 = fptosi <4 x double> undef to <4 x i16> + %V8I16 = fptosi <8 x double> undef to <8 x i16> + ret i32 undef +} + +define i32 @fptosi_double_i8(i32 %arg) { +; SSE-LABEL: 'fptosi_double_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptosi_double_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptosi_double_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptosi_double_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = fptosi double undef to i8 + %V2I8 = fptosi <2 x double> undef to <2 x i8> + %V4I8 = fptosi <4 x double> undef to <4 x i8> + %V8I8 = fptosi <8 x double> undef to <8 x i8> + ret i32 undef +} + +define i32 @fptosi_float_i64(i32 %arg) { +; SSE-LABEL: 'fptosi_float_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptosi_float_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'fptosi_float_i64' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'fptosi_float_i64' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptosi_float_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = fptosi float undef to i64 + %V2I64 = fptosi <2 x float> undef to <2 x i64> + %V4I64 = fptosi <4 x float> undef to <4 x i64> + %V8I64 = fptosi <8 x float> undef to <8 x i64> + %V16I64 = fptosi <16 x float> undef to <16 x i64> + ret i32 undef +} + +define i32 @fptosi_float_i32(i32 %arg) { +; CHECK-LABEL: 'fptosi_float_i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptosi_float_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I32 = fptosi float undef to i32 + %V4I32 = fptosi <4 x float> undef to <4 x i32> + %V8I32 = fptosi <8 x float> undef to <8 x i32> + %V16I32 = fptosi <16 x float> undef to <16 x i32> + ret i32 undef +} + +define i32 @fptosi_float_i16(i32 %arg) { +; SSE-LABEL: 'fptosi_float_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptosi_float_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptosi_float_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptosi_float_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I16 = fptosi float undef to i16 + %V4I16 = fptosi <4 x float> undef to <4 x i16> + %V8I16 = fptosi <8 x float> undef to <8 x i16> + %V16I16 = fptosi <16 x float> undef to <16 x i16> + ret i32 undef +} + +define i32 @fptosi_float_i8(i32 %arg) { +; SSE-LABEL: 'fptosi_float_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8 +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptosi_float_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptosi_float_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptosi_float_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = fptosi float undef to i8 + %V4I8 = fptosi <4 x float> undef to <4 x i8> + %V8I8 = fptosi <8 x float> undef to <8 x i8> + %V16I8 = fptosi <16 x float> undef to <16 x i8> + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/fptoui-widen.ll b/test/Analysis/CostModel/X86/fptoui-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..d5b0482c162677b4fc0b97ee1fb208bacf785c44 --- /dev/null +++ b/test/Analysis/CostModel/X86/fptoui-widen.ll @@ -0,0 +1,319 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 + +define i32 @fptoui_double_i64(i32 %arg) { +; SSE-LABEL: 'fptoui_double_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptoui_double_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'fptoui_double_i64' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'fptoui_double_i64' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64 +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptoui_double_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = fptoui double undef to i64 + %V2I64 = fptoui <2 x double> undef to <2 x i64> + %V4I64 = fptoui <4 x double> undef to <4 x i64> + %V8I64 = fptoui <8 x double> undef to <8 x i64> + ret i32 undef +} + +define i32 @fptoui_double_i32(i32 %arg) { +; SSE-LABEL: 'fptoui_double_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptoui_double_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptoui_double_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptoui_double_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I32 = fptoui double undef to i32 + %V2I32 = fptoui <2 x double> undef to <2 x i32> + %V4I32 = fptoui <4 x double> undef to <4 x i32> + %V8I32 = fptoui <8 x double> undef to <8 x i32> + ret i32 undef +} + +define i32 @fptoui_double_i16(i32 %arg) { +; SSE-LABEL: 'fptoui_double_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptoui_double_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptoui_double_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptoui_double_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I16 = fptoui double undef to i16 + %V2I16 = fptoui <2 x double> undef to <2 x i16> + %V4I16 = fptoui <4 x double> undef to <4 x i16> + %V8I16 = fptoui <8 x double> undef to <8 x i16> + ret i32 undef +} + +define i32 @fptoui_double_i8(i32 %arg) { +; SSE-LABEL: 'fptoui_double_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptoui_double_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptoui_double_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptoui_double_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = fptoui double undef to i8 + %V2I8 = fptoui <2 x double> undef to <2 x i8> + %V4I8 = fptoui <4 x double> undef to <4 x i8> + %V8I8 = fptoui <8 x double> undef to <8 x i8> + ret i32 undef +} + +define i32 @fptoui_float_i64(i32 %arg) { +; SSE-LABEL: 'fptoui_float_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptoui_float_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'fptoui_float_i64' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'fptoui_float_i64' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64 +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptoui_float_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = fptoui float undef to i64 + %V2I64 = fptoui <2 x float> undef to <2 x i64> + %V4I64 = fptoui <4 x float> undef to <4 x i64> + %V8I64 = fptoui <8 x float> undef to <8 x i64> + %V16I64 = fptoui <16 x float> undef to <16 x i64> + ret i32 undef +} + +define i32 @fptoui_float_i32(i32 %arg) { +; SSE-LABEL: 'fptoui_float_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptoui_float_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptoui_float_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptoui_float_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I32 = fptoui float undef to i32 + %V4I32 = fptoui <4 x float> undef to <4 x i32> + %V8I32 = fptoui <8 x float> undef to <8 x i32> + %V16I32 = fptoui <16 x float> undef to <16 x i32> + ret i32 undef +} + +define i32 @fptoui_float_i16(i32 %arg) { +; SSE-LABEL: 'fptoui_float_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16 +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptoui_float_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptoui_float_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptoui_float_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I16 = fptoui float undef to i16 + %V4I16 = fptoui <4 x float> undef to <4 x i16> + %V8I16 = fptoui <8 x float> undef to <8 x i16> + %V16I16 = fptoui <16 x float> undef to <16 x i16> + ret i32 undef +} + +define i32 @fptoui_float_i8(i32 %arg) { +; SSE-LABEL: 'fptoui_float_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'fptoui_float_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'fptoui_float_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'fptoui_float_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = fptoui float undef to i8 + %V4I8 = fptoui <4 x float> undef to <4 x i8> + %V8I8 = fptoui <8 x float> undef to <8 x i8> + %V16I8 = fptoui <16 x float> undef to <16 x i8> + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/fshl.ll b/test/Analysis/CostModel/X86/fshl.ll new file mode 100644 index 0000000000000000000000000000000000000000..d5da8e7ae4da41c356aad2139bae9929de1a0fea --- /dev/null +++ b/test/Analysis/CostModel/X86/fshl.ll @@ -0,0 +1,2632 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=slm | FileCheck %s --check-prefixes=SLM +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=BDVER2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; +; Variable Funnel Shifts +; + +define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) { +; SSSE3-LABEL: 'var_funnel_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_funnel_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_funnel_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_funnel_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'var_funnel_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_funnel_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_funnel_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; GLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_funnel_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_funnel_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) + %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) + %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) + %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) + ret void +} + +define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) { +; SSSE3-LABEL: 'var_funnel_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_funnel_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_funnel_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_funnel_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'var_funnel_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_funnel_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_funnel_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_funnel_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_funnel_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) + %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) + %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) + %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) + ret void +} + +define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) { +; SSSE3-LABEL: 'var_funnel_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_funnel_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_funnel_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_funnel_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'var_funnel_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'var_funnel_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'var_funnel_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_funnel_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; SLM-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_funnel_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; GLM-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_funnel_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_funnel_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) + %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) + %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) + %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) + ret void +} + +define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) { +; SSSE3-LABEL: 'var_funnel_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 228 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_funnel_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_funnel_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_funnel_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'var_funnel_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'var_funnel_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'var_funnel_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_funnel_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_funnel_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_funnel_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_funnel_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) + %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) + %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) + %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) + ret void +} + +; +; Uniform Variable Funnel Shifts +; + +define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) { +; SSSE3-LABEL: 'splatvar_funnel_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'splatvar_funnel_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_funnel_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_funnel_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatvar_funnel_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_funnel_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_funnel_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_funnel_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_funnel_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer + %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer + %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer + %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) + %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) + %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) + ret void +} + +define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) { +; SSE-LABEL: 'splatvar_funnel_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_funnel_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_funnel_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatvar_funnel_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_funnel_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_funnel_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_funnel_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_funnel_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer + %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer + %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer + %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) + %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) + %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) + ret void +} + +define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) { +; SSE-LABEL: 'splatvar_funnel_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_funnel_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_funnel_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatvar_funnel_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatvar_funnel_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatvar_funnel_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_funnel_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_funnel_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_funnel_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_funnel_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer + %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer + %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer + %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) + %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) + %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) + ret void +} + +define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) { +; SSSE3-LABEL: 'splatvar_funnel_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 228 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'splatvar_funnel_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_funnel_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_funnel_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatvar_funnel_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatvar_funnel_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatvar_funnel_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_funnel_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_funnel_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_funnel_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_funnel_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer + %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer + %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer + %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) + %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) + %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) + ret void +} + +; +; Constant Funnel Shifts +; + +define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) { +; SSSE3-LABEL: 'constant_funnel_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_funnel_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_funnel_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_funnel_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'constant_funnel_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_funnel_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_funnel_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_funnel_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_funnel_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) + %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) + %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) + %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) + ret void +} + +define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) { +; SSSE3-LABEL: 'constant_funnel_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_funnel_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_funnel_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_funnel_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'constant_funnel_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_funnel_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_funnel_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_funnel_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_funnel_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) + %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) + %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) + %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) + ret void +} + +define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) { +; SSSE3-LABEL: 'constant_funnel_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_funnel_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_funnel_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_funnel_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'constant_funnel_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'constant_funnel_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'constant_funnel_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_funnel_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_funnel_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_funnel_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_funnel_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) + %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) + %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) + %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) + ret void +} + +define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) { +; SSSE3-LABEL: 'constant_funnel_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_funnel_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_funnel_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_funnel_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'constant_funnel_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'constant_funnel_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'constant_funnel_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_funnel_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_funnel_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_funnel_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_funnel_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) + %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) + %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) + %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) + ret void +} + +; +; Uniform Constant Funnel Shifts +; + +define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) { +; SSSE3-LABEL: 'splatconstant_funnel_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'splatconstant_funnel_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_funnel_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_funnel_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatconstant_funnel_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_funnel_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_funnel_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_funnel_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_funnel_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) + %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) + %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) + %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) + ret void +} + +define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) { +; SSE-LABEL: 'splatconstant_funnel_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_funnel_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_funnel_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatconstant_funnel_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_funnel_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_funnel_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_funnel_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_funnel_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) + %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) + %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) + %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) + ret void +} + +define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) { +; SSE-LABEL: 'splatconstant_funnel_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_funnel_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_funnel_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatconstant_funnel_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatconstant_funnel_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatconstant_funnel_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_funnel_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_funnel_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_funnel_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_funnel_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) + %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) + %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) + %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) + ret void +} + +define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) { +; SSE-LABEL: 'splatconstant_funnel_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_funnel_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_funnel_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatconstant_funnel_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatconstant_funnel_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatconstant_funnel_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_funnel_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_funnel_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_funnel_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_funnel_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) + %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) + %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) + %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) + ret void +} + +; +; Variable Unary Funnel Shifts (Rotates) +; + +define void @var_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) { +; SSE-LABEL: 'var_rotate_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64) +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; SSE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; SSE-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_rotate_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_rotate_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'var_rotate_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_rotate_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64) +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_rotate_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64) +; GLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_rotate_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_rotate_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64) + %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) + %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) + %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) + ret void +} + +define void @var_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) { +; SSSE3-LABEL: 'var_rotate_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_rotate_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_rotate_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_rotate_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'var_rotate_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_rotate_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) +; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_rotate_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) +; GLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_rotate_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_rotate_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) + %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) + %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) + %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) + ret void +} + +define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) { +; SSSE3-LABEL: 'var_rotate_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 268 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_rotate_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_rotate_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_rotate_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'var_rotate_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'var_rotate_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'var_rotate_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_rotate_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; SLM-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_rotate_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; GLM-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_rotate_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_rotate_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) + %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) + %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) + %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) + ret void +} + +define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) { +; SSSE3-LABEL: 'var_rotate_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_rotate_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_rotate_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_rotate_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'var_rotate_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'var_rotate_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'var_rotate_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_rotate_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_rotate_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; GLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_rotate_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_rotate_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) + %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) + %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) + %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) + ret void +} + +; +; Uniform Variable Unary Funnel Shifts (Rotates) +; + +define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) { +; SSE-LABEL: 'splatvar_rotate_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_rotate_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_rotate_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatvar_rotate_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_rotate_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_rotate_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_rotate_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_rotate_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer + %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer + %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer + %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) + %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) + %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) + ret void +} + +define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) { +; SSE-LABEL: 'splatvar_rotate_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_rotate_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_rotate_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatvar_rotate_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_rotate_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_rotate_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_rotate_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_rotate_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer + %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer + %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer + %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) + %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) + %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) + ret void +} + +define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) { +; SSE-LABEL: 'splatvar_rotate_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_rotate_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_rotate_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatvar_rotate_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatvar_rotate_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatvar_rotate_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_rotate_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_rotate_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_rotate_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_rotate_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer + %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer + %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer + %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) + %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) + %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) + ret void +} + +define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) { +; SSSE3-LABEL: 'splatvar_rotate_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'splatvar_rotate_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_rotate_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_rotate_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatvar_rotate_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatvar_rotate_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatvar_rotate_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_rotate_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_rotate_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_rotate_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_rotate_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer + %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer + %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer + %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) + %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) + %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) + ret void +} + +; +; Constant Unary Funnel Shifts (Rotates) +; + +define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) { +; SSE-LABEL: 'constant_rotate_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_rotate_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_rotate_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'constant_rotate_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_rotate_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_rotate_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_rotate_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_rotate_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) + %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) + %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) + %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) + ret void +} + +define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) { +; SSSE3-LABEL: 'constant_rotate_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_rotate_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_rotate_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_rotate_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'constant_rotate_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_rotate_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_rotate_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_rotate_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_rotate_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) + %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) + %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) + %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) + ret void +} + +define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) { +; SSSE3-LABEL: 'constant_rotate_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_rotate_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_rotate_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_rotate_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'constant_rotate_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'constant_rotate_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'constant_rotate_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_rotate_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_rotate_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_rotate_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_rotate_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) + %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) + %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) + %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) + ret void +} + +define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) { +; SSSE3-LABEL: 'constant_rotate_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_rotate_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_rotate_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_rotate_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'constant_rotate_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'constant_rotate_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'constant_rotate_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_rotate_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_rotate_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_rotate_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_rotate_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) + %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) + %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) + %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) + ret void +} + +; +; Uniform Constant Unary Funnel Shifts (Rotates) +; + +define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) { +; SSE-LABEL: 'splatconstant_rotate_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_rotate_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_rotate_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatconstant_rotate_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_rotate_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_rotate_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_rotate_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_rotate_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) + %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) + %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) + %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) + ret void +} + +define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) { +; SSE-LABEL: 'splatconstant_rotate_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_rotate_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_rotate_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatconstant_rotate_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_rotate_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_rotate_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_rotate_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_rotate_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) + %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) + %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) + %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) + ret void +} + +define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) { +; SSE-LABEL: 'splatconstant_rotate_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_rotate_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_rotate_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatconstant_rotate_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatconstant_rotate_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatconstant_rotate_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_rotate_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_rotate_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_rotate_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_rotate_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) + %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) + %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) + %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) + ret void +} + +define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) { +; SSE-LABEL: 'splatconstant_rotate_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_rotate_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_rotate_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatconstant_rotate_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatconstant_rotate_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatconstant_rotate_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_rotate_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_rotate_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_rotate_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_rotate_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) + %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) + %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) + %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) + ret void +} + +declare i64 @llvm.fshl.i64(i64, i64, i64) +declare i32 @llvm.fshl.i32(i32, i32, i32) +declare i16 @llvm.fshl.i16(i16, i16, i16) +declare i8 @llvm.fshl.i8 (i8, i8, i8) + +declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) +declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) + +declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) +declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>) +declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>) +declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>) + +declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>) +declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>) +declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>) +declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>) diff --git a/test/Analysis/CostModel/X86/fshr.ll b/test/Analysis/CostModel/X86/fshr.ll new file mode 100644 index 0000000000000000000000000000000000000000..7450fc73d40703a1959b5cf18fb582dd5be21663 --- /dev/null +++ b/test/Analysis/CostModel/X86/fshr.ll @@ -0,0 +1,2633 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=slm | FileCheck %s --check-prefixes=SLM +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=BDVER2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; +; Variable Funnel Shifts +; + +define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) { +; SSSE3-LABEL: 'var_funnel_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_funnel_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_funnel_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_funnel_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'var_funnel_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_funnel_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_funnel_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; GLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_funnel_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_funnel_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) + %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) + %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) + %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) + ret void +} + +define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) { +; SSSE3-LABEL: 'var_funnel_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_funnel_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_funnel_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_funnel_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'var_funnel_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_funnel_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_funnel_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_funnel_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_funnel_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) + %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) + %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) + %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) + ret void +} + +define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) { +; SSSE3-LABEL: 'var_funnel_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_funnel_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_funnel_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_funnel_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'var_funnel_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'var_funnel_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'var_funnel_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_funnel_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; SLM-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_funnel_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; GLM-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_funnel_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_funnel_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) + %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) + %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) + %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) + ret void +} + +define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) { +; SSSE3-LABEL: 'var_funnel_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 228 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_funnel_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_funnel_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_funnel_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'var_funnel_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'var_funnel_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'var_funnel_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_funnel_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_funnel_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_funnel_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_funnel_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) + %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) + %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) + %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) + ret void +} + +; +; Uniform Variable Funnel Shifts +; + +define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) { +; SSSE3-LABEL: 'splatvar_funnel_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'splatvar_funnel_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_funnel_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_funnel_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatvar_funnel_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_funnel_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_funnel_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_funnel_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_funnel_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer + %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer + %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer + %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) + %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) + %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) + ret void +} + +define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) { +; SSE-LABEL: 'splatvar_funnel_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_funnel_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_funnel_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatvar_funnel_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_funnel_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_funnel_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_funnel_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_funnel_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer + %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer + %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer + %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) + %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) + %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) + ret void +} + +define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) { +; SSE-LABEL: 'splatvar_funnel_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_funnel_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_funnel_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatvar_funnel_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatvar_funnel_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatvar_funnel_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_funnel_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_funnel_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_funnel_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_funnel_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer + %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer + %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer + %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) + %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) + %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) + ret void +} + +define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) { +; SSSE3-LABEL: 'splatvar_funnel_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 228 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'splatvar_funnel_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_funnel_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_funnel_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatvar_funnel_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatvar_funnel_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatvar_funnel_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_funnel_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_funnel_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_funnel_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_funnel_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer + %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer + %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer + %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) + %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) + %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) + ret void +} + +; +; Constant Funnel Shifts +; + +define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) { +; SSSE3-LABEL: 'constant_funnel_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_funnel_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_funnel_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_funnel_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'constant_funnel_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_funnel_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_funnel_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_funnel_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_funnel_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) + %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) + %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) + %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) + ret void +} + +define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) { +; SSSE3-LABEL: 'constant_funnel_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_funnel_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_funnel_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_funnel_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'constant_funnel_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_funnel_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_funnel_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_funnel_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_funnel_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) + %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) + %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) + %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) + ret void +} + +define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) { +; SSSE3-LABEL: 'constant_funnel_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_funnel_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_funnel_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_funnel_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'constant_funnel_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'constant_funnel_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'constant_funnel_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_funnel_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_funnel_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_funnel_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_funnel_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) + %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) + %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) + %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) + ret void +} + +define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) { +; SSSE3-LABEL: 'constant_funnel_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_funnel_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_funnel_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_funnel_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'constant_funnel_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'constant_funnel_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'constant_funnel_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_funnel_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_funnel_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_funnel_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_funnel_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) + %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) + %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) + %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) + ret void +} + +; +; Uniform Constant Funnel Shifts +; + +define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) { +; SSSE3-LABEL: 'splatconstant_funnel_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'splatconstant_funnel_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_funnel_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_funnel_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatconstant_funnel_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_funnel_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_funnel_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_funnel_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_funnel_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) + %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) + %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) + %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) + ret void +} + +define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) { +; SSE-LABEL: 'splatconstant_funnel_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_funnel_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_funnel_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatconstant_funnel_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_funnel_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_funnel_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_funnel_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_funnel_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) + %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) + %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) + %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) + ret void +} + +define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) { +; SSE-LABEL: 'splatconstant_funnel_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_funnel_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_funnel_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatconstant_funnel_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatconstant_funnel_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatconstant_funnel_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_funnel_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_funnel_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_funnel_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_funnel_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) + %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) + %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) + %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) + ret void +} + +define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) { +; SSE-LABEL: 'splatconstant_funnel_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_funnel_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_funnel_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatconstant_funnel_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatconstant_funnel_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatconstant_funnel_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_funnel_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_funnel_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_funnel_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_funnel_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) + %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) + %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) + %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) + ret void +} + +; +; Variable Unary Funnel Shifts (Rotates) +; +; TODO - Add constant and uniform-constant tests + +define void @var_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) { +; SSE-LABEL: 'var_rotate_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64) +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; SSE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; SSE-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_rotate_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_rotate_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'var_rotate_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_rotate_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64) +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_rotate_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64) +; GLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_rotate_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_rotate_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64) + %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) + %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) + %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) + ret void +} + +define void @var_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) { +; SSSE3-LABEL: 'var_rotate_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_rotate_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_rotate_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_rotate_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'var_rotate_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_rotate_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) +; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_rotate_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) +; GLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_rotate_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_rotate_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) + %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) + %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) + %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) + ret void +} + +define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) { +; SSSE3-LABEL: 'var_rotate_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 268 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_rotate_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_rotate_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_rotate_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'var_rotate_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'var_rotate_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'var_rotate_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_rotate_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; SLM-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_rotate_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; GLM-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_rotate_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_rotate_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) + %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) + %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) + %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) + ret void +} + +define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) { +; SSSE3-LABEL: 'var_rotate_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'var_rotate_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'var_rotate_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'var_rotate_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'var_rotate_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'var_rotate_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'var_rotate_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'var_rotate_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; SLM-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'var_rotate_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; GLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; GLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; GLM-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'var_rotate_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'var_rotate_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) + %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) + %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) + %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) + ret void +} + +; +; Uniform Variable Unary Funnel Shifts (Rotates) +; + +define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) { +; SSE-LABEL: 'splatvar_rotate_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_rotate_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_rotate_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatvar_rotate_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_rotate_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_rotate_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_rotate_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_rotate_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer + %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer + %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer + %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) + %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) + %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) + ret void +} + +define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) { +; SSE-LABEL: 'splatvar_rotate_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_rotate_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_rotate_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatvar_rotate_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_rotate_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_rotate_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_rotate_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_rotate_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer + %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer + %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer + %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) + %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) + %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) + ret void +} + +define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) { +; SSE-LABEL: 'splatvar_rotate_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_rotate_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_rotate_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatvar_rotate_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatvar_rotate_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatvar_rotate_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_rotate_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_rotate_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_rotate_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_rotate_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer + %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer + %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer + %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) + %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) + %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) + ret void +} + +define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) { +; SSSE3-LABEL: 'splatvar_rotate_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'splatvar_rotate_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; SSE42-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatvar_rotate_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX1-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatvar_rotate_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatvar_rotate_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatvar_rotate_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatvar_rotate_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatvar_rotate_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; SLM-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatvar_rotate_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; GLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; GLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; GLM-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatvar_rotate_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatvar_rotate_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer + %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer + %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer + %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) + %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) + %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) + ret void +} + +; +; Constant Unary Funnel Shifts (Rotates) +; + +define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) { +; SSE-LABEL: 'constant_rotate_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_rotate_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_rotate_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'constant_rotate_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_rotate_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_rotate_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_rotate_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_rotate_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) + %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) + %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) + %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) + ret void +} + +define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) { +; SSSE3-LABEL: 'constant_rotate_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_rotate_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_rotate_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_rotate_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'constant_rotate_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_rotate_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_rotate_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_rotate_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_rotate_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) + %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) + %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) + %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) + ret void +} + +define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) { +; SSSE3-LABEL: 'constant_rotate_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_rotate_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_rotate_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_rotate_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'constant_rotate_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'constant_rotate_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'constant_rotate_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_rotate_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_rotate_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_rotate_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_rotate_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) + %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) + %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) + %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) + ret void +} + +define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) { +; SSSE3-LABEL: 'constant_rotate_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'constant_rotate_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'constant_rotate_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'constant_rotate_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'constant_rotate_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'constant_rotate_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'constant_rotate_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'constant_rotate_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'constant_rotate_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'constant_rotate_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'constant_rotate_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) + %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) + %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) + %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) + ret void +} + +; +; Uniform Constant Unary Funnel Shifts (Rotates) +; + +define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) { +; SSE-LABEL: 'splatconstant_rotate_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_rotate_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_rotate_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatconstant_rotate_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_rotate_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_rotate_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_rotate_i64' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_rotate_i64' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) + %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) + %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) + %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) + ret void +} + +define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) { +; SSE-LABEL: 'splatconstant_rotate_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_rotate_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_rotate_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'splatconstant_rotate_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_rotate_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_rotate_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_rotate_i32' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_rotate_i32' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) + %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) + %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) + %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) + ret void +} + +define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) { +; SSE-LABEL: 'splatconstant_rotate_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_rotate_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_rotate_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatconstant_rotate_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatconstant_rotate_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatconstant_rotate_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_rotate_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_rotate_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_rotate_i16' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_rotate_i16' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) + %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) + %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) + %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) + ret void +} + +define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) { +; SSE-LABEL: 'splatconstant_rotate_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'splatconstant_rotate_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'splatconstant_rotate_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'splatconstant_rotate_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'splatconstant_rotate_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'splatconstant_rotate_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SLM-LABEL: 'splatconstant_rotate_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'splatconstant_rotate_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BDVER2-LABEL: 'splatconstant_rotate_i8' +; BDVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; BDVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; BTVER2-LABEL: 'splatconstant_rotate_i8' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) + %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) + %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) + %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) + ret void +} + +declare i64 @llvm.fshr.i64(i64, i64, i64) +declare i32 @llvm.fshr.i32(i32, i32, i32) +declare i16 @llvm.fshr.i16(i16, i16, i16) +declare i8 @llvm.fshr.i8 (i8, i8, i8) + +declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) +declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) + +declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) +declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>) +declare <16 x i16> @llvm.fshr.v16i16(<16 x i16>, <16 x i16>, <16 x i16>) +declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>) + +declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>) +declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>) +declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>) +declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>) diff --git a/test/Analysis/CostModel/X86/interleave-load-i32.ll b/test/Analysis/CostModel/X86/interleave-load-i32.ll index 3c94d8c446f95605d9c9b4e637716d83fd2ee378..697b5000870aa88c2a6eb5f4b32bd545c4c70e01 100755 --- a/test/Analysis/CostModel/X86/interleave-load-i32.ll +++ b/test/Analysis/CostModel/X86/interleave-load-i32.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Analysis/CostModel/X86/interleave-store-i32.ll b/test/Analysis/CostModel/X86/interleave-store-i32.ll index e3076bfa294b8178f84bc22bbcd1fd6481f62510..d2901d0e37898cbae8e79437122a9996daf4abf7 100755 --- a/test/Analysis/CostModel/X86/interleave-store-i32.ll +++ b/test/Analysis/CostModel/X86/interleave-store-i32.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Analysis/CostModel/X86/masked-intrinsic-cost-widen.ll b/test/Analysis/CostModel/X86/masked-intrinsic-cost-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..7ca1c4ec6df0e701b3e3b8c1f49db25ba8e01f06 --- /dev/null +++ b/test/Analysis/CostModel/X86/masked-intrinsic-cost-widen.ll @@ -0,0 +1,606 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -S -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze | FileCheck %s --check-prefix=AVX2 +; RUN: opt < %s -S -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=skylake -cost-model -analyze | FileCheck %s --check-prefix=SKL +; RUN: opt < %s -S -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze | FileCheck %s --check-prefix=KNL +; RUN: opt < %s -S -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze | FileCheck %s --check-prefix=SKX + + +define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) { +; AVX2-LABEL: 'test1' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res +; +; SKL-LABEL: 'test1' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res +; +; KNL-LABEL: 'test1' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res +; +; SKX-LABEL: 'test1' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res +; + %mask = icmp eq <2 x i64> %trigger, zeroinitializer + %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) + ret <2 x double> %res +} + +define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) { +; AVX2-LABEL: 'test2' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test2' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test2' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test2' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) + ret <4 x i32> %res +} + +define void @test3(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { +; AVX2-LABEL: 'test3' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKL-LABEL: 'test3' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; KNL-LABEL: 'test3' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'test3' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask) + ret void +} + +define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) { +; AVX2-LABEL: 'test4' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res +; +; SKL-LABEL: 'test4' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res +; +; KNL-LABEL: 'test4' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res +; +; SKX-LABEL: 'test4' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res +; + %mask = icmp eq <8 x i32> %trigger, zeroinitializer + %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst) + ret <8 x float> %res +} + +define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { +; AVX2-LABEL: 'test5' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKL-LABEL: 'test5' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; KNL-LABEL: 'test5' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'test5' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask) + ret void +} + +define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { +; AVX2-LABEL: 'test6' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKL-LABEL: 'test6' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; KNL-LABEL: 'test6' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'test6' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) + ret void +} + +define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { +; AVX2-LABEL: 'test7' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res +; +; SKL-LABEL: 'test7' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res +; +; KNL-LABEL: 'test7' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res +; +; SKX-LABEL: 'test7' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + ret <2 x float> %res +} + +define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { +; AVX2-LABEL: 'test8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res +; +; SKL-LABEL: 'test8' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res +; +; KNL-LABEL: 'test8' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res +; +; SKX-LABEL: 'test8' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + ret <2 x i32> %res +} + +define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0) { +; AVX2-LABEL: 'test_gather_2f64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res +; +; SKL-LABEL: 'test_gather_2f64' +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res +; +; KNL-LABEL: 'test_gather_2f64' +; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res +; +; SKX-LABEL: 'test_gather_2f64' +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res +; + + + + +%res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) + ret <2 x double> %res +} +declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0) + +define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { +; AVX2-LABEL: 'test_gather_4i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; + + + + +%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) + ret <4 x i32> %res +} + +define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0) { +; AVX2-LABEL: 'test_gather_4i32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res +; + + + + +%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0) + +define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) { +; AVX2-LABEL: 'test_gather_16f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; KNL-LABEL: 'test_gather_16f32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; SKX-LABEL: 'test_gather_16f32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <16 x i1>%mask) { +; AVX2-LABEL: 'test_gather_16f32_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; KNL-LABEL: 'test_gather_16f32_var_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; SKX-LABEL: 'test_gather_16f32_var_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i32> %ind, <16 x i1>%mask) { +; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_ra_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; KNL-LABEL: 'test_gather_16f32_ra_var_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; SKX-LABEL: 'test_gather_16f32_ra_var_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) { +; AVX2-LABEL: 'test_gather_16f32_const_mask2' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask2' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; KNL-LABEL: 'test_gather_16f32_const_mask2' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; +; SKX-LABEL: 'test_gather_16f32_const_mask2' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res +; + %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 + %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { +; AVX2-LABEL: 'test_scatter_16i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKL-LABEL: 'test_scatter_16i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SKL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; KNL-LABEL: 'test_scatter_16i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; KNL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'test_scatter_16i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SKX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 + %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer + + %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind + %imask = bitcast i16 %mask to <16 x i1> + call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) + ret void +} + +define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) { +; AVX2-LABEL: 'test_scatter_8i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKL-LABEL: 'test_scatter_8i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; KNL-LABEL: 'test_scatter_8i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'test_scatter_8i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) + ret void +} + +declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask) + +define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { +; AVX2-LABEL: 'test_scatter_4i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKL-LABEL: 'test_scatter_4i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; KNL-LABEL: 'test_scatter_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX-LABEL: 'test_scatter_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) + ret void +} + +define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask) { +; AVX2-LABEL: 'test_gather_4f32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res +; +; SKL-LABEL: 'test_gather_4f32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) + ret <4 x float>%res +} + +define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) { +; AVX2-LABEL: 'test_gather_4f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res +; +; SKL-LABEL: 'test_gather_4f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) + ret <4 x float>%res +} + +declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> ) +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask) +declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask) +declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>) + +declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) +declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>) +declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) +declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) +declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) +declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) +declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) +declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) +declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) diff --git a/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll index 18dc250de814394b4053826c75257ce8b58f5529..5f2a51c7f938bd8e387cb0746c599cb992fb13a2 100644 --- a/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -112,22 +112,22 @@ define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %d define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { ; AVX2-LABEL: 'test5' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SKL-LABEL: 'test5' ; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; KNL-LABEL: 'test5' ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SKX-LABEL: 'test5' ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) ; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -164,22 +164,22 @@ define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { ; AVX2-LABEL: 'test7' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res ; ; SKL-LABEL: 'test7' ; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res ; ; KNL-LABEL: 'test7' ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res ; ; SKX-LABEL: 'test7' ; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) ; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer diff --git a/test/Analysis/CostModel/X86/min-legal-vector-width.ll b/test/Analysis/CostModel/X86/min-legal-vector-width.ll new file mode 100644 index 0000000000000000000000000000000000000000..028ed87a9896084c83e536bbb94822b09cad38e2 --- /dev/null +++ b/test/Analysis/CostModel/X86/min-legal-vector-width.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,VEC256,AVX +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512vl,+avx512bw,+avx512dq,+prefer-256-bit | FileCheck %s --check-prefixes=CHECK,VEC256,SKX256 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512vl,+avx512bw,+avx512dq,-prefer-256-bit | FileCheck %s --check-prefixes=CHECK,VEC512 + +define void @zext256() "min-legal-vector-width"="256" { +; VEC256-LABEL: 'zext256' +; VEC256-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = zext <8 x i16> undef to <8 x i64> +; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B = zext <8 x i32> undef to <8 x i64> +; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C = zext <16 x i8> undef to <16 x i32> +; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <16 x i16> undef to <16 x i32> +; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %E = zext <32 x i8> undef to <32 x i16> +; VEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VEC512-LABEL: 'zext256' +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = zext <8 x i16> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i32> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = zext <16 x i8> undef to <16 x i32> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = zext <16 x i16> undef to <16 x i32> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %E = zext <32 x i8> undef to <32 x i16> +; VEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %A = zext <8 x i16> undef to <8 x i64> + %B = zext <8 x i32> undef to <8 x i64> + %C = zext <16 x i8> undef to <16 x i32> + %D = zext <16 x i16> undef to <16 x i32> + %E = zext <32 x i8> undef to <32 x i16> + ret void +} + +define void @zext512() "min-legal-vector-width"="512" { +; AVX-LABEL: 'zext512' +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = zext <8 x i16> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B = zext <8 x i32> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C = zext <16 x i8> undef to <16 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <16 x i16> undef to <16 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %E = zext <32 x i8> undef to <32 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX256-LABEL: 'zext512' +; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = zext <8 x i16> undef to <8 x i64> +; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i32> undef to <8 x i64> +; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = zext <16 x i8> undef to <16 x i32> +; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = zext <16 x i16> undef to <16 x i32> +; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %E = zext <32 x i8> undef to <32 x i16> +; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VEC512-LABEL: 'zext512' +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = zext <8 x i16> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i32> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = zext <16 x i8> undef to <16 x i32> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = zext <16 x i16> undef to <16 x i32> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %E = zext <32 x i8> undef to <32 x i16> +; VEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %A = zext <8 x i16> undef to <8 x i64> + %B = zext <8 x i32> undef to <8 x i64> + %C = zext <16 x i8> undef to <16 x i32> + %D = zext <16 x i16> undef to <16 x i32> + %E = zext <32 x i8> undef to <32 x i16> + ret void +} + +define void @sext256() "min-legal-vector-width"="256" { +; VEC256-LABEL: 'sext256' +; VEC256-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = sext <8 x i8> undef to <8 x i64> +; VEC256-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %B = sext <8 x i16> undef to <8 x i64> +; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C = sext <8 x i32> undef to <8 x i64> +; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = sext <16 x i8> undef to <16 x i32> +; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %E = sext <16 x i16> undef to <16 x i32> +; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %F = sext <32 x i8> undef to <32 x i16> +; VEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VEC512-LABEL: 'sext256' +; VEC512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %E = sext <16 x i16> undef to <16 x i32> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F = sext <32 x i8> undef to <32 x i16> +; VEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %A = sext <8 x i8> undef to <8 x i64> + %B = sext <8 x i16> undef to <8 x i64> + %C = sext <8 x i32> undef to <8 x i64> + %D = sext <16 x i8> undef to <16 x i32> + %E = sext <16 x i16> undef to <16 x i32> + %F = sext <32 x i8> undef to <32 x i16> + ret void +} + +define void @sext512() "min-legal-vector-width"="512" { +; AVX-LABEL: 'sext512' +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = sext <8 x i8> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %B = sext <8 x i16> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C = sext <8 x i32> undef to <8 x i64> +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = sext <16 x i8> undef to <16 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %E = sext <16 x i16> undef to <16 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %F = sext <32 x i8> undef to <32 x i16> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SKX256-LABEL: 'sext512' +; SKX256-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64> +; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64> +; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64> +; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32> +; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %E = sext <16 x i16> undef to <16 x i32> +; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F = sext <32 x i8> undef to <32 x i16> +; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VEC512-LABEL: 'sext512' +; VEC512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %E = sext <16 x i16> undef to <16 x i32> +; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F = sext <32 x i8> undef to <32 x i16> +; VEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %A = sext <8 x i8> undef to <8 x i64> + %B = sext <8 x i16> undef to <8 x i64> + %C = sext <8 x i32> undef to <8 x i64> + %D = sext <16 x i8> undef to <16 x i32> + %E = sext <16 x i16> undef to <16 x i32> + %F = sext <32 x i8> undef to <32 x i16> + ret void +} diff --git a/test/Analysis/CostModel/X86/reduce-add-widen.ll b/test/Analysis/CostModel/X86/reduce-add-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..5230511b11eadc48430460ab2bb00c9ce924ef9e --- /dev/null +++ b/test/Analysis/CostModel/X86/reduce-add-widen.ll @@ -0,0 +1,307 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ + +define i32 @reduce_i64(i32 %arg) { +; SSE2-LABEL: 'reduce_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'reduce_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) + ret i32 undef +} + +define i32 @reduce_i32(i32 %arg) { +; SSE2-LABEL: 'reduce_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'reduce_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) + ret i32 undef +} + +define i32 @reduce_i16(i32 %arg) { +; SSE2-LABEL: 'reduce_i16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) + ret i32 undef +} + +define i32 @reduce_i8(i32 %arg) { +; SSE2-LABEL: 'reduce_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) + ret i32 undef +} + +declare i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64>) +declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-add.ll b/test/Analysis/CostModel/X86/reduce-add.ll index 97f7a75ffa2c9e106c98213e0e99a974c5464275..bb762233385906d8a810ce008fecc99dd0dc1669 100644 --- a/test/Analysis/CostModel/X86/reduce-add.ll +++ b/test/Analysis/CostModel/X86/reduce-add.ll @@ -1,28 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' @@ -45,8 +45,8 @@ define i32 @reduce_i64(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef) @@ -59,19 +59,19 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' @@ -94,8 +94,8 @@ define i32 @reduce_i32(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef) @@ -108,22 +108,25 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) @@ -132,45 +135,51 @@ define i32 @reduce_i16(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef) %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef) %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef) @@ -181,69 +190,87 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8> undef) %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef) %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef) %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef) @@ -264,12 +291,15 @@ declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>) declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>) declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>) +declare i16 @llvm.experimental.vector.reduce.add.i16.v2i16(<2 x i16>) declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>) declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>) declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>) declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>) declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>) +declare i8 @llvm.experimental.vector.reduce.add.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.add.i8.v4i8(<4 x i8>) declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>) declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>) declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-and-widen.ll b/test/Analysis/CostModel/X86/reduce-and-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..6f9db08b7076850b95573153e96b1023ad63befb --- /dev/null +++ b/test/Analysis/CostModel/X86/reduce-and-widen.ll @@ -0,0 +1,400 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ + +define i32 @reduce_i64(i32 %arg) { +; SSE-LABEL: 'reduce_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) + ret i32 undef +} + +define i32 @reduce_i32(i32 %arg) { +; SSE-LABEL: 'reduce_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) + ret i32 undef +} + +define i32 @reduce_i16(i32 %arg) { +; SSE2-LABEL: 'reduce_i16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) + ret i32 undef +} + +define i32 @reduce_i8(i32 %arg) { +; SSE2-LABEL: 'reduce_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) + ret i32 undef +} + +define i32 @reduce_i1(i32 %arg) { +; SSE2-LABEL: 'reduce_i1' +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i1' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i1' +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i1' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i1' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i1' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i1' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i1' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) + %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) + %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) + %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) + %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) + %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) + %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) + %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) + ret i32 undef +} + +declare i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64>) +declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8>) + +declare i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1>) +declare i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1>) +declare i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1>) +declare i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1>) +declare i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1>) +declare i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1>) +declare i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1>) +declare i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1>) diff --git a/test/Analysis/CostModel/X86/reduce-and.ll b/test/Analysis/CostModel/X86/reduce-and.ll index 1dfa0953c286e2d5ceec9f3be962d39102f30da8..35caf489c0045ed8b79d31e7b85b0cf7bd0005bd 100644 --- a/test/Analysis/CostModel/X86/reduce-and.ll +++ b/test/Analysis/CostModel/X86/reduce-and.ll @@ -1,44 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef) @@ -51,35 +51,35 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef) @@ -92,69 +92,78 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef) %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef) %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef) @@ -165,69 +174,87 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8> undef) %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef) %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef) %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef) @@ -239,90 +266,90 @@ define i32 @reduce_i8(i32 %arg) { define i32 @reduce_i1(i32 %arg) { ; SSE2-LABEL: 'reduce_i1' ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i1' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i1' ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef) @@ -348,12 +375,15 @@ declare i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32>) declare i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32>) declare i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32>) +declare i16 @llvm.experimental.vector.reduce.and.i16.v2i16(<2 x i16>) declare i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16>) declare i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16>) declare i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16>) declare i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16>) declare i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16>) +declare i8 @llvm.experimental.vector.reduce.and.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.and.i8.v4i8(<4 x i8>) declare i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8>) declare i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8>) declare i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-mul-widen.ll b/test/Analysis/CostModel/X86/reduce-mul-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..1331f78f0b4b65e2680eb19adfff1c25e6c1e374 --- /dev/null +++ b/test/Analysis/CostModel/X86/reduce-mul-widen.ll @@ -0,0 +1,323 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ + +define i32 @reduce_i64(i32 %arg) { +; SSE-LABEL: 'reduce_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i64' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i64' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i64' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) + ret i32 undef +} + +define i32 @reduce_i32(i32 %arg) { +; SSE2-LABEL: 'reduce_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) + ret i32 undef +} + +define i32 @reduce_i16(i32 %arg) { +; SSE2-LABEL: 'reduce_i16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) + ret i32 undef +} + +define i32 @reduce_i8(i32 %arg) { +; SSE2-LABEL: 'reduce_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 197 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 249 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 157 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 115 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) + ret i32 undef +} + +declare i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64>) +declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-mul.ll b/test/Analysis/CostModel/X86/reduce-mul.ll index 97e67a92f8f351cd8198deb74d735c1bcd313cc7..68f94cdfec9901692d4ab6e2494dd8f699b076ba 100644 --- a/test/Analysis/CostModel/X86/reduce-mul.ll +++ b/test/Analysis/CostModel/X86/reduce-mul.ll @@ -1,60 +1,60 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 154 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i64' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i64' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i64' ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef) @@ -67,67 +67,67 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 105 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 105 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i32' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i32' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i32' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef) @@ -140,69 +140,78 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef) %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef) %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef) @@ -213,69 +222,87 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 275 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 239 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 239 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 255 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 360 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 197 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 249 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 241 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 157 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 197 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 201 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 115 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 197 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> undef) %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef) %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef) %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef) @@ -296,12 +323,15 @@ declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>) declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>) declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>) +declare i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16>) declare i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16>) declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>) declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>) declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>) declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>) +declare i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8>) declare i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8>) declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>) declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-or-widen.ll b/test/Analysis/CostModel/X86/reduce-or-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..8cdef17e438a4be3a367fd67c517b419495cb8aa --- /dev/null +++ b/test/Analysis/CostModel/X86/reduce-or-widen.ll @@ -0,0 +1,400 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ + +define i32 @reduce_i64(i32 %arg) { +; SSE-LABEL: 'reduce_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) + ret i32 undef +} + +define i32 @reduce_i32(i32 %arg) { +; SSE-LABEL: 'reduce_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) + ret i32 undef +} + +define i32 @reduce_i16(i32 %arg) { +; SSE2-LABEL: 'reduce_i16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) + ret i32 undef +} + +define i32 @reduce_i8(i32 %arg) { +; SSE2-LABEL: 'reduce_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) + ret i32 undef +} + +define i32 @reduce_i1(i32 %arg) { +; SSE2-LABEL: 'reduce_i1' +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i1' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i1' +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i1' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i1' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i1' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i1' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i1' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) + %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) + %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) + %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) + %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) + %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) + %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) + %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) + ret i32 undef +} + +declare i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64>) +declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8>) + +declare i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1>) +declare i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1>) +declare i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1>) +declare i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1>) +declare i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1>) +declare i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1>) +declare i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1>) +declare i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1>) diff --git a/test/Analysis/CostModel/X86/reduce-or.ll b/test/Analysis/CostModel/X86/reduce-or.ll index 13814ac2b76efb094876d36ce723ba3a7d09dc71..ed0ef620482c9c4ac25c3b454142b687a0e0db08 100644 --- a/test/Analysis/CostModel/X86/reduce-or.ll +++ b/test/Analysis/CostModel/X86/reduce-or.ll @@ -1,44 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef) @@ -51,35 +51,35 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef) @@ -92,69 +92,78 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef) %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef) %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef) @@ -165,69 +174,87 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8> undef) %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef) %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef) %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef) @@ -239,90 +266,90 @@ define i32 @reduce_i8(i32 %arg) { define i32 @reduce_i1(i32 %arg) { ; SSE2-LABEL: 'reduce_i1' ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i1' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i1' ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef) @@ -348,12 +375,15 @@ declare i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32>) declare i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32>) declare i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32>) +declare i16 @llvm.experimental.vector.reduce.or.i16.v2i16(<2 x i16>) declare i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16>) declare i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16>) declare i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16>) declare i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16>) declare i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16>) +declare i8 @llvm.experimental.vector.reduce.or.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.or.i8.v4i8(<4 x i8>) declare i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8>) declare i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8>) declare i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-smax-widen.ll b/test/Analysis/CostModel/X86/reduce-smax-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..44b01d3707c8ce74607a9e28103558d4594628b2 --- /dev/null +++ b/test/Analysis/CostModel/X86/reduce-smax-widen.ll @@ -0,0 +1,323 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ + +define i32 @reduce_i64(i32 %arg) { +; SSE2-LABEL: 'reduce_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) + ret i32 undef +} + +define i32 @reduce_i32(i32 %arg) { +; SSE2-LABEL: 'reduce_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef) + ret i32 undef +} + +define i32 @reduce_i16(i32 %arg) { +; SSE2-LABEL: 'reduce_i16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) + ret i32 undef +} + +define i32 @reduce_i8(i32 %arg) { +; SSE2-LABEL: 'reduce_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) + ret i32 undef +} + +declare i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64>) +declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-smax.ll b/test/Analysis/CostModel/X86/reduce-smax.ll index 5426c7f9c80b072d9df9ee54f5318c0fedd3891b..b4b9a3ba6379802b4c29809f953d94c51013e1fe 100644 --- a/test/Analysis/CostModel/X86/reduce-smax.ll +++ b/test/Analysis/CostModel/X86/reduce-smax.ll @@ -1,32 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) @@ -34,15 +34,15 @@ define i32 @reduce_i64(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) @@ -50,7 +50,7 @@ define i32 @reduce_i64(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef) @@ -67,19 +67,19 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' @@ -124,22 +124,25 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) @@ -148,14 +151,16 @@ define i32 @reduce_i16(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) @@ -164,6 +169,7 @@ define i32 @reduce_i16(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) @@ -172,14 +178,16 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) @@ -187,6 +195,7 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef) %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef) %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef) @@ -197,69 +206,87 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8> undef) %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef) %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef) %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef) @@ -280,12 +307,15 @@ declare i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32>) declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>) declare i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32>) +declare i16 @llvm.experimental.vector.reduce.smax.i16.v2i16(<2 x i16>) declare i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16>) declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>) declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>) declare i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16>) declare i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16>) +declare i8 @llvm.experimental.vector.reduce.smax.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.smax.i8.v4i8(<4 x i8>) declare i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8>) declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>) declare i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-smin-widen.ll b/test/Analysis/CostModel/X86/reduce-smin-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..18dd54cf026ed04f10eb0309e803c0608ef7bdee --- /dev/null +++ b/test/Analysis/CostModel/X86/reduce-smin-widen.ll @@ -0,0 +1,314 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ + +define i32 @reduce_i64(i32 %arg) { +; SSE2-LABEL: 'reduce_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) + ret i32 undef +} + +define i32 @reduce_i32(i32 %arg) { +; SSE2-LABEL: 'reduce_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef) + ret i32 undef +} + +define i32 @reduce_i16(i32 %arg) { +; SSE2-LABEL: 'reduce_i16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) + ret i32 undef +} + +define i32 @reduce_i8(i32 %arg) { +; SSE2-LABEL: 'reduce_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) + ret i32 undef +} + +declare i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64>) +declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-smin.ll b/test/Analysis/CostModel/X86/reduce-smin.ll index b8076a98513266b217ada124e11a1986815292c1..f420ad736ed35820afd271489c81be1b96f45c29 100644 --- a/test/Analysis/CostModel/X86/reduce-smin.ll +++ b/test/Analysis/CostModel/X86/reduce-smin.ll @@ -1,32 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) @@ -34,15 +34,15 @@ define i32 @reduce_i64(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) @@ -50,7 +50,7 @@ define i32 @reduce_i64(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef) @@ -67,19 +67,19 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' @@ -124,22 +124,25 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) @@ -148,14 +151,16 @@ define i32 @reduce_i16(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) @@ -164,6 +169,7 @@ define i32 @reduce_i16(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) @@ -172,14 +178,16 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) @@ -187,6 +195,7 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef) %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef) %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef) @@ -197,69 +206,87 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8> undef) %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef) %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef) %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef) @@ -280,12 +307,15 @@ declare i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32>) declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>) declare i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32>) +declare i16 @llvm.experimental.vector.reduce.smin.i16.v2i16(<2 x i16>) declare i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16>) declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>) declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>) declare i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16>) declare i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16>) +declare i8 @llvm.experimental.vector.reduce.smin.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.smin.i8.v4i8(<4 x i8>) declare i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8>) declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>) declare i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-umax-widen.ll b/test/Analysis/CostModel/X86/reduce-umax-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..c48cfdf9778bec8855b978b5988e4f6c84249abe --- /dev/null +++ b/test/Analysis/CostModel/X86/reduce-umax-widen.ll @@ -0,0 +1,323 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ + +define i32 @reduce_i64(i32 %arg) { +; SSE2-LABEL: 'reduce_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) + ret i32 undef +} + +define i32 @reduce_i32(i32 %arg) { +; SSE2-LABEL: 'reduce_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef) + ret i32 undef +} + +define i32 @reduce_i16(i32 %arg) { +; SSE2-LABEL: 'reduce_i16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) + ret i32 undef +} + +define i32 @reduce_i8(i32 %arg) { +; SSE2-LABEL: 'reduce_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) + ret i32 undef +} + +declare i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64>) +declare i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-umax.ll b/test/Analysis/CostModel/X86/reduce-umax.ll index 6b947ebc225b6e077a809c22e0fce796359d776d..490b0e469b1a3264cfca75f7c25b2e48116eb610 100644 --- a/test/Analysis/CostModel/X86/reduce-umax.ll +++ b/test/Analysis/CostModel/X86/reduce-umax.ll @@ -1,32 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) @@ -34,15 +34,15 @@ define i32 @reduce_i64(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) @@ -50,7 +50,7 @@ define i32 @reduce_i64(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef) @@ -67,19 +67,19 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' @@ -124,22 +124,25 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) @@ -148,14 +151,16 @@ define i32 @reduce_i16(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) @@ -164,6 +169,7 @@ define i32 @reduce_i16(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) @@ -172,14 +178,16 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) @@ -187,6 +195,7 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef) %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef) %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef) @@ -197,69 +206,87 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8> undef) %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef) %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef) %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef) @@ -280,12 +307,15 @@ declare i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32>) declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>) declare i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32>) +declare i16 @llvm.experimental.vector.reduce.umax.i16.v2i16(<2 x i16>) declare i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16>) declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>) declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>) declare i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16>) declare i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16>) +declare i8 @llvm.experimental.vector.reduce.umax.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.umax.i8.v4i8(<4 x i8>) declare i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8>) declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>) declare i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-umin-widen.ll b/test/Analysis/CostModel/X86/reduce-umin-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..b59e2f3712ed5d1f8ade8d789b9dbd84305b45a0 --- /dev/null +++ b/test/Analysis/CostModel/X86/reduce-umin-widen.ll @@ -0,0 +1,323 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ + +define i32 @reduce_i64(i32 %arg) { +; SSE2-LABEL: 'reduce_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) + ret i32 undef +} + +define i32 @reduce_i32(i32 %arg) { +; SSE2-LABEL: 'reduce_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef) + ret i32 undef +} + +define i32 @reduce_i16(i32 %arg) { +; SSE2-LABEL: 'reduce_i16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) + ret i32 undef +} + +define i32 @reduce_i8(i32 %arg) { +; SSE2-LABEL: 'reduce_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) + ret i32 undef +} + +declare i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64>) +declare i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-umin.ll b/test/Analysis/CostModel/X86/reduce-umin.ll index 0fe9029bc826c41755be3c97abfadbaef19abc8a..f2f519c6023df5b4384c7c020d99bb1af28a4643 100644 --- a/test/Analysis/CostModel/X86/reduce-umin.ll +++ b/test/Analysis/CostModel/X86/reduce-umin.ll @@ -1,32 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) @@ -34,15 +34,15 @@ define i32 @reduce_i64(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) @@ -50,7 +50,7 @@ define i32 @reduce_i64(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef) @@ -67,19 +67,19 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' @@ -124,22 +124,25 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) @@ -148,14 +151,16 @@ define i32 @reduce_i16(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) @@ -164,6 +169,7 @@ define i32 @reduce_i16(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) @@ -172,14 +178,16 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) @@ -187,6 +195,7 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef) %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef) %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef) @@ -197,69 +206,87 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8> undef) %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef) %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef) %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef) @@ -280,12 +307,15 @@ declare i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32>) declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>) declare i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32>) +declare i16 @llvm.experimental.vector.reduce.umin.i16.v2i16(<2 x i16>) declare i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16>) declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>) declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>) declare i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16>) declare i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16>) +declare i8 @llvm.experimental.vector.reduce.umin.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.umin.i8.v4i8(<4 x i8>) declare i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8>) declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>) declare i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8>) diff --git a/test/Analysis/CostModel/X86/reduce-xor-widen.ll b/test/Analysis/CostModel/X86/reduce-xor-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..ff0d276c2213e3bf83a6189b25eee214f6cbf016 --- /dev/null +++ b/test/Analysis/CostModel/X86/reduce-xor-widen.ll @@ -0,0 +1,400 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ + +define i32 @reduce_i64(i32 %arg) { +; SSE-LABEL: 'reduce_i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) + ret i32 undef +} + +define i32 @reduce_i32(i32 %arg) { +; SSE-LABEL: 'reduce_i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'reduce_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) + ret i32 undef +} + +define i32 @reduce_i16(i32 %arg) { +; SSE2-LABEL: 'reduce_i16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) + ret i32 undef +} + +define i32 @reduce_i8(i32 %arg) { +; SSE2-LABEL: 'reduce_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) + ret i32 undef +} + +define i32 @reduce_i1(i32 %arg) { +; SSE2-LABEL: 'reduce_i1' +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'reduce_i1' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'reduce_i1' +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'reduce_i1' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i1' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'reduce_i1' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'reduce_i1' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'reduce_i1' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) + %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) + %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) + %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) + %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) + %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) + %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) + %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) + ret i32 undef +} + +declare i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64>) +declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>) +declare i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64>) +declare i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64>) +declare i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64>) + +declare i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32>) +declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32>) +declare i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32>) +declare i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32>) +declare i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32>) + +declare i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16>) +declare i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16>) +declare i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16>) +declare i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16>) +declare i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16>) +declare i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16>) + +declare i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8>) + +declare i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1>) +declare i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1>) +declare i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1>) +declare i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1>) +declare i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1>) +declare i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1>) +declare i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1>) +declare i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1>) diff --git a/test/Analysis/CostModel/X86/reduce-xor.ll b/test/Analysis/CostModel/X86/reduce-xor.ll index f8e82d05aa7ccaecaba8cb75ee734d7c86386a05..8f7dfdd02fb64d0f16cabd09c74050b849afefe5 100644 --- a/test/Analysis/CostModel/X86/reduce-xor.ll +++ b/test/Analysis/CostModel/X86/reduce-xor.ll @@ -1,44 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW -; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef) @@ -51,35 +51,35 @@ define i32 @reduce_i64(i32 %arg) { define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef) @@ -92,69 +92,78 @@ define i32 @reduce_i32(i32 %arg) { define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef) %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef) %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef) @@ -165,69 +174,87 @@ define i32 @reduce_i16(i32 %arg) { define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; + %V2 = call i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8> undef) %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef) %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef) %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef) @@ -239,90 +266,90 @@ define i32 @reduce_i8(i32 %arg) { define i32 @reduce_i1(i32 %arg) { ; SSE2-LABEL: 'reduce_i1' ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i1' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i1' ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef) @@ -348,12 +375,15 @@ declare i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32>) declare i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32>) declare i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32>) +declare i16 @llvm.experimental.vector.reduce.xor.i16.v2i16(<2 x i16>) declare i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16>) declare i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16>) declare i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16>) declare i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16>) declare i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16>) +declare i8 @llvm.experimental.vector.reduce.xor.i8.v2i8(<2 x i8>) +declare i8 @llvm.experimental.vector.reduce.xor.i8.v4i8(<4 x i8>) declare i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8>) declare i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8>) declare i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8>) diff --git a/test/Analysis/CostModel/X86/reduction.ll b/test/Analysis/CostModel/X86/reduction.ll index 04e40d72246944b136af64e5df051fbe1914fc61..0acb16c0cd9e3065d789c55cb4906cdaf4330082 100644 --- a/test/Analysis/CostModel/X86/reduction.ll +++ b/test/Analysis/CostModel/X86/reduction.ll @@ -15,7 +15,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'reduction_cost_float' @@ -23,7 +23,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'reduction_cost_float' @@ -52,35 +52,15 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) { } define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { -; SSE2-LABEL: 'reduction_cost_int' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r -; -; SSSE3-LABEL: 'reduction_cost_int' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r -; -; SSE42-LABEL: 'reduction_cost_int' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; SSE-LABEL: 'reduction_cost_int' +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX1-LABEL: 'reduction_cost_int' ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> @@ -127,7 +107,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -138,7 +118,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -188,7 +168,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -199,7 +179,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -248,7 +228,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -258,7 +238,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -300,13 +280,13 @@ define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1 ; SSE2-LABEL: 'no_pairwise_reduction2double' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'no_pairwise_reduction2double' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'no_pairwise_reduction2double' @@ -334,7 +314,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'no_pairwise_reduction4float' @@ -342,7 +322,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'no_pairwise_reduction4float' @@ -376,7 +356,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'no_pairwise_reduction4double' @@ -384,7 +364,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'no_pairwise_reduction4double' @@ -428,7 +408,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'no_pairwise_reduction8float' @@ -438,7 +418,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'no_pairwise_reduction8float' @@ -486,13 +466,13 @@ define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { ; SSE2-LABEL: 'no_pairwise_reduction2i64' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SSSE3-LABEL: 'no_pairwise_reduction2i64' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SSE42-LABEL: 'no_pairwise_reduction2i64' @@ -520,7 +500,7 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SSSE3-LABEL: 'no_pairwise_reduction4i32' @@ -528,7 +508,7 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SSE42-LABEL: 'no_pairwise_reduction4i32' @@ -557,29 +537,13 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { } define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { -; SSE2-LABEL: 'no_pairwise_reduction4i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r -; -; SSSE3-LABEL: 'no_pairwise_reduction4i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r -; -; SSE42-LABEL: 'no_pairwise_reduction4i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; SSE-LABEL: 'no_pairwise_reduction4i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; AVX1-LABEL: 'no_pairwise_reduction4i64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> @@ -614,7 +578,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSSE3-LABEL: 'no_pairwise_reduction8i16' @@ -624,7 +588,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSE42-LABEL: 'no_pairwise_reduction8i16' @@ -659,35 +623,15 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { } define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { -; SSE2-LABEL: 'no_pairwise_reduction8i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r -; -; SSSE3-LABEL: 'no_pairwise_reduction8i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r -; -; SSE42-LABEL: 'no_pairwise_reduction8i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; SSE-LABEL: 'no_pairwise_reduction8i32' +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX1-LABEL: 'no_pairwise_reduction8i32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> @@ -725,14 +669,14 @@ define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction2double' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction2double' @@ -765,7 +709,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction4float' @@ -775,7 +719,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction4float' @@ -817,7 +761,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction4double' @@ -827,7 +771,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction4double' @@ -882,7 +826,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction8float' @@ -895,7 +839,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction8float' @@ -956,14 +900,14 @@ define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SSSE3-LABEL: 'pairwise_reduction2i64' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SSE42-LABEL: 'pairwise_reduction2i64' @@ -996,7 +940,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SSSE3-LABEL: 'pairwise_reduction4i32' @@ -1006,7 +950,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SSE42-LABEL: 'pairwise_reduction4i32' @@ -1041,35 +985,15 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { } define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { -; SSE2-LABEL: 'pairwise_reduction4i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r -; -; SSSE3-LABEL: 'pairwise_reduction4i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r -; -; SSE42-LABEL: 'pairwise_reduction4i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; SSE-LABEL: 'pairwise_reduction4i64' +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; AVX1-LABEL: 'pairwise_reduction4i64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> @@ -1113,7 +1037,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSSE3-LABEL: 'pairwise_reduction8i16' @@ -1126,7 +1050,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSE42-LABEL: 'pairwise_reduction8i16' @@ -1180,7 +1104,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SSSE3-LABEL: 'pairwise_reduction8i32' @@ -1193,7 +1117,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SSE42-LABEL: 'pairwise_reduction8i32' diff --git a/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll b/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll index 5bb2e1a756d619c4d8fe75eb7cd9c805c54a596d..f74b30cf1d37f52a4c99e43225d4a99c8da054e7 100644 --- a/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll +++ b/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll @@ -17,28 +17,52 @@ ; define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) { -; CHECK-LABEL: 'test_vXf64' -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE-LABEL: 'test_vXf64' +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'test_vXf64' +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'test_vXf64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; BTVER2-LABEL: 'test_vXf64' -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> @@ -54,28 +78,52 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) { } define void @test_vXfi64(<4 x i64> %src256, <8 x i64> %src512) { -; CHECK-LABEL: 'test_vXfi64' -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE-LABEL: 'test_vXfi64' +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'test_vXfi64' +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'test_vXfi64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; BTVER2-LABEL: 'test_vXfi64' -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> -; BTVER2-NEXT: Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> diff --git a/test/Analysis/CostModel/X86/sitofp-widen.ll b/test/Analysis/CostModel/X86/sitofp-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..3672f393c49e4c319b0ef2e1a8ecb0b6cd1bad64 --- /dev/null +++ b/test/Analysis/CostModel/X86/sitofp-widen.ll @@ -0,0 +1,319 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 + +define i32 @sitofp_i8_double() { +; SSE-LABEL: 'sitofp_i8_double' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'sitofp_i8_double' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'sitofp_i8_double' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sitofp_i8_double' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i8_f64 = sitofp i8 undef to double + %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> + %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> + %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> + ret i32 undef +} + +define i32 @sitofp_i16_double() { +; SSE-LABEL: 'sitofp_i16_double' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'sitofp_i16_double' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'sitofp_i16_double' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sitofp_i16_double' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i16_f64 = sitofp i16 undef to double + %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> + %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> + %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> + ret i32 undef +} + +define i32 @sitofp_i32_double() { +; SSE-LABEL: 'sitofp_i32_double' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'sitofp_i32_double' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'sitofp_i32_double' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sitofp_i32_double' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i32_f64 = sitofp i32 undef to double + %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> + %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> + %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> + ret i32 undef +} + +define i32 @sitofp_i64_double() { +; SSE-LABEL: 'sitofp_i64_double' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'sitofp_i64_double' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'sitofp_i64_double' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double +; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'sitofp_i64_double' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sitofp_i64_double' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double +; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i64_f64 = sitofp i64 undef to double + %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> + %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> + %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> + ret i32 undef +} + +define i32 @sitofp_i8_float() { +; SSE-LABEL: 'sitofp_i8_float' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'sitofp_i8_float' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'sitofp_i8_float' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sitofp_i8_float' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i8_f32 = sitofp i8 undef to float + %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> + %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> + %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> + ret i32 undef +} + +define i32 @sitofp_i16_float() { +; SSE-LABEL: 'sitofp_i16_float' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'sitofp_i16_float' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'sitofp_i16_float' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sitofp_i16_float' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i16_f32 = sitofp i16 undef to float + %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> + %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> + %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> + ret i32 undef +} + +define i32 @sitofp_i32_float() { +; SSE-LABEL: 'sitofp_i32_float' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'sitofp_i32_float' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'sitofp_i32_float' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sitofp_i32_float' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i32_f32 = sitofp i32 undef to float + %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> + %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> + %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> + ret i32 undef +} + +define i32 @sitofp_i64_float() { +; SSE-LABEL: 'sitofp_i64_float' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'sitofp_i64_float' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'sitofp_i64_float' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'sitofp_i64_float' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'sitofp_i64_float' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i64_f32 = sitofp i64 undef to float + %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> + %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> + %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> + %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/strided-load-i16.ll b/test/Analysis/CostModel/X86/strided-load-i16.ll index ef786e7509503d1c3ee57b2bc64c8e1df837846e..8061d8abcd791e17ac139f42567d2167dec928be 100755 --- a/test/Analysis/CostModel/X86/strided-load-i16.ll +++ b/test/Analysis/CostModel/X86/strided-load-i16.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512bw --debug-only=loop-vectorize < %s 2>&1| FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Analysis/CostModel/X86/strided-load-i32.ll b/test/Analysis/CostModel/X86/strided-load-i32.ll index fad9def2bf7849fa02009cd7cfaa419658725209..9f9447a63c87b2c998da13074415cff2b5d19ad0 100755 --- a/test/Analysis/CostModel/X86/strided-load-i32.ll +++ b/test/Analysis/CostModel/X86/strided-load-i32.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Analysis/CostModel/X86/strided-load-i64.ll b/test/Analysis/CostModel/X86/strided-load-i64.ll index a7f593017d6035c5d226e8eae884a73e04d60304..8b5091b3b6dd6d746eeccfe787bb19e1d8240c9f 100755 --- a/test/Analysis/CostModel/X86/strided-load-i64.ll +++ b/test/Analysis/CostModel/X86/strided-load-i64.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Analysis/CostModel/X86/strided-load-i8.ll b/test/Analysis/CostModel/X86/strided-load-i8.ll index 72c9398fe2d449052d2ed7b804ee63f03adceb4b..3c88b38edec7de7510902464041c3bb541a972d2 100755 --- a/test/Analysis/CostModel/X86/strided-load-i8.ll +++ b/test/Analysis/CostModel/X86/strided-load-i8.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512bw --debug-only=loop-vectorize < %s 2>&1| FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Analysis/CostModel/X86/testshiftashr-widen.ll b/test/Analysis/CostModel/X86/testshiftashr-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..dcd70ecf1599b7e63f6ea7d0f176690cf38ee9e6 --- /dev/null +++ b/test/Analysis/CostModel/X86/testshiftashr-widen.ll @@ -0,0 +1,531 @@ +; RUN: llc -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s +; RUN: opt -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s + +%shifttype = type <2 x i16> +define %shifttype @shift2i16(%shifttype %a, %shifttype %b) { +entry: + ; SSE2-LABEL: shift2i16 + ; SSE2: cost of 32 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift2i16 + ; SSE2-CODEGEN: psraw + + %0 = ashr %shifttype %a , %b + ret %shifttype %0 +} + +%shifttype4i16 = type <4 x i16> +define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) { +entry: + ; SSE2-LABEL: shift4i16 + ; SSE2: cost of 32 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift4i16 + ; SSE2-CODEGEN: psraw + + %0 = ashr %shifttype4i16 %a , %b + ret %shifttype4i16 %0 +} + +%shifttype8i16 = type <8 x i16> +define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) { +entry: + ; SSE2-LABEL: shift8i16 + ; SSE2: cost of 32 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift8i16 + ; SSE2-CODEGEN: psraw + + %0 = ashr %shifttype8i16 %a , %b + ret %shifttype8i16 %0 +} + +%shifttype16i16 = type <16 x i16> +define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) { +entry: + ; SSE2-LABEL: shift16i16 + ; SSE2: cost of 64 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift16i16 + ; SSE2-CODEGEN: psraw + + %0 = ashr %shifttype16i16 %a , %b + ret %shifttype16i16 %0 +} + +%shifttype32i16 = type <32 x i16> +define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) { +entry: + ; SSE2-LABEL: shift32i16 + ; SSE2: cost of 128 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift32i16 + ; SSE2-CODEGEN: psraw + + %0 = ashr %shifttype32i16 %a , %b + ret %shifttype32i16 %0 +} + +%shifttype2i32 = type <2 x i32> +define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { +entry: + ; SSE2-LABEL: shift2i32 + ; SSE2: cost of 16 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift2i32 + ; SSE2-CODEGEN: psrad + + %0 = ashr %shifttype2i32 %a , %b + ret %shifttype2i32 %0 +} + +%shifttype4i32 = type <4 x i32> +define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) { +entry: + ; SSE2-LABEL: shift4i32 + ; SSE2: cost of 16 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift4i32 + ; SSE2-CODEGEN: psrad + + %0 = ashr %shifttype4i32 %a , %b + ret %shifttype4i32 %0 +} + +%shifttype8i32 = type <8 x i32> +define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) { +entry: + ; SSE2-LABEL: shift8i32 + ; SSE2: cost of 32 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift8i32 + ; SSE2-CODEGEN: psrad + + %0 = ashr %shifttype8i32 %a , %b + ret %shifttype8i32 %0 +} + +%shifttype16i32 = type <16 x i32> +define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) { +entry: + ; SSE2-LABEL: shift16i32 + ; SSE2: cost of 64 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift16i32 + ; SSE2-CODEGEN: psrad + + %0 = ashr %shifttype16i32 %a , %b + ret %shifttype16i32 %0 +} + +%shifttype32i32 = type <32 x i32> +define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) { +entry: + ; SSE2-LABEL: shift32i32 + ; SSE2: cost of 128 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift32i32 + ; SSE2-CODEGEN: psrad + + %0 = ashr %shifttype32i32 %a , %b + ret %shifttype32i32 %0 +} + +%shifttype2i64 = type <2 x i64> +define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) { +entry: + ; SSE2-LABEL: shift2i64 + ; SSE2: cost of 12 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift2i64 + ; SSE2-CODEGEN: psrlq + + %0 = ashr %shifttype2i64 %a , %b + ret %shifttype2i64 %0 +} + +%shifttype4i64 = type <4 x i64> +define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) { +entry: + ; SSE2-LABEL: shift4i64 + ; SSE2: cost of 24 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift4i64 + ; SSE2-CODEGEN: psrlq + + %0 = ashr %shifttype4i64 %a , %b + ret %shifttype4i64 %0 +} + +%shifttype8i64 = type <8 x i64> +define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) { +entry: + ; SSE2-LABEL: shift8i64 + ; SSE2: cost of 48 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift8i64 + ; SSE2-CODEGEN: psrlq + + %0 = ashr %shifttype8i64 %a , %b + ret %shifttype8i64 %0 +} + +%shifttype16i64 = type <16 x i64> +define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) { +entry: + ; SSE2-LABEL: shift16i64 + ; SSE2: cost of 96 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift16i64 + ; SSE2-CODEGEN: psrlq + + %0 = ashr %shifttype16i64 %a , %b + ret %shifttype16i64 %0 +} + +%shifttype32i64 = type <32 x i64> +define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) { +entry: + ; SSE2-LABEL: shift32i64 + ; SSE2: cost of 192 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift32i64 + ; SSE2-CODEGEN: psrlq + + %0 = ashr %shifttype32i64 %a , %b + ret %shifttype32i64 %0 +} + +%shifttype2i8 = type <2 x i8> +define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) { +entry: + ; SSE2-LABEL: shift2i8 + ; SSE2: cost of 54 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift2i8 + ; SSE2-CODEGEN: psrlw + + %0 = ashr %shifttype2i8 %a , %b + ret %shifttype2i8 %0 +} + +%shifttype4i8 = type <4 x i8> +define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) { +entry: + ; SSE2-LABEL: shift4i8 + ; SSE2: cost of 54 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift4i8 + ; SSE2-CODEGEN: psrlw + + %0 = ashr %shifttype4i8 %a , %b + ret %shifttype4i8 %0 +} + +%shifttype8i8 = type <8 x i8> +define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) { +entry: + ; SSE2-LABEL: shift8i8 + ; SSE2: cost of 54 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift8i8 + ; SSE2-CODEGEN: psraw + + %0 = ashr %shifttype8i8 %a , %b + ret %shifttype8i8 %0 +} + +%shifttype16i8 = type <16 x i8> +define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) { +entry: + ; SSE2-LABEL: shift16i8 + ; SSE2: cost of 54 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift16i8 + ; SSE2-CODEGEN: psraw + + %0 = ashr %shifttype16i8 %a , %b + ret %shifttype16i8 %0 +} + +%shifttype32i8 = type <32 x i8> +define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) { +entry: + ; SSE2-LABEL: shift32i8 + ; SSE2: cost of 108 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift32i8 + ; SSE2-CODEGEN: psraw + + %0 = ashr %shifttype32i8 %a , %b + ret %shifttype32i8 %0 +} + +; Test shift by a constant a value. + +%shifttypec = type <2 x i16> +define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) { +entry: + ; SSE2-LABEL: shift2i16const + ; SSE2: cost of 1 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift2i16const + ; SSE2-CODEGEN: psraw $3 + + %0 = ashr %shifttypec %a , + ret %shifttypec %0 +} + +%shifttypec4i16 = type <4 x i16> +define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) { +entry: + ; SSE2-LABEL: shift4i16const + ; SSE2: cost of 1 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift4i16const + ; SSE2-CODEGEN: psraw $3 + + %0 = ashr %shifttypec4i16 %a , + ret %shifttypec4i16 %0 +} + +%shifttypec8i16 = type <8 x i16> +define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) { +entry: + ; SSE2-LABEL: shift8i16const + ; SSE2: cost of 1 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift8i16const + ; SSE2-CODEGEN: psraw $3 + + %0 = ashr %shifttypec8i16 %a , + ret %shifttypec8i16 %0 +} + +%shifttypec16i16 = type <16 x i16> +define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a, + %shifttypec16i16 %b) { +entry: + ; SSE2-LABEL: shift16i16const + ; SSE2: cost of 2 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift16i16const + ; SSE2-CODEGEN: psraw $3 + + %0 = ashr %shifttypec16i16 %a , + ret %shifttypec16i16 %0 +} + +%shifttypec32i16 = type <32 x i16> +define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a, + %shifttypec32i16 %b) { +entry: + ; SSE2-LABEL: shift32i16const + ; SSE2: cost of 4 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift32i16const + ; SSE2-CODEGEN: psraw $3 + + %0 = ashr %shifttypec32i16 %a , + ret %shifttypec32i16 %0 +} + +%shifttypec2i32 = type <2 x i32> +define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) { +entry: + ; SSE2-LABEL: shift2i32c + ; SSE2: cost of 1 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift2i32c + ; SSE2-CODEGEN: psrad $3 + + %0 = ashr %shifttypec2i32 %a , + ret %shifttypec2i32 %0 +} + +%shifttypec4i32 = type <4 x i32> +define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) { +entry: + ; SSE2-LABEL: shift4i32c + ; SSE2: cost of 1 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift4i32c + ; SSE2-CODEGEN: psrad $3 + + %0 = ashr %shifttypec4i32 %a , + ret %shifttypec4i32 %0 +} + +%shifttypec8i32 = type <8 x i32> +define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) { +entry: + ; SSE2-LABEL: shift8i32c + ; SSE2: cost of 2 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift8i32c + ; SSE2-CODEGEN: psrad $3 + + %0 = ashr %shifttypec8i32 %a , + ret %shifttypec8i32 %0 +} + +%shifttypec16i32 = type <16 x i32> +define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) { +entry: + ; SSE2-LABEL: shift16i32c + ; SSE2: cost of 4 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift16i32c + ; SSE2-CODEGEN: psrad $3 + + %0 = ashr %shifttypec16i32 %a , + ret %shifttypec16i32 %0 +} + +%shifttypec32i32 = type <32 x i32> +define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) { +entry: + ; SSE2-LABEL: shift32i32c + ; getTypeConversion fails here and promotes this to a i64. + ; SSE2: cost of 8 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift32i32c + ; SSE2-CODEGEN: psrad $3 + %0 = ashr %shifttypec32i32 %a , + ret %shifttypec32i32 %0 +} + +%shifttypec2i64 = type <2 x i64> +define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) { +entry: + ; SSE2-LABEL: shift2i64c + ; SSE2: cost of 4 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift2i64c + ; SSE2-CODEGEN: psrad $3 + + %0 = ashr %shifttypec2i64 %a , + ret %shifttypec2i64 %0 +} + +%shifttypec4i64 = type <4 x i64> +define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) { +entry: + ; SSE2-LABEL: shift4i64c + ; SSE2: cost of 8 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift4i64c + ; SSE2-CODEGEN: psrad $3 + + %0 = ashr %shifttypec4i64 %a , + ret %shifttypec4i64 %0 +} + +%shifttypec8i64 = type <8 x i64> +define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) { +entry: + ; SSE2-LABEL: shift8i64c + ; SSE2: cost of 16 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift8i64c + ; SSE2-CODEGEN: psrad $3 + + %0 = ashr %shifttypec8i64 %a , + ret %shifttypec8i64 %0 +} + +%shifttypec16i64 = type <16 x i64> +define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) { +entry: + ; SSE2-LABEL: shift16i64c + ; SSE2: cost of 32 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift16i64c + ; SSE2-CODEGEN: psrad $3 + + %0 = ashr %shifttypec16i64 %a , + ret %shifttypec16i64 %0 +} + +%shifttypec32i64 = type <32 x i64> +define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) { +entry: + ; SSE2-LABEL: shift32i64c + ; SSE2: cost of 64 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift32i64c + ; SSE2-CODEGEN: psrad $3 + + %0 = ashr %shifttypec32i64 %a , + ret %shifttypec32i64 %0 +} + +%shifttypec2i8 = type <2 x i8> +define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) { +entry: + ; SSE2-LABEL: shift2i8c + ; SSE2: cost of 4 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift2i8c + ; SSE2-CODEGEN: psrlw $3 + + %0 = ashr %shifttypec2i8 %a , + ret %shifttypec2i8 %0 +} + +%shifttypec4i8 = type <4 x i8> +define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) { +entry: + ; SSE2-LABEL: shift4i8c + ; SSE2: cost of 4 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift4i8c + ; SSE2-CODEGEN: psrlw $3 + + %0 = ashr %shifttypec4i8 %a , + ret %shifttypec4i8 %0 +} + +%shifttypec8i8 = type <8 x i8> +define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) { +entry: + ; SSE2-LABEL: shift8i8c + ; SSE2: cost of 4 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift8i8c + ; SSE2-CODEGEN: psrlw $3 + + %0 = ashr %shifttypec8i8 %a , + ret %shifttypec8i8 %0 +} + +%shifttypec16i8 = type <16 x i8> +define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) { +entry: + ; SSE2-LABEL: shift16i8c + ; SSE2: cost of 4 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift16i8c + ; SSE2-CODEGEN: psrlw $3 + + %0 = ashr %shifttypec16i8 %a , + ret %shifttypec16i8 %0 +} + +%shifttypec32i8 = type <32 x i8> +define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) { +entry: + ; SSE2-LABEL: shift32i8c + ; SSE2: cost of 8 {{.*}} ashr + ; SSE2-CODEGEN-LABEL: shift32i8c + ; SSE2-CODEGEN: psrlw $3 + + %0 = ashr %shifttypec32i8 %a , + ret %shifttypec32i8 %0 +} + diff --git a/test/Analysis/CostModel/X86/testshiftashr.ll b/test/Analysis/CostModel/X86/testshiftashr.ll index 864ea2e5559e5ebca13f0e6152b035d64d5af245..7f588a7488909e8ab9ff9365340ea8d259e3b776 100644 --- a/test/Analysis/CostModel/X86/testshiftashr.ll +++ b/test/Analysis/CostModel/X86/testshiftashr.ll @@ -261,7 +261,7 @@ entry: ; SSE2-LABEL: shift4i16const ; SSE2: cost of 1 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift4i16const - ; SSE2-CODEGEN: psrad $3 + ; SSE2-CODEGEN: psrad $19 %0 = ashr %shifttypec4i16 %a , ret %shifttypec4i16 %0 @@ -476,7 +476,7 @@ entry: ; SSE2-LABEL: shift4i8c ; SSE2: cost of 1 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift4i8c - ; SSE2-CODEGEN: psrad $3 + ; SSE2-CODEGEN: psrad $27 %0 = ashr %shifttypec4i8 %a , ret %shifttypec4i8 %0 @@ -488,7 +488,7 @@ entry: ; SSE2-LABEL: shift8i8c ; SSE2: cost of 1 {{.*}} ashr ; SSE2-CODEGEN-LABEL: shift8i8c - ; SSE2-CODEGEN: psraw $3 + ; SSE2-CODEGEN: psraw $11 %0 = ashr %shifttypec8i8 %a , diff --git a/test/Analysis/CostModel/X86/testshiftlshr-widen.ll b/test/Analysis/CostModel/X86/testshiftlshr-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..96a0ef75c066069f4ce8aef03374023436224f8c --- /dev/null +++ b/test/Analysis/CostModel/X86/testshiftlshr-widen.ll @@ -0,0 +1,529 @@ +; RUN: llc -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s +; RUN: opt -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s + +%shifttype = type <2 x i16> +define %shifttype @shift2i16(%shifttype %a, %shifttype %b) { +entry: + ; SSE2-LABEL: shift2i16 + ; SSE2: cost of 32 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift2i16 + ; SSE2-CODEGEN: psrlw + + %0 = lshr %shifttype %a , %b + ret %shifttype %0 +} + +%shifttype4i16 = type <4 x i16> +define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) { +entry: + ; SSE2-LABEL: shift4i16 + ; SSE2: cost of 32 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift4i16 + ; SSE2-CODEGEN: psrlw + + %0 = lshr %shifttype4i16 %a , %b + ret %shifttype4i16 %0 +} + +%shifttype8i16 = type <8 x i16> +define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) { +entry: + ; SSE2-LABEL: shift8i16 + ; SSE2: cost of 32 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift8i16 + ; SSE2-CODEGEN: psrlw + + %0 = lshr %shifttype8i16 %a , %b + ret %shifttype8i16 %0 +} + +%shifttype16i16 = type <16 x i16> +define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) { +entry: + ; SSE2-LABEL: shift16i16 + ; SSE2: cost of 64 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift16i16 + ; SSE2-CODEGEN: psrlw + + %0 = lshr %shifttype16i16 %a , %b + ret %shifttype16i16 %0 +} + +%shifttype32i16 = type <32 x i16> +define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) { +entry: + ; SSE2-LABEL: shift32i16 + ; SSE2: cost of 128 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift32i16 + ; SSE2-CODEGEN: psrlw + + %0 = lshr %shifttype32i16 %a , %b + ret %shifttype32i16 %0 +} + +%shifttype2i32 = type <2 x i32> +define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { +entry: + ; SSE2-LABEL: shift2i32 + ; SSE2: cost of 16 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift2i32 + ; SSE2-CODEGEN: psrld + + %0 = lshr %shifttype2i32 %a , %b + ret %shifttype2i32 %0 +} + +%shifttype4i32 = type <4 x i32> +define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) { +entry: + ; SSE2-LABEL: shift4i32 + ; SSE2: cost of 16 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift4i32 + ; SSE2-CODEGEN: psrld + + %0 = lshr %shifttype4i32 %a , %b + ret %shifttype4i32 %0 +} + +%shifttype8i32 = type <8 x i32> +define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) { +entry: + ; SSE2-LABEL: shift8i32 + ; SSE2: cost of 32 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift8i32 + ; SSE2-CODEGEN: psrld + + %0 = lshr %shifttype8i32 %a , %b + ret %shifttype8i32 %0 +} + +%shifttype16i32 = type <16 x i32> +define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) { +entry: + ; SSE2-LABEL: shift16i32 + ; SSE2: cost of 64 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift16i32 + ; SSE2-CODEGEN: psrld + + %0 = lshr %shifttype16i32 %a , %b + ret %shifttype16i32 %0 +} + +%shifttype32i32 = type <32 x i32> +define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) { +entry: + ; SSE2-LABEL: shift32i32 + ; SSE2: cost of 128 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift32i32 + ; SSE2-CODEGEN: psrld + + %0 = lshr %shifttype32i32 %a , %b + ret %shifttype32i32 %0 +} + +%shifttype2i64 = type <2 x i64> +define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) { +entry: + ; SSE2-LABEL: shift2i64 + ; SSE2: cost of 4 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift2i64 + ; SSE2-CODEGEN: psrlq + + %0 = lshr %shifttype2i64 %a , %b + ret %shifttype2i64 %0 +} + +%shifttype4i64 = type <4 x i64> +define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) { +entry: + ; SSE2-LABEL: shift4i64 + ; SSE2: cost of 8 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift4i64 + ; SSE2-CODEGEN: psrlq + + %0 = lshr %shifttype4i64 %a , %b + ret %shifttype4i64 %0 +} + +%shifttype8i64 = type <8 x i64> +define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) { +entry: + ; SSE2-LABEL: shift8i64 + ; SSE2: cost of 16 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift8i64 + ; SSE2-CODEGEN: psrlq + + %0 = lshr %shifttype8i64 %a , %b + ret %shifttype8i64 %0 +} + +%shifttype16i64 = type <16 x i64> +define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) { +entry: + ; SSE2-LABEL: shift16i64 + ; SSE2: cost of 32 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift16i64 + ; SSE2-CODEGEN: psrlq + + %0 = lshr %shifttype16i64 %a , %b + ret %shifttype16i64 %0 +} + +%shifttype32i64 = type <32 x i64> +define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) { +entry: + ; SSE2-LABEL: shift32i64 + ; SSE2: cost of 64 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift32i64 + ; SSE2-CODEGEN: psrlq + + %0 = lshr %shifttype32i64 %a , %b + ret %shifttype32i64 %0 +} + +%shifttype2i8 = type <2 x i8> +define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) { +entry: + ; SSE2-LABEL: shift2i8 + ; SSE2: cost of 26 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift2i8 + ; SSE2-CODEGEN: psrlw + + %0 = lshr %shifttype2i8 %a , %b + ret %shifttype2i8 %0 +} + +%shifttype4i8 = type <4 x i8> +define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) { +entry: + ; SSE2-LABEL: shift4i8 + ; SSE2: cost of 26 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift4i8 + ; SSE2-CODEGEN: psrlw + + %0 = lshr %shifttype4i8 %a , %b + ret %shifttype4i8 %0 +} + +%shifttype8i8 = type <8 x i8> +define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) { +entry: + ; SSE2-LABEL: shift8i8 + ; SSE2: cost of 26 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift8i8 + ; SSE2-CODEGEN: psrlw + + %0 = lshr %shifttype8i8 %a , %b + ret %shifttype8i8 %0 +} + +%shifttype16i8 = type <16 x i8> +define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) { +entry: + ; SSE2-LABEL: shift16i8 + ; SSE2: cost of 26 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift16i8 + ; SSE2-CODEGEN: psrlw + + %0 = lshr %shifttype16i8 %a , %b + ret %shifttype16i8 %0 +} + +%shifttype32i8 = type <32 x i8> +define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) { +entry: + ; SSE2-LABEL: shift32i8 + ; SSE2: cost of 52 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift32i8 + ; SSE2-CODEGEN: psrlw + + %0 = lshr %shifttype32i8 %a , %b + ret %shifttype32i8 %0 +} + +; Test shift by a constant vector. + +%shifttypec = type <2 x i16> +define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) { +entry: + ; SSE2-LABEL: shift2i16const + ; SSE2: cost of 1 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift2i16const + ; SSE2-CODEGEN: psrlw $3 + + %0 = lshr %shifttypec %a , + ret %shifttypec %0 +} + +%shifttypec4i16 = type <4 x i16> +define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) { +entry: + ; SSE2-LABEL: shift4i16const + ; SSE2: cost of 1 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift4i16const + ; SSE2-CODEGEN: psrlw $3 + + %0 = lshr %shifttypec4i16 %a , + ret %shifttypec4i16 %0 +} + +%shifttypec8i16 = type <8 x i16> +define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) { +entry: + ; SSE2-LABEL: shift8i16const + ; SSE2: cost of 1 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift8i16const + ; SSE2-CODEGEN: psrlw $3 + + %0 = lshr %shifttypec8i16 %a , + ret %shifttypec8i16 %0 +} + +%shifttypec16i16 = type <16 x i16> +define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a, + %shifttypec16i16 %b) { +entry: + ; SSE2-LABEL: shift16i16const + ; SSE2: cost of 2 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift16i16const + ; SSE2-CODEGEN: psrlw $3 + + %0 = lshr %shifttypec16i16 %a , + ret %shifttypec16i16 %0 +} + +%shifttypec32i16 = type <32 x i16> +define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a, + %shifttypec32i16 %b) { +entry: + ; SSE2-LABEL: shift32i16const + ; SSE2: cost of 4 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift32i16const + ; SSE2-CODEGEN: psrlw $3 + + %0 = lshr %shifttypec32i16 %a , + ret %shifttypec32i16 %0 +} + +%shifttypec2i32 = type <2 x i32> +define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) { +entry: + ; SSE2-LABEL: shift2i32c + ; SSE2: cost of 1 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift2i32c + ; SSE2-CODEGEN: psrld $3 + + %0 = lshr %shifttypec2i32 %a , + ret %shifttypec2i32 %0 +} + +%shifttypec4i32 = type <4 x i32> +define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) { +entry: + ; SSE2-LABEL: shift4i32c + ; SSE2: cost of 1 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift4i32c + ; SSE2-CODEGEN: psrld $3 + + %0 = lshr %shifttypec4i32 %a , + ret %shifttypec4i32 %0 +} + +%shifttypec8i32 = type <8 x i32> +define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) { +entry: + ; SSE2-LABEL: shift8i32c + ; SSE2: cost of 2 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift8i32c + ; SSE2-CODEGEN: psrld $3 + + %0 = lshr %shifttypec8i32 %a , + ret %shifttypec8i32 %0 +} + +%shifttypec16i32 = type <16 x i32> +define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) { +entry: + ; SSE2-LABEL: shift16i32c + ; SSE2: cost of 4 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift16i32c + ; SSE2-CODEGEN: psrld $3 + + %0 = lshr %shifttypec16i32 %a , + ret %shifttypec16i32 %0 +} + +%shifttypec32i32 = type <32 x i32> +define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) { +entry: + ; SSE2-LABEL: shift32i32c + ; SSE2: cost of 8 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift32i32c + ; SSE2-CODEGEN: psrld $3 + %0 = lshr %shifttypec32i32 %a , + ret %shifttypec32i32 %0 +} + +%shifttypec2i64 = type <2 x i64> +define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) { +entry: + ; SSE2-LABEL: shift2i64c + ; SSE2: cost of 1 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift2i64c + ; SSE2-CODEGEN: psrlq $3 + + %0 = lshr %shifttypec2i64 %a , + ret %shifttypec2i64 %0 +} + +%shifttypec4i64 = type <4 x i64> +define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) { +entry: + ; SSE2-LABEL: shift4i64c + ; SSE2: cost of 2 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift4i64c + ; SSE2-CODEGEN: psrlq $3 + + %0 = lshr %shifttypec4i64 %a , + ret %shifttypec4i64 %0 +} + +%shifttypec8i64 = type <8 x i64> +define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) { +entry: + ; SSE2-LABEL: shift8i64c + ; SSE2: cost of 4 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift8i64c + ; SSE2-CODEGEN: psrlq $3 + + %0 = lshr %shifttypec8i64 %a , + ret %shifttypec8i64 %0 +} + +%shifttypec16i64 = type <16 x i64> +define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) { +entry: + ; SSE2-LABEL: shift16i64c + ; SSE2: cost of 8 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift16i64c + ; SSE2-CODEGEN: psrlq $3 + + %0 = lshr %shifttypec16i64 %a , + ret %shifttypec16i64 %0 +} + +%shifttypec32i64 = type <32 x i64> +define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) { +entry: + ; SSE2-LABEL: shift32i64c + ; SSE2: cost of 16 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift32i64c + ; SSE2-CODEGEN: psrlq $3 + + %0 = lshr %shifttypec32i64 %a , + ret %shifttypec32i64 %0 +} + +%shifttypec2i8 = type <2 x i8> +define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) { +entry: + ; SSE2-LABEL: shift2i8c + ; SSE2: cost of 2 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift2i8c + ; SSE2-CODEGEN: psrlw $3 + + %0 = lshr %shifttypec2i8 %a , + ret %shifttypec2i8 %0 +} + +%shifttypec4i8 = type <4 x i8> +define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) { +entry: + ; SSE2-LABEL: shift4i8c + ; SSE2: cost of 2 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift4i8c + ; SSE2-CODEGEN: psrlw $3 + + %0 = lshr %shifttypec4i8 %a , + ret %shifttypec4i8 %0 +} + +%shifttypec8i8 = type <8 x i8> +define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) { +entry: + ; SSE2-LABEL: shift8i8c + ; SSE2: cost of 2 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift8i8c + ; SSE2-CODEGEN: psrlw $3 + + %0 = lshr %shifttypec8i8 %a , + ret %shifttypec8i8 %0 +} + +%shifttypec16i8 = type <16 x i8> +define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) { +entry: + ; SSE2-LABEL: shift16i8c + ; SSE2: cost of 2 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift16i8c + ; SSE2-CODEGEN: psrlw $3 + + %0 = lshr %shifttypec16i8 %a , + ret %shifttypec16i8 %0 +} + +%shifttypec32i8 = type <32 x i8> +define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) { +entry: + ; SSE2-LABEL: shift32i8c + ; SSE2: cost of 4 {{.*}} lshr + ; SSE2-CODEGEN-LABEL: shift32i8c + ; SSE2-CODEGEN: psrlw $3 + + %0 = lshr %shifttypec32i8 %a , + ret %shifttypec32i8 %0 +} diff --git a/test/Analysis/CostModel/X86/testshiftshl-widen.ll b/test/Analysis/CostModel/X86/testshiftshl-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..28276e5835fd44b01da83b708c499dccb03e4a67 --- /dev/null +++ b/test/Analysis/CostModel/X86/testshiftshl-widen.ll @@ -0,0 +1,529 @@ +; RUN: llc -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s +; RUN: opt -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s + +%shifttype = type <2 x i16> +define %shifttype @shift2i16(%shifttype %a, %shifttype %b) { +entry: + ; SSE2-LABEL: shift2i16 + ; SSE2: cost of 32 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift2i16 + ; SSE2-CODEGEN: pmullw + + %0 = shl %shifttype %a , %b + ret %shifttype %0 +} + +%shifttype4i16 = type <4 x i16> +define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) { +entry: + ; SSE2-LABEL: shift4i16 + ; SSE2: cost of 32 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift4i16 + ; SSE2-CODEGEN: pmullw + + %0 = shl %shifttype4i16 %a , %b + ret %shifttype4i16 %0 +} + +%shifttype8i16 = type <8 x i16> +define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) { +entry: + ; SSE2-LABEL: shift8i16 + ; SSE2: cost of 32 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift8i16 + ; SSE2-CODEGEN: pmullw + + %0 = shl %shifttype8i16 %a , %b + ret %shifttype8i16 %0 +} + +%shifttype16i16 = type <16 x i16> +define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) { +entry: + ; SSE2-LABEL: shift16i16 + ; SSE2: cost of 64 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift16i16 + ; SSE2-CODEGEN: pmullw + + %0 = shl %shifttype16i16 %a , %b + ret %shifttype16i16 %0 +} + +%shifttype32i16 = type <32 x i16> +define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) { +entry: + ; SSE2-LABEL: shift32i16 + ; SSE2: cost of 128 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift32i16 + ; SSE2-CODEGEN: pmullw + + %0 = shl %shifttype32i16 %a , %b + ret %shifttype32i16 %0 +} + +%shifttype2i32 = type <2 x i32> +define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) { +entry: + ; SSE2-LABEL: shift2i32 + ; SSE2: cost of 10 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift2i32 + ; SSE2-CODEGEN: pmuludq + + %0 = shl %shifttype2i32 %a , %b + ret %shifttype2i32 %0 +} + +%shifttype4i32 = type <4 x i32> +define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) { +entry: + ; SSE2-LABEL: shift4i32 + ; SSE2: cost of 10 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift4i32 + ; SSE2-CODEGEN: pmuludq + + %0 = shl %shifttype4i32 %a , %b + ret %shifttype4i32 %0 +} + +%shifttype8i32 = type <8 x i32> +define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) { +entry: + ; SSE2-LABEL: shift8i32 + ; SSE2: cost of 20 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift8i32 + ; SSE2-CODEGEN: pmuludq + + %0 = shl %shifttype8i32 %a , %b + ret %shifttype8i32 %0 +} + +%shifttype16i32 = type <16 x i32> +define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) { +entry: + ; SSE2-LABEL: shift16i32 + ; SSE2: cost of 40 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift16i32 + ; SSE2-CODEGEN: pmuludq + + %0 = shl %shifttype16i32 %a , %b + ret %shifttype16i32 %0 +} + +%shifttype32i32 = type <32 x i32> +define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) { +entry: + ; SSE2-LABEL: shift32i32 + ; SSE2: cost of 80 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift32i32 + ; SSE2-CODEGEN: pmuludq + + %0 = shl %shifttype32i32 %a , %b + ret %shifttype32i32 %0 +} + +%shifttype2i64 = type <2 x i64> +define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) { +entry: + ; SSE2-LABEL: shift2i64 + ; SSE2: cost of 4 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift2i64 + ; SSE2-CODEGEN: psllq + + %0 = shl %shifttype2i64 %a , %b + ret %shifttype2i64 %0 +} + +%shifttype4i64 = type <4 x i64> +define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) { +entry: + ; SSE2-LABEL: shift4i64 + ; SSE2: cost of 8 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift4i64 + ; SSE2-CODEGEN: psllq + + %0 = shl %shifttype4i64 %a , %b + ret %shifttype4i64 %0 +} + +%shifttype8i64 = type <8 x i64> +define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) { +entry: + ; SSE2-LABEL: shift8i64 + ; SSE2: cost of 16 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift8i64 + ; SSE2-CODEGEN: psllq + + %0 = shl %shifttype8i64 %a , %b + ret %shifttype8i64 %0 +} + +%shifttype16i64 = type <16 x i64> +define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) { +entry: + ; SSE2-LABEL: shift16i64 + ; SSE2: cost of 32 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift16i64 + ; SSE2-CODEGEN: psllq + + %0 = shl %shifttype16i64 %a , %b + ret %shifttype16i64 %0 +} + +%shifttype32i64 = type <32 x i64> +define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) { +entry: + ; SSE2-LABEL: shift32i64 + ; SSE2: cost of 64 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift32i64 + ; SSE2-CODEGEN: psllq + + %0 = shl %shifttype32i64 %a , %b + ret %shifttype32i64 %0 +} + +%shifttype2i8 = type <2 x i8> +define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) { +entry: + ; SSE2-LABEL: shift2i8 + ; SSE2: cost of 26 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift2i8 + ; SSE2-CODEGEN: psllw + + %0 = shl %shifttype2i8 %a , %b + ret %shifttype2i8 %0 +} + +%shifttype4i8 = type <4 x i8> +define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) { +entry: + ; SSE2-LABEL: shift4i8 + ; SSE2: cost of 26 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift4i8 + ; SSE2-CODEGEN: psllw + + %0 = shl %shifttype4i8 %a , %b + ret %shifttype4i8 %0 +} + +%shifttype8i8 = type <8 x i8> +define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) { +entry: + ; SSE2-LABEL: shift8i8 + ; SSE2: cost of 26 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift8i8 + ; SSE2-CODEGEN: psllw + + %0 = shl %shifttype8i8 %a , %b + ret %shifttype8i8 %0 +} + +%shifttype16i8 = type <16 x i8> +define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) { +entry: + ; SSE2-LABEL: shift16i8 + ; SSE2: cost of 26 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift16i8 + ; SSE2-CODEGEN: psllw + + %0 = shl %shifttype16i8 %a , %b + ret %shifttype16i8 %0 +} + +%shifttype32i8 = type <32 x i8> +define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) { +entry: + ; SSE2-LABEL: shift32i8 + ; SSE2: cost of 52 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift32i8 + ; SSE2-CODEGEN: psllw + + %0 = shl %shifttype32i8 %a , %b + ret %shifttype32i8 %0 +} + +; Test shift by a constant vector. + +%shifttypec = type <2 x i16> +define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) { +entry: + ; SSE2-LABEL: shift2i16const + ; SSE2: cost of 1 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift2i16const + ; SSE2-CODEGEN: psllw $3 + + %0 = shl %shifttypec %a , + ret %shifttypec %0 +} + +%shifttypec4i16 = type <4 x i16> +define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) { +entry: + ; SSE2-LABEL: shift4i16const + ; SSE2: cost of 1 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift4i16const + ; SSE2-CODEGEN: psllw $3 + + %0 = shl %shifttypec4i16 %a , + ret %shifttypec4i16 %0 +} + +%shifttypec8i16 = type <8 x i16> +define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) { +entry: + ; SSE2-LABEL: shift8i16const + ; SSE2: cost of 1 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift8i16const + ; SSE2-CODEGEN: psllw $3 + + %0 = shl %shifttypec8i16 %a , + ret %shifttypec8i16 %0 +} + +%shifttypec16i16 = type <16 x i16> +define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a, + %shifttypec16i16 %b) { +entry: + ; SSE2-LABEL: shift16i16const + ; SSE2: cost of 2 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift16i16const + ; SSE2-CODEGEN: psllw $3 + + %0 = shl %shifttypec16i16 %a , + ret %shifttypec16i16 %0 +} + +%shifttypec32i16 = type <32 x i16> +define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a, + %shifttypec32i16 %b) { +entry: + ; SSE2-LABEL: shift32i16const + ; SSE2: cost of 4 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift32i16const + ; SSE2-CODEGEN: psllw $3 + + %0 = shl %shifttypec32i16 %a , + ret %shifttypec32i16 %0 +} + +%shifttypec2i32 = type <2 x i32> +define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) { +entry: + ; SSE2-LABEL: shift2i32c + ; SSE2: cost of 1 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift2i32c + ; SSE2-CODEGEN: pslld $3 + + %0 = shl %shifttypec2i32 %a , + ret %shifttypec2i32 %0 +} + +%shifttypec4i32 = type <4 x i32> +define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) { +entry: + ; SSE2-LABEL: shift4i32c + ; SSE2: cost of 1 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift4i32c + ; SSE2-CODEGEN: pslld $3 + + %0 = shl %shifttypec4i32 %a , + ret %shifttypec4i32 %0 +} + +%shifttypec8i32 = type <8 x i32> +define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) { +entry: + ; SSE2-LABEL: shift8i32c + ; SSE2: cost of 2 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift8i32c + ; SSE2-CODEGEN: pslld $3 + + %0 = shl %shifttypec8i32 %a , + ret %shifttypec8i32 %0 +} + +%shifttypec16i32 = type <16 x i32> +define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) { +entry: + ; SSE2-LABEL: shift16i32c + ; SSE2: cost of 4 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift16i32c + ; SSE2-CODEGEN: pslld $3 + + %0 = shl %shifttypec16i32 %a , + ret %shifttypec16i32 %0 +} + +%shifttypec32i32 = type <32 x i32> +define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) { +entry: + ; SSE2-LABEL: shift32i32c + ; SSE2: cost of 8 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift32i32c + ; SSE2-CODEGEN: pslld $3 + %0 = shl %shifttypec32i32 %a , + ret %shifttypec32i32 %0 +} + +%shifttypec2i64 = type <2 x i64> +define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) { +entry: + ; SSE2-LABEL: shift2i64c + ; SSE2: cost of 1 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift2i64c + ; SSE2-CODEGEN: psllq $3 + + %0 = shl %shifttypec2i64 %a , + ret %shifttypec2i64 %0 +} + +%shifttypec4i64 = type <4 x i64> +define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) { +entry: + ; SSE2-LABEL: shift4i64c + ; SSE2: cost of 2 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift4i64c + ; SSE2-CODEGEN: psllq $3 + + %0 = shl %shifttypec4i64 %a , + ret %shifttypec4i64 %0 +} + +%shifttypec8i64 = type <8 x i64> +define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) { +entry: + ; SSE2-LABEL: shift8i64c + ; SSE2: cost of 4 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift8i64c + ; SSE2-CODEGEN: psllq $3 + + %0 = shl %shifttypec8i64 %a , + ret %shifttypec8i64 %0 +} + +%shifttypec16i64 = type <16 x i64> +define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) { +entry: + ; SSE2-LABEL: shift16i64c + ; SSE2: cost of 8 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift16i64c + ; SSE2-CODEGEN: psllq $3 + + %0 = shl %shifttypec16i64 %a , + ret %shifttypec16i64 %0 +} + +%shifttypec32i64 = type <32 x i64> +define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) { +entry: + ; SSE2-LABEL: shift32i64c + ; SSE2: cost of 16 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift32i64c + ; SSE2-CODEGEN: psllq $3 + + %0 = shl %shifttypec32i64 %a , + ret %shifttypec32i64 %0 +} + +%shifttypec2i8 = type <2 x i8> +define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) { +entry: + ; SSE2-LABEL: shift2i8c + ; SSE2: cost of 2 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift2i8c + ; SSE2-CODEGEN: psllw $3 + + %0 = shl %shifttypec2i8 %a , + ret %shifttypec2i8 %0 +} + +%shifttypec4i8 = type <4 x i8> +define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) { +entry: + ; SSE2-LABEL: shift4i8c + ; SSE2: cost of 2 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift4i8c + ; SSE2-CODEGEN: psllw $3 + + %0 = shl %shifttypec4i8 %a , + ret %shifttypec4i8 %0 +} + +%shifttypec8i8 = type <8 x i8> +define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) { +entry: + ; SSE2-LABEL: shift8i8c + ; SSE2: cost of 2 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift8i8c + ; SSE2-CODEGEN: psllw $3 + + %0 = shl %shifttypec8i8 %a , + ret %shifttypec8i8 %0 +} + +%shifttypec16i8 = type <16 x i8> +define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) { +entry: + ; SSE2-LABEL: shift16i8c + ; SSE2: cost of 2 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift16i8c + ; SSE2-CODEGEN: psllw $3 + + %0 = shl %shifttypec16i8 %a , + ret %shifttypec16i8 %0 +} + +%shifttypec32i8 = type <32 x i8> +define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) { +entry: + ; SSE2-LABEL: shift32i8c + ; SSE2: cost of 4 {{.*}} shl + ; SSE2-CODEGEN-LABEL: shift32i8c + ; SSE2-CODEGEN: psllw $3 + + %0 = shl %shifttypec32i8 %a , + ret %shifttypec32i8 %0 +} diff --git a/test/Analysis/CostModel/X86/trunc.ll b/test/Analysis/CostModel/X86/trunc.ll index 2e0dc8e5dafdc2c8b65db693bb3e4d5b9b072898..e2b2cfa732d4193879272b9bf8bad830d343dcc7 100644 --- a/test/Analysis/CostModel/X86/trunc.ll +++ b/test/Analysis/CostModel/X86/trunc.ll @@ -53,6 +53,7 @@ define i32 @trunc_vXi16() { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16> ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16> ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16> ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16> ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16> @@ -62,6 +63,7 @@ define i32 @trunc_vXi16() { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16> @@ -71,6 +73,7 @@ define i32 @trunc_vXi16() { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16> ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16> ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16> ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16> @@ -80,6 +83,7 @@ define i32 @trunc_vXi16() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16> @@ -89,6 +93,7 @@ define i32 @trunc_vXi16() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16> @@ -98,6 +103,7 @@ define i32 @trunc_vXi16() { ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16> ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16> @@ -107,6 +113,7 @@ define i32 @trunc_vXi16() { ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16> @@ -115,6 +122,7 @@ define i32 @trunc_vXi16() { %V2i64 = trunc <2 x i64> undef to <2 x i16> %V4i64 = trunc <4 x i64> undef to <4 x i16> %V8i64 = trunc <8 x i64> undef to <8 x i16> + %V2i32 = trunc <2 x i32> undef to <2 x i16> %V4i32 = trunc <4 x i32> undef to <4 x i16> %V8i32 = trunc <8 x i32> undef to <8 x i16> %V16i32 = trunc <16 x i32> undef to <16 x i16> @@ -259,3 +267,93 @@ define i32 @trunc_vXi8() { ret i32 undef } + +define i32 @trunc_vXi1() { +; SSE-LABEL: 'trunc_vXi1' +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'trunc_vXi1' +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'trunc_vXi1' +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'trunc_vXi1' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %V2i64 = trunc <2 x i64> undef to <2 x i1> + %V4i64 = trunc <4 x i64> undef to <4 x i1> + %V8i64 = trunc <8 x i64> undef to <8 x i1> + + %V2i32 = trunc <2 x i32> undef to <2 x i1> + %V4i32 = trunc <4 x i32> undef to <4 x i1> + %V8i32 = trunc <8 x i32> undef to <8 x i1> + %V16i32 = trunc <16 x i32> undef to <16 x i1> + + %V2i16 = trunc <2 x i16> undef to <2 x i1> + %V4i16 = trunc <4 x i16> undef to <4 x i1> + %V8i16 = trunc <8 x i16> undef to <8 x i1> + %V16i16 = trunc <16 x i16> undef to <16 x i1> + %V32i16 = trunc <32 x i16> undef to <32 x i1> + + %V32i8 = trunc <32 x i8> undef to <32 x i1> + %V64i8 = trunc <64 x i8> undef to <64 x i1> + + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/uitofp-widen.ll b/test/Analysis/CostModel/X86/uitofp-widen.ll new file mode 100644 index 0000000000000000000000000000000000000000..7513d1cdcb5c3e45d66dc3d4b64e000641cf4c7c --- /dev/null +++ b/test/Analysis/CostModel/X86/uitofp-widen.ll @@ -0,0 +1,326 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 + +define i32 @uitofp_i8_double() { +; SSE-LABEL: 'uitofp_i8_double' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'uitofp_i8_double' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'uitofp_i8_double' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'uitofp_i8_double' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i8_f64 = uitofp i8 undef to double + %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> + %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> + %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> + ret i32 undef +} + +define i32 @uitofp_i16_double() { +; SSE-LABEL: 'uitofp_i16_double' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'uitofp_i16_double' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'uitofp_i16_double' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'uitofp_i16_double' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i16_f64 = uitofp i16 undef to double + %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> + %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> + %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> + ret i32 undef +} + +define i32 @uitofp_i32_double() { +; SSE-LABEL: 'uitofp_i32_double' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'uitofp_i32_double' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'uitofp_i32_double' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'uitofp_i32_double' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i32_f64 = uitofp i32 undef to double + %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> + %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> + %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> + ret i32 undef +} + +define i32 @uitofp_i64_double() { +; SSE-LABEL: 'uitofp_i64_double' +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_i64_f64 = uitofp i64 undef to double +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'uitofp_i64_double' +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'uitofp_i64_double' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'uitofp_i64_double' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'uitofp_i64_double' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i64_f64 = uitofp i64 undef to double + %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> + %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> + %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> + ret i32 undef +} + +define i32 @uitofp_i8_float() { +; SSE-LABEL: 'uitofp_i8_float' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'uitofp_i8_float' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'uitofp_i8_float' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'uitofp_i8_float' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i8_f32 = uitofp i8 undef to float + %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> + %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> + %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> + ret i32 undef +} + +define i32 @uitofp_i16_float() { +; SSE-LABEL: 'uitofp_i16_float' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'uitofp_i16_float' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'uitofp_i16_float' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'uitofp_i16_float' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i16_f32 = uitofp i16 undef to float + %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> + %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> + %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> + ret i32 undef +} + +define i32 @uitofp_i32_float() { +; SSE-LABEL: 'uitofp_i32_float' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'uitofp_i32_float' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'uitofp_i32_float' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'uitofp_i32_float' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'uitofp_i32_float' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i32_f32 = uitofp i32 undef to float + %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> + %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> + %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> + ret i32 undef +} + +define i32 @uitofp_i64_float() { +; SSE-LABEL: 'uitofp_i64_float' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float +; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'uitofp_i64_float' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'uitofp_i64_float' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'uitofp_i64_float' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; BTVER2-LABEL: 'uitofp_i64_float' +; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %cvt_i64_f32 = uitofp i64 undef to float + %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> + %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> + %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> + %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll index 1d1aafabe924fb67bcc29944c3bd3136b96cc174..75515730846ab8b8dc7d30a39117255fd56ea21a 100644 --- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll @@ -1074,7 +1074,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift ; ; XOP-LABEL: 'constant_shift_v2i64' -; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <2 x i64> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <2 x i64> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift ; ; AVX512-LABEL: 'constant_shift_v2i64' @@ -1103,7 +1103,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift ; ; XOP-LABEL: 'constant_shift_v4i64' -; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <4 x i64> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <4 x i64> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift ; ; AVX512-LABEL: 'constant_shift_v4i64' @@ -1132,7 +1132,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift ; ; XOP-LABEL: 'constant_shift_v8i64' -; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <8 x i64> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <8 x i64> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift ; ; AVX512-LABEL: 'constant_shift_v8i64' @@ -1164,13 +1164,9 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift ; -; XOPAVX1-LABEL: 'constant_shift_v4i32' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <4 x i32> %a, -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift -; -; XOPAVX2-LABEL: 'constant_shift_v4i32' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift +; XOP-LABEL: 'constant_shift_v4i32' +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift ; ; AVX512-LABEL: 'constant_shift_v4i32' ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, @@ -1202,7 +1198,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift ; ; XOPAVX1-LABEL: 'constant_shift_v8i32' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <8 x i32> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <8 x i32> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift ; ; XOPAVX2-LABEL: 'constant_shift_v8i32' @@ -1239,7 +1235,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift ; ; XOPAVX1-LABEL: 'constant_shift_v16i32' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i32> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i32> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift ; ; XOPAVX2-LABEL: 'constant_shift_v16i32' @@ -1272,7 +1268,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; ; XOP-LABEL: 'constant_shift_v8i16' -; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <8 x i16> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; ; AVX512F-LABEL: 'constant_shift_v8i16' @@ -1317,7 +1313,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; ; XOP-LABEL: 'constant_shift_v16i16' -; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <16 x i16> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i16> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; ; AVX512F-LABEL: 'constant_shift_v16i16' @@ -1362,7 +1358,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; ; XOP-LABEL: 'constant_shift_v32i16' -; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <32 x i16> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <32 x i16> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; ; AVX512F-LABEL: 'constant_shift_v32i16' @@ -1403,7 +1399,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; ; XOP-LABEL: 'constant_shift_v16i8' -; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <16 x i8> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; ; AVX512-LABEL: 'constant_shift_v16i8' @@ -1436,7 +1432,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; ; XOP-LABEL: 'constant_shift_v32i8' -; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; ; AVX512-LABEL: 'constant_shift_v32i8' @@ -1469,7 +1465,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; ; XOP-LABEL: 'constant_shift_v64i8' -; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <64 x i8> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <64 x i8> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; ; AVX512F-LABEL: 'constant_shift_v64i8' @@ -1510,7 +1506,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift ; ; XOP-LABEL: 'splatconstant_shift_v2i64' -; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <2 x i64> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <2 x i64> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift ; ; AVX512-LABEL: 'splatconstant_shift_v2i64' @@ -1538,13 +1534,9 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <4 x i64> %a, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift ; -; XOPAVX1-LABEL: 'splatconstant_shift_v4i64' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <4 x i64> %a, -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift -; -; XOPAVX2-LABEL: 'splatconstant_shift_v4i64' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <4 x i64> %a, -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift +; XOP-LABEL: 'splatconstant_shift_v4i64' +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <4 x i64> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift ; ; AVX512-LABEL: 'splatconstant_shift_v4i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i64> %a, @@ -1571,13 +1563,9 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <8 x i64> %a, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift ; -; XOPAVX1-LABEL: 'splatconstant_shift_v8i64' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <8 x i64> %a, -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift -; -; XOPAVX2-LABEL: 'splatconstant_shift_v8i64' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <8 x i64> %a, -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift +; XOP-LABEL: 'splatconstant_shift_v8i64' +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <8 x i64> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift ; ; AVX512-LABEL: 'splatconstant_shift_v8i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i64> %a, @@ -1618,7 +1606,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v8i32' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <8 x i32> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <8 x i32> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v8i32' @@ -1651,7 +1639,7 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v16i32' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i32> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i32> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v16i32' @@ -1697,7 +1685,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v16i16' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <16 x i16> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i16> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v16i16' @@ -1730,7 +1718,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v32i16' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <32 x i16> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <32 x i16> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v32i16' @@ -1771,7 +1759,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; ; XOP-LABEL: 'splatconstant_shift_v16i8' -; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <16 x i8> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; ; AVX512-LABEL: 'splatconstant_shift_v16i8' @@ -1799,13 +1787,9 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; -; XOPAVX1-LABEL: 'splatconstant_shift_v32i8' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift -; -; XOPAVX2-LABEL: 'splatconstant_shift_v32i8' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; XOP-LABEL: 'splatconstant_shift_v32i8' +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; ; AVX512-LABEL: 'splatconstant_shift_v32i8' ; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, @@ -1832,13 +1816,9 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <64 x i8> %a, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; -; XOPAVX1-LABEL: 'splatconstant_shift_v64i8' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <64 x i8> %a, -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift -; -; XOPAVX2-LABEL: 'splatconstant_shift_v64i8' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <64 x i8> %a, -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; XOP-LABEL: 'splatconstant_shift_v64i8' +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <64 x i8> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; ; AVX512F-LABEL: 'splatconstant_shift_v64i8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <64 x i8> %a, diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll index 8ebe6f821653fb5fa1d4a7d13280a993720f06a8..096699be31dae357d88a7e5a9d052106f07007af 100644 --- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll @@ -1067,13 +1067,9 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift ; -; XOPAVX1-LABEL: 'constant_shift_v2i64' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <2 x i64> %a, -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift -; -; XOPAVX2-LABEL: 'constant_shift_v2i64' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift +; XOP-LABEL: 'constant_shift_v2i64' +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift ; ; AVX512-LABEL: 'constant_shift_v2i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, @@ -1101,7 +1097,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift ; ; XOPAVX1-LABEL: 'constant_shift_v4i64' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <4 x i64> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <4 x i64> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift ; ; XOPAVX2-LABEL: 'constant_shift_v4i64' @@ -1134,7 +1130,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift ; ; XOPAVX1-LABEL: 'constant_shift_v8i64' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <8 x i64> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <8 x i64> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift ; ; XOPAVX2-LABEL: 'constant_shift_v8i64' @@ -1170,13 +1166,9 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift ; -; XOPAVX1-LABEL: 'constant_shift_v4i32' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <4 x i32> %a, -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift -; -; XOPAVX2-LABEL: 'constant_shift_v4i32' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift +; XOP-LABEL: 'constant_shift_v4i32' +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift ; ; AVX512-LABEL: 'constant_shift_v4i32' ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, @@ -1208,7 +1200,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift ; ; XOPAVX1-LABEL: 'constant_shift_v8i32' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <8 x i32> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <8 x i32> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift ; ; XOPAVX2-LABEL: 'constant_shift_v8i32' @@ -1245,7 +1237,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift ; ; XOPAVX1-LABEL: 'constant_shift_v16i32' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <16 x i32> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i32> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift ; ; XOPAVX2-LABEL: 'constant_shift_v16i32' @@ -1278,7 +1270,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; ; XOP-LABEL: 'constant_shift_v8i16' -; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <8 x i16> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; ; AVX512F-LABEL: 'constant_shift_v8i16' @@ -1323,7 +1315,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; ; XOP-LABEL: 'constant_shift_v16i16' -; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <16 x i16> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i16> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; ; AVX512F-LABEL: 'constant_shift_v16i16' @@ -1368,7 +1360,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; ; XOP-LABEL: 'constant_shift_v32i16' -; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <32 x i16> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <32 x i16> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; ; AVX512F-LABEL: 'constant_shift_v32i16' @@ -1409,7 +1401,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; ; XOP-LABEL: 'constant_shift_v16i8' -; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i8> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; ; AVX512-LABEL: 'constant_shift_v16i8' @@ -1442,7 +1434,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; ; XOP-LABEL: 'constant_shift_v32i8' -; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <32 x i8> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; ; AVX512-LABEL: 'constant_shift_v32i8' @@ -1475,7 +1467,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; ; XOP-LABEL: 'constant_shift_v64i8' -; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <64 x i8> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a, ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; ; AVX512F-LABEL: 'constant_shift_v64i8' @@ -1533,7 +1525,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v4i64' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <4 x i64> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <4 x i64> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v4i64' @@ -1566,7 +1558,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v8i64' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <8 x i64> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <8 x i64> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v8i64' @@ -1612,7 +1604,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v8i32' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <8 x i32> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <8 x i32> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v8i32' @@ -1645,7 +1637,7 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v16i32' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <16 x i32> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i32> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v16i32' @@ -1691,7 +1683,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v16i16' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <16 x i16> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i16> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v16i16' @@ -1724,7 +1716,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v32i16' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <32 x i16> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <32 x i16> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v32i16' @@ -1756,9 +1748,21 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { } define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { -; CHECK-LABEL: 'splatconstant_shift_v16i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; SSE-LABEL: 'splatconstant_shift_v16i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX-LABEL: 'splatconstant_shift_v16i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; XOP-LABEL: 'splatconstant_shift_v16i8' +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i8> %a, +; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512-LABEL: 'splatconstant_shift_v16i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; ; BTVER2-LABEL: 'splatconstant_shift_v16i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, @@ -1782,7 +1786,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v32i8' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <32 x i8> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v32i8' @@ -1815,7 +1819,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; ; XOPAVX1-LABEL: 'splatconstant_shift_v64i8' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = lshr <64 x i8> %a, +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a, ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; ; XOPAVX2-LABEL: 'splatconstant_shift_v64i8' diff --git a/test/Analysis/DemandedBits/intrinsics.ll b/test/Analysis/DemandedBits/intrinsics.ll index 48f6d4624422dfb97d2ba40bfadcd21c981f561e..6987f14f8b1ba814e1079a17bb9d2f265d00c252 100644 --- a/test/Analysis/DemandedBits/intrinsics.ll +++ b/test/Analysis/DemandedBits/intrinsics.ll @@ -23,3 +23,82 @@ define i8 @test_bitreverse(i32 %x) { } declare i32 @llvm.bitreverse.i32(i32) +; Funnel shifts +declare i32 @llvm.fshl.i32(i32, i32, i32) +declare i33 @llvm.fshr.i33(i33, i33, i33) + +; CHECK-DAG: DemandedBits: 0xff for %x2 = or i32 %x, 1 +; CHECK-DAG: DemandedBits: 0xff000000 for %y2 = or i32 %y, 1 +; CHECK-DAG: DemandedBits: 0xffff for %z = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 8) +; CHECK-DAG: DemandedBits: 0xffffffff for %r = and i32 %z, 65535 +define i32 @test_fshl(i32 %x, i32 %y) { + %x2 = or i32 %x, 1 + %y2 = or i32 %y, 1 + %z = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 8) + %r = and i32 %z, 65535 + ret i32 %r +} + +; CHECK-DAG: DemandedBits: 0xff for %x2 = or i33 %x, 1 +; CHECK-DAG: DemandedBits: 0x1fe000000 for %y2 = or i33 %y, 1 +; CHECK-DAG: DemandedBits: 0xffff for %z = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 25) +; CHECK-DAG: DemandedBits: 0x1ffffffff for %r = and i33 %z, 65535 +define i33 @test_fshr(i33 %x, i33 %y) { + %x2 = or i33 %x, 1 + %y2 = or i33 %y, 1 + %z = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 25) + %r = and i33 %z, 65535 + ret i33 %r +} + +; CHECK-DAG: DemandedBits: 0xffff for %x2 = or i32 %x, 1 +; CHECK-DAG: DemandedBits: 0x0 for %y2 = or i32 %y, 1 +; CHECK-DAG: DemandedBits: 0xffff for %z = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 0) +; CHECK-DAG: DemandedBits: 0xffffffff for %r = and i32 %z, 65535 +define i32 @test_fshl_zero_shift(i32 %x, i32 %y) { + %x2 = or i32 %x, 1 + %y2 = or i32 %y, 1 + %z = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 0) + %r = and i32 %z, 65535 + ret i32 %r +} + +; CHECK-DAG: DemandedBits: 0x0 for %x2 = or i33 %x, 1 +; CHECK-DAG: DemandedBits: 0xffff for %y2 = or i33 %y, 1 +; CHECK-DAG: DemandedBits: 0xffff for %z = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 33) +; CHECK-DAG: DemandedBits: 0x1ffffffff for %r = and i33 %z, 65535 +define i33 @test_fshr_full_shift(i33 %x, i33 %y) { + %x2 = or i33 %x, 1 + %y2 = or i33 %y, 1 + %z = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 33) + %r = and i33 %z, 65535 + ret i33 %r +} + +; CHECK-DAG: DemandedBits: 0xffffffff for %x2 = or i32 %x, 1 +; CHECK-DAG: DemandedBits: 0xffffffff for %y2 = or i32 %y, 1 +; CHECK-DAG: DemandedBits: 0x1f for %z2 = or i32 %z, 1 +; CHECK-DAG: DemandedBits: 0xffff for %f = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 %z2) +; CHECK-DAG: DemandedBits: 0xffffffff for %r = and i32 %f, 65535 +define i32 @test_fshl_pow2_bitwidth(i32 %x, i32 %y, i32 %z) { + %x2 = or i32 %x, 1 + %y2 = or i32 %y, 1 + %z2 = or i32 %z, 1 + %f = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 %z2) + %r = and i32 %f, 65535 + ret i32 %r +} + +; CHECK-DAG: DemandedBits: 0x1ffffffff for %x2 = or i33 %x, 1 +; CHECK-DAG: DemandedBits: 0x1ffffffff for %y2 = or i33 %y, 1 +; CHECK-DAG: DemandedBits: 0x1ffffffff for %z2 = or i33 %z, 1 +; CHECK-DAG: DemandedBits: 0xffff for %f = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 %z2) +; CHECK-DAG: DemandedBits: 0x1ffffffff for %r = and i33 %f, 65535 +define i33 @test_fshr_non_pow2_bitwidth(i33 %x, i33 %y, i33 %z) { + %x2 = or i33 %x, 1 + %y2 = or i33 %y, 1 + %z2 = or i33 %z, 1 + %f = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 %z2) + %r = and i33 %f, 65535 + ret i33 %r +} diff --git a/test/Analysis/DemandedBits/vectors.ll b/test/Analysis/DemandedBits/vectors.ll new file mode 100644 index 0000000000000000000000000000000000000000..36cde05fb7c6221c94834919ad8fde9062db2d80 --- /dev/null +++ b/test/Analysis/DemandedBits/vectors.ll @@ -0,0 +1,136 @@ +; RUN: opt -S -demanded-bits -analyze < %s | FileCheck %s +; RUN: opt -S -disable-output -passes="print" < %s 2>&1 | FileCheck %s + +; CHECK-DAG: DemandedBits: 0xff00 for %x = or <2 x i32> %a, zeroinitializer +; CHECK-DAG: DemandedBits: 0xff00 for %y = or <2 x i32> %b, zeroinitializer +; CHECK-DAG: DemandedBits: 0xff00 for %z = or <2 x i32> %x, %y +; CHECK-DAG: DemandedBits: 0xff for %u = lshr <2 x i32> %z, +; CHECK-DAG: DemandedBits: 0xff for %r = trunc <2 x i32> %u to <2 x i8> +define <2 x i8> @test_basic(<2 x i32> %a, <2 x i32> %b) { + %x = or <2 x i32> %a, zeroinitializer + %y = or <2 x i32> %b, zeroinitializer + %z = or <2 x i32> %x, %y + %u = lshr <2 x i32> %z, + %r = trunc <2 x i32> %u to <2 x i8> + ret <2 x i8> %r +} + +; Vector-specific instructions + +; CHECK-DAG: DemandedBits: 0xff for %x = or <2 x i32> %a, zeroinitializer +; CHECK-DAG: DemandedBits: 0xf0 for %z = extractelement <2 x i32> %x, i32 1 +; CHECK-DAG: DemandedBits: 0xf for %y = extractelement <2 x i32> %x, i32 0 +; CHECK-DAG: DemandedBits: 0xffffffff for %u = and i32 %y, 15 +; CHECK-DAG: DemandedBits: 0xffffffff for %v = and i32 %z, 240 +; CHECK-DAG: DemandedBits: 0xffffffff for %r = or i32 %u, %v +define i32 @test_extractelement(<2 x i32> %a) { + %x = or <2 x i32> %a, zeroinitializer + %y = extractelement <2 x i32> %x, i32 0 + %z = extractelement <2 x i32> %x, i32 1 + %u = and i32 %y, 15 + %v = and i32 %z, 240 + %r = or i32 %u, %v + ret i32 %r +} + +; CHECK-DAG: DemandedBits: 0xff for %x = or i32 %a, 0 +; CHECK-DAG: DemandedBits: 0xff for %y = or i32 %b, 0 +; CHECK-DAG: DemandedBits: 0xff for %z = insertelement <2 x i32> undef, i32 %x, i32 0 +; CHECK-DAG: DemandedBits: 0xff for %u = insertelement <2 x i32> %z, i32 %y, i32 1 +; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %u, +define <2 x i32> @test_insertelement(i32 %a, i32 %b) { + %x = or i32 %a, 0 + %y = or i32 %b, 0 + %z = insertelement <2 x i32> undef, i32 %x, i32 0 + %u = insertelement <2 x i32> %z, i32 %y, i32 1 + %r = and <2 x i32> %u, + ret <2 x i32> %r +} + +; CHECK-DAG: DemandedBits: 0xff for %x = or <2 x i32> %a, zeroinitializer +; CHECK-DAG: DemandedBits: 0xff for %y = or <2 x i32> %b, zeroinitializer +; CHECK-DAG: DemandedBits: 0xff for %z = shufflevector <2 x i32> %x, <2 x i32> %y, <3 x i32> +; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <3 x i32> %z, +define <3 x i32> @test_shufflevector(<2 x i32> %a, <2 x i32> %b) { + %x = or <2 x i32> %a, zeroinitializer + %y = or <2 x i32> %b, zeroinitializer + %z = shufflevector <2 x i32> %x, <2 x i32> %y, <3 x i32> + %r = and <3 x i32> %z, + ret <3 x i32> %r +} + +; Shifts with splat shift amounts + +; CHECK-DAG: DemandedBits: 0xf for %x = or <2 x i32> %a, zeroinitializer +; CHECK-DAG: DemandedBits: 0xf0 for %y = shl <2 x i32> %x, +; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %y, +define <2 x i32> @test_shl(<2 x i32> %a) { + %x = or <2 x i32> %a, zeroinitializer + %y = shl <2 x i32> %x, + %r = and <2 x i32> %y, + ret <2 x i32> %r +} + +; CHECK-DAG: DemandedBits: 0xf00 for %x = or <2 x i32> %a, zeroinitializer +; CHECK-DAG: DemandedBits: 0xf0 for %y = ashr <2 x i32> %x, +; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %y, +define <2 x i32> @test_ashr(<2 x i32> %a) { + %x = or <2 x i32> %a, zeroinitializer + %y = ashr <2 x i32> %x, + %r = and <2 x i32> %y, + ret <2 x i32> %r +} + +; CHECK-DAG: DemandedBits: 0xf00 for %x = or <2 x i32> %a, zeroinitializer +; CHECK-DAG: DemandedBits: 0xf0 for %y = lshr <2 x i32> %x, +; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %y, +define <2 x i32> @test_lshr(<2 x i32> %a) { + %x = or <2 x i32> %a, zeroinitializer + %y = lshr <2 x i32> %x, + %r = and <2 x i32> %y, + ret <2 x i32> %r +} + +declare <2 x i32> @llvm.fshl.i32(<2 x i32>, <2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.fshr.i32(<2 x i32>, <2 x i32>, <2 x i32>) + +; CHECK-DAG: DemandedBits: 0xf for %x = or <2 x i32> %a, zeroinitializer +; CHECK-DAG: DemandedBits: 0xf0000000 for %y = or <2 x i32> %b, zeroinitializer +; CHECK-DAG: DemandedBits: 0xff for %z = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) +; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %z, +define <2 x i32> @test_fshl(<2 x i32> %a, <2 x i32> %b) { + %x = or <2 x i32> %a, zeroinitializer + %y = or <2 x i32> %b, zeroinitializer + %z = call <2 x i32> @llvm.fshl.i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) + %r = and <2 x i32> %z, + ret <2 x i32> %r +} + +; CHECK-DAG: DemandedBits: 0xf for %x = or <2 x i32> %a, zeroinitializer +; CHECK-DAG: DemandedBits: 0xf0000000 for %y = or <2 x i32> %b, zeroinitializer +; CHECK-DAG: DemandedBits: 0xff for %z = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) +; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %z, +define <2 x i32> @test_fshr(<2 x i32> %a, <2 x i32> %b) { + %x = or <2 x i32> %a, zeroinitializer + %y = or <2 x i32> %b, zeroinitializer + %z = call <2 x i32> @llvm.fshr.i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) + %r = and <2 x i32> %z, + ret <2 x i32> %r +} + +; FP / Int conversion. These have different input / output types. + +; CHECK-DAG: DemandedBits: 0xffffffff for %x = or <2 x i32> %a, zeroinitializer +define <2 x float> @test_uitofp(<2 x i32> %a) { + %x = or <2 x i32> %a, zeroinitializer + %r = uitofp <2 x i32> %x to <2 x float> + ret <2 x float> %r +} + +; CHECK-DAG: DemandedBits: 0xffffffff for %y = fptoui <2 x float> %x to <2 x i32> +define <2 x i32> @test_fptoui(<2 x float> %a) { + %x = fadd <2 x float> %a, + %y = fptoui <2 x float> %x to <2 x i32> + %r = and <2 x i32> %y, + ret <2 x i32> %y +} diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll new file mode 100644 index 0000000000000000000000000000000000000000..eaac9ce526c4be351ad813c4ce6392cd2e445f32 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll @@ -0,0 +1,14 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +define amdgpu_kernel void @workitem_id_x() #1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) +; CHECK-NOT: DIVERGENT: %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.readfirstlane(i32) #0 + +attributes #0 = { nounwind readnone } diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll new file mode 100644 index 0000000000000000000000000000000000000000..521d528d7952111600d3970c1d81eb3a72950713 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll @@ -0,0 +1,45 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst +define i32 @test1(i32* %ptr, i32 %val) #0 { + %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst + ret i32 %orig +} + +; CHECK: DIVERGENT: %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst +define {i32, i1} @test2(i32* %ptr, i32 %cmp, i32 %new) { + %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst + ret {i32, i1} %orig +} + +; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @test_atomic_inc_i32(i32 addrspace(1)* %ptr, i32 %val) #0 { + %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) +define i64 @test_atomic_inc_i64(i64 addrspace(1)* %ptr, i64 %val) #0 { + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) + ret i64 %ret +} + +; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @test_atomic_dec_i32(i32 addrspace(1)* %ptr, i32 %val) #0 { + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) +define i64 @test_atomic_dec_i64(i64 addrspace(1)* %ptr, i64 %val) #0 { + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false) + ret i64 %ret +} + +declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1 +declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1 +declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind argmemonly } diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll new file mode 100644 index 0000000000000000000000000000000000000000..660c4f573f1df0f8c3778032b0b066ea26035570 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll @@ -0,0 +1,26 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll new file mode 100644 index 0000000000000000000000000000000000000000..12e2b0ffd443802c9aa25231feff811cd3d1a332 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll @@ -0,0 +1,223 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; divergent loop (H

, B) +; the divergent join point in %exit is obscured by uniform control joining in %X +define amdgpu_kernel void @hidden_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %H ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (H
, B) +; the phi nodes in X and Y don't actually receive divergent values +define amdgpu_kernel void @unobserved_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unobserved_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %b, %H ] + br label %exit + +Y: + %uni.merge.y = phi i32 [ %b, %B ] + br label %exit + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) inside divergent loop (H
, B, C, D, G, L) +; the inner loop has no exit to top level. +; the outer loop becomes divergent as its exiting branch in C is control-dependent on the inner loop's divergent loop exit in D. +define amdgpu_kernel void @hidden_nestedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_nestedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %X, label %C + +C: + br i1 %uni.cond, label %Y, label %D + +D: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %C, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %D, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %B ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %C ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) in divergent loop (H
, B, C, G, L) +; the outer loop has no immediately divergent exiting edge. +; the inner exiting edge is exiting to top-level through the outer loop causing both to become divergent. +define amdgpu_kernel void @hidden_doublebreak_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_doublebreak_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %C ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %Y, label %C + +C: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %X, label %L ; two-level break +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %C, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %G ] ; temporal divergence + br label %Y +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + ret void +; CHECK: DIVERGENT: %div.merge.y = +} + +; divergent loop (G
, L) contained inside a uniform loop (H
, B, G, L , D) +define amdgpu_kernel void @hidden_containedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_containedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc.d, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + %div.merge.b = phi i32 [ 42, %H ], [ %uni.merge.g, %G ] + br label %D +; CHECK: DIVERGENT: %div.merge.b = + +G: + %uni.merge.g = phi i32 [ 123, %H ], [ %uni.inc.l, %L ] + br i1 %div.exitx, label %B, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + %uni.inc.l = add i32 %uni.merge.g, 1 + br i1 %uni.cond, label %G, label %D + +D: + %uni.inc.d = add i32 %uni.merge.h, 1 + br i1 %uni.cond, label %X, label %H + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %uni.inc.d, %D ] + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll new file mode 100644 index 0000000000000000000000000000000000000000..1b84fd5262df5d9a1132832047cfcfafdf3c0610 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll @@ -0,0 +1,13 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 +define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 { + %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 + store i32 %swizzle, i32 addrspace(1)* %out, align 4 + ret void +} + +declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind readnone convergent } diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll new file mode 100644 index 0000000000000000000000000000000000000000..9a94328be67658de4feebce8681425c29af9ac96 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll @@ -0,0 +1,48 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; This test contains an unstructured loop. +; +-------------- entry ----------------+ +; | | +; V V +; i1 = phi(0, i3) i2 = phi(0, i3) +; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 +; ^ | ^ +; | V | +; +-------- switch (tid / i3) ----------+ +; | +; V +; if (i3 == 5) // divergent +; because sync dependent on (tid / i3). +define i32 @unstructured_loop(i1 %entry_cond) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 +loop_entry_1: + %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j1 = add i32 %i1, 1 + br label %loop_body +loop_entry_2: + %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j2 = add i32 %i2, 2 + br label %loop_body +loop_body: + %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] + br label %loop_latch +loop_latch: + %div = sdiv i32 %tid, %i3 + switch i32 %div, label %branch [ i32 1, label %loop_entry_1 + i32 2, label %loop_entry_2 ] +branch: + %cmp = icmp eq i32 %i3, 5 + br i1 %cmp, label %then, label %else +; CHECK: DIVERGENT: br i1 %cmp, +then: + ret i32 0 +else: + ret i32 1 +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll new file mode 100644 index 0000000000000000000000000000000000000000..d7e050de1f3a2b6789e9c242b8408e2a64faee49 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll @@ -0,0 +1,41 @@ +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_ps': +; CHECK: DIVERGENT: +; CHECK-NOT: %arg0 +; CHECK-NOT: %arg1 +; CHECK-NOT: %arg2 +; CHECK: <2 x i32> %arg3 +; CHECK: DIVERGENT: <3 x i32> %arg4 +; CHECK: DIVERGENT: float %arg5 +; CHECK: DIVERGENT: i32 %arg6 + +define amdgpu_ps void @test_amdgpu_ps([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_kernel': +; CHECK-NOT: %arg0 +; CHECK-NOT: %arg1 +; CHECK-NOT: %arg2 +; CHECK-NOT: %arg3 +; CHECK-NOT: %arg4 +; CHECK-NOT: %arg5 +; CHECK-NOT: %arg6 +define amdgpu_kernel void @test_amdgpu_kernel([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_c': +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +; CHECK: DIVERGENT: +define void @test_c([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { + ret void +} + +attributes #0 = { nounwind } diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg b/test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg new file mode 100644 index 0000000000000000000000000000000000000000..2a665f06be72e5515ca6e27018facb35daa201be --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll new file mode 100644 index 0000000000000000000000000000000000000000..508a25406527f4190fe68d9bf7ac57e764b8bde9 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -0,0 +1,103 @@ +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap( +define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add( +define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub( +define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin( +define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin( +define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax( +define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax( +define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and( +define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or( +define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor( +define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap( +define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 + +attributes #0 = { nounwind } diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll new file mode 100644 index 0000000000000000000000000000000000000000..97ef984dc816b63485aecaadf25bacba1be20e47 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -0,0 +1,131 @@ +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32( +define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32( +define float @image_atomic_add(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32( +define float @image_atomic_sub(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32( +define float @image_atomic_smin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32( +define float @image_atomic_umin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32( +define float @image_atomic_smax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32( +define float @image_atomic_umax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32( +define float @image_atomic_and(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32( +define float @image_atomic_or(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32( +define float @image_atomic_xor(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32( +define float @image_atomic_inc(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32( +define float @image_atomic_dec(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32( +define float @image_atomic_cmpswap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %data, i32 %cmp, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32( +define float @image_atomic_add_2d(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0 +declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0 + +attributes #0 = { nounwind } diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll new file mode 100644 index 0000000000000000000000000000000000000000..fb7c041e2d1841c7c918be1783d8be50fb377240 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +; CHECK: DIVERGENT: %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 +; CHECK: DIVERGENT: %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 +; CHECK: DIVERGENT: %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 + +; The post dominator tree does not have a root node in this case +define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 { +bb0: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tmp2 = sext i32 %tmp to i64 + %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 + %tmp6 = load volatile float, float addrspace(1)* %tmp5, align 4 + %tmp8 = fcmp olt float %tmp6, 0.000000e+00 + br i1 %tmp8, label %bb1, label %bb2 + +bb1: + %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 + br label %bb2 + +bb2: + %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4 + br label %bb1 +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll new file mode 100644 index 0000000000000000000000000000000000000000..978bc4232b1d0c05caffd9f2d069b01aa3db8fbe --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll @@ -0,0 +1,31 @@ +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; CHECK-LABEL: 'test1': +; CHECK-NEXT: DIVERGENT: i32 %bound +; CHECK: {{^ *}}%counter = +; CHECK-NEXT: DIVERGENT: %break = icmp sge i32 %counter, %bound +; CHECK-NEXT: DIVERGENT: br i1 %break, label %footer, label %body +; CHECK: {{^ *}}%counter.next = +; CHECK: {{^ *}}%counter.footer = +; CHECK: DIVERGENT: br i1 %break, label %end, label %header +; Note: %counter is not divergent! +define amdgpu_ps void @test1(i32 %bound) { +entry: + br label %header + +header: + %counter = phi i32 [ 0, %entry ], [ %counter.footer, %footer ] + %break = icmp sge i32 %counter, %bound + br i1 %break, label %footer, label %body + +body: + %counter.next = add i32 %counter, 1 + br label %footer + +footer: + %counter.footer = phi i32 [ %counter.next, %body ], [ undef, %header ] + br i1 %break, label %end, label %header + +end: + ret void +} diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll new file mode 100644 index 0000000000000000000000000000000000000000..4211ca28ad66ee8412e3f4d119b170c29466f91f --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll @@ -0,0 +1,154 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +; temporal-divergent use of value carried by divergent loop +define amdgpu_kernel void @temporal_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + %div.user = add i32 %uni.inc, 5 + ret void +} + +; temporal-divergent use of value carried by divergent loop inside a top-level loop +define amdgpu_kernel void @temporal_diverge_inloop(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_inloop': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %G + +G: + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + %div.alsouser = add i32 %uni.inc, 5 + ret void +} + + +; temporal-uniform use of a valud, definition and users are carried by a surrounding divergent loop +define amdgpu_kernel void @temporal_uniform_indivloop(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_uniform_indivloop': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %G + +G: + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + br i1 %uni.cond, label %X, label %H ; divergent branch + +X: + %uni.user = add i32 %uni.inc, 5 + %div.exity = icmp slt i32 %tid, 0 +; CHECK: DIVERGENT: %div.exity = + br i1 %div.exity, label %G, label %Y +; CHECK: DIVERGENT: br i1 %div.exity, + +Y: + %div.alsouser = add i32 %uni.inc, 5 + ret void +} + + +; temporal-divergent use of value carried by divergent loop, user is inside sibling loop +define amdgpu_kernel void @temporal_diverge_loopuser(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + br label %G + +G: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + ret void +} + +; temporal-divergent use of value carried by divergent loop, user is inside sibling loop, defs and use are carried by a uniform loop +define amdgpu_kernel void @temporal_diverge_loopuser_nested(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser_nested': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +X: + br label %G + +G: + %div.user = add i32 %uni.inc, 5 + br i1 %uni.cond, label %G, label %Y + +Y: + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll new file mode 100644 index 0000000000000000000000000000000000000000..b22c5f11abe601480ff952f64a2109d49523daff --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll @@ -0,0 +1,45 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 + +; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @workitem_id_x() #1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %id.x, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %id.y = call i32 @llvm.amdgcn.workitem.id.y() +define amdgpu_kernel void @workitem_id_y() #1 { + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %id.y, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %id.z = call i32 @llvm.amdgcn.workitem.id.z() +define amdgpu_kernel void @workitem_id_z() #1 { + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %id.z, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) +define amdgpu_kernel void @mbcnt_lo() #1 { + %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0) + store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef + ret void +} + +; CHECK: DIVERGENT: %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) +define amdgpu_kernel void @mbcnt_hi() #1 { + %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0) + store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll b/test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll new file mode 100644 index 0000000000000000000000000000000000000000..89954b6f7c06d11c80fae4257c57dd68bac16b8d --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll @@ -0,0 +1,47 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @daorder(i32 %n) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'daorder' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond = icmp slt i32 %tid, 0 + br i1 %cond, label %A, label %B ; divergent +; CHECK: DIVERGENT: br i1 %cond, +A: + %defAtA = add i32 %n, 1 ; uniform +; CHECK-NOT: DIVERGENT: %defAtA = + br label %C +B: + %defAtB = add i32 %n, 2 ; uniform +; CHECK-NOT: DIVERGENT: %defAtB = + br label %C +C: + %defAtC = phi i32 [ %defAtA, %A ], [ %defAtB, %B ] ; divergent +; CHECK: DIVERGENT: %defAtC = + br label %D + +D: + %i = phi i32 [0, %C], [ %i.inc, %E ] ; uniform +; CHECK-NOT: DIVERGENT: %i = phi + br label %E + +E: + %i.inc = add i32 %i, 1 + %loopCnt = icmp slt i32 %i.inc, %n +; CHECK-NOT: DIVERGENT: %loopCnt = + br i1 %loopCnt, label %D, label %exit + +exit: + ret i32 %n +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32)* @daorder, !"kernel", i32 1} diff --git a/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll b/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll new file mode 100644 index 0000000000000000000000000000000000000000..e2e547282205fecb15a06abfe95fc3200ea4b7b6 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll @@ -0,0 +1,175 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) +define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'no_diverge' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond = icmp slt i32 %n, 0 + br i1 %cond, label %then, label %else ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond, +then: + %a1 = add i32 %a, %tid + br label %merge +else: + %b2 = add i32 %b, %tid + br label %merge +merge: + %c = phi i32 [ %a1, %then ], [ %b2, %else ] + ret i32 %c +} + +; c = a; +; if (threadIdx.x < 5) // divergent: data dependent +; c = b; +; return c; // c is divergent: sync dependent +define i32 @sync(i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'sync' +bb1: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + %cond = icmp slt i32 %tid, 5 + br i1 %cond, label %bb2, label %bb3 +; CHECK: DIVERGENT: br i1 %cond, +bb2: + br label %bb3 +bb3: + %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid +; CHECK: DIVERGENT: %c = + ret i32 %c +} + +; c = 0; +; if (threadIdx.x >= 5) { // divergent +; c = (n < 0 ? a : b); // c here is uniform because n is uniform +; } +; // c here is divergent because it is sync dependent on threadIdx.x >= 5 +; return c; +define i32 @mixed(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'mixed' +bb1: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() + %cond = icmp slt i32 %tid, 5 + br i1 %cond, label %bb6, label %bb2 +; CHECK: DIVERGENT: br i1 %cond, +bb2: + %cond2 = icmp slt i32 %n, 0 + br i1 %cond2, label %bb4, label %bb3 +bb3: + br label %bb5 +bb4: + br label %bb5 +bb5: + %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ] +; CHECK-NOT: DIVERGENT: %c = + br label %bb6 +bb6: + %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ] +; CHECK: DIVERGENT: %c2 = + ret i32 %c2 +} + +; We conservatively treats all parameters of a __device__ function as divergent. +define i32 @device(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'device' +; CHECK: DIVERGENT: i32 %n +; CHECK: DIVERGENT: i32 %a +; CHECK: DIVERGENT: i32 %b +entry: + %cond = icmp slt i32 %n, 0 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + br label %merge +else: + br label %merge +merge: + %c = phi i32 [ %a, %then ], [ %b, %else ] + ret i32 %c +} + +; int i = 0; +; do { +; i++; // i here is uniform +; } while (i < laneid); +; return i == 10 ? 0 : 1; // i here is divergent +; +; The i defined in the loop is used outside. +define i32 @loop() { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'loop' +entry: + %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid() + br label %loop +loop: + %i = phi i32 [ 0, %entry ], [ %i1, %loop ] +; CHECK-NOT: DIVERGENT: %i = + %i1 = add i32 %i, 1 + %exit_cond = icmp sge i32 %i1, %laneid + br i1 %exit_cond, label %loop_exit, label %loop +loop_exit: + %cond = icmp eq i32 %i, 10 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + ret i32 0 +else: + ret i32 1 +} + +; Same as @loop, but the loop is in the LCSSA form. +define i32 @lcssa() { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'lcssa' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + br label %loop +loop: + %i = phi i32 [ 0, %entry ], [ %i1, %loop ] +; CHECK-NOT: DIVERGENT: %i = + %i1 = add i32 %i, 1 + %exit_cond = icmp sge i32 %i1, %tid + br i1 %exit_cond, label %loop_exit, label %loop +loop_exit: + %i.lcssa = phi i32 [ %i, %loop ] +; CHECK: DIVERGENT: %i.lcssa = + %cond = icmp eq i32 %i.lcssa, 10 + br i1 %cond, label %then, label %else +; CHECK: DIVERGENT: br i1 %cond, +then: + ret i32 0 +else: + ret i32 1 +} + +; Verifies sync-dependence is computed correctly in the absense of loops. +define i32 @sync_no_loop(i32 %arg) { +entry: + %0 = add i32 %arg, 1 + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %1 = icmp sge i32 %tid, 10 + br i1 %1, label %bb1, label %bb2 + +bb1: + br label %bb3 + +bb2: + br label %bb3 + +bb3: + %2 = add i32 %0, 2 + ; CHECK-NOT: DIVERGENT: %2 + ret i32 %2 +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0, !1, !2, !3, !4} +!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1} +!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1} +!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1} +!3 = !{i32 ()* @loop, !"kernel", i32 1} +!4 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} diff --git a/test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll b/test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll new file mode 100644 index 0000000000000000000000000000000000000000..3d61986657e0bb690bc521185d2762eba27b44c0 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret i32 %phi.ipd +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32, i32, i32)* @hidden_diverge, !"kernel", i32 1} diff --git a/test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll b/test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll new file mode 100644 index 0000000000000000000000000000000000000000..2e1686a446d4a9dc59699dc1e0d98889b27f5178 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll @@ -0,0 +1,55 @@ +; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; This test contains an unstructured loop. +; +-------------- entry ----------------+ +; | | +; V V +; i1 = phi(0, i3) i2 = phi(0, i3) +; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 +; ^ | ^ +; | V | +; +-------- switch (tid / i3) ----------+ +; | +; V +; if (i3 == 5) // divergent +; because sync dependent on (tid / i3). +define i32 @unstructured_loop(i1 %entry_cond) { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 +loop_entry_1: + %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j1 = add i32 %i1, 1 + br label %loop_body +loop_entry_2: + %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] + %j2 = add i32 %i2, 2 + br label %loop_body +loop_body: + %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] + br label %loop_latch +loop_latch: + %div = sdiv i32 %tid, %i3 + switch i32 %div, label %branch [ i32 1, label %loop_entry_1 + i32 2, label %loop_entry_2 ] +branch: + %cmp = icmp eq i32 %i3, 5 + br i1 %cmp, label %then, label %else +; CHECK: DIVERGENT: br i1 %cmp, +then: + ret i32 0 +else: + ret i32 1 +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1} diff --git a/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg b/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg new file mode 100644 index 0000000000000000000000000000000000000000..2cb98eb371b21bc47c99d369adaffefd84d4a625 --- /dev/null +++ b/test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'NVPTX' in config.root.targets: + config.unsupported = True diff --git a/test/Analysis/LazyValueAnalysis/lvi-after-jumpthreading.ll b/test/Analysis/LazyValueAnalysis/lvi-after-jumpthreading.ll index 27cd2263beaa258abb543bf67320c0411cfeeb8b..fbc2eb135d75da5bebc1d16556e4b7b5345cedd7 100644 --- a/test/Analysis/LazyValueAnalysis/lvi-after-jumpthreading.ll +++ b/test/Analysis/LazyValueAnalysis/lvi-after-jumpthreading.ll @@ -72,7 +72,7 @@ loop: %cnd1 = icmp sge i32 %iv, 0 %cnd2 = icmp sgt i32 %iv2, 0 ; CHECK: %cnd2 = icmp sgt i32 %iv2, 0 -; CHECK: ; LatticeVal for: ' %cnd = and i1 %cnd1, %cnd2' in BB: '%loop' is: overdefined +; CHECK: ; LatticeVal for: ' %cnd = and i1 %cnd1, %cnd2' in BB: '%loop' is: constantrange<-1, -1> ; CHECK-DAG: ; LatticeVal for: ' %cnd = and i1 %cnd1, %cnd2' in BB: '%backedge' is: constantrange<-1, 0> ; CHECK-DAG: ; LatticeVal for: ' %cnd = and i1 %cnd1, %cnd2' in BB: '%exit' is: overdefined ; CHECK-NEXT: %cnd = and i1 %cnd1, %cnd2 @@ -92,7 +92,7 @@ backedge: ; CHECK-NEXT: ; LatticeVal for: ' %cont2 = icmp sgt i32 %iv2.next, 0' in BB: '%backedge' is: overdefined ; CHECK-NEXT: %cont2 = icmp sgt i32 %iv2.next, 0 %cont2 = icmp sgt i32 %iv2.next, 0 -; CHECK-NEXT: ; LatticeVal for: ' %cont = and i1 %cont1, %cont2' in BB: '%backedge' is: overdefined +; CHECK-NEXT: ; LatticeVal for: ' %cont = and i1 %cont1, %cont2' in BB: '%backedge' is: constantrange<-1, -1> ; CHECK-NEXT: %cont = and i1 %cont1, %cont2 %cont = and i1 %cont1, %cont2 br i1 %cont, label %loop, label %exit diff --git a/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll b/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll index 0d0fe65694c800503c88f990aa35a2ae7db997a9..cb1b7edb3d10127c624f1108fe5df2bca13f71c7 100644 --- a/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll +++ b/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll @@ -39,7 +39,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ; CHECK-NEXT: Group ; CHECK-NEXT: (Low: %b High: ((4 * (1 umax %x)) + %b)) ; CHECK-NEXT: Member: {%b,+,4}<%for.body> -; CHECK: Multiple stores to invariant address were not found in loop. +; CHECK: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: ; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: ; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll index f24211d1e0dfe1aae30949aa33c9fff51c8e9887..611e957168ffdcc5a34e7132c324f991221e019e 100644 --- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll +++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll @@ -14,14 +14,14 @@ ; The LAA with the new PM is a loop pass so we go from inner to outer loops. ; OLDPM: for.cond1.preheader: -; OLDPM: Multiple stores to invariant address were not found in loop. +; OLDPM: Non vectorizable stores to invariant address were not found in loop. ; OLDPM: for.body3: -; OLDPM: Multiple stores to invariant address were found in loop. +; OLDPM: Non vectorizable stores to invariant address were found in loop. ; NEWPM: for.body3: -; NEWPM: Multiple stores to invariant address were found in loop. +; NEWPM: Non vectorizable stores to invariant address were found in loop. ; NEWPM: for.cond1.preheader: -; NEWPM: Multiple stores to invariant address were not found in loop. +; NEWPM: Non vectorizable stores to invariant address were not found in loop. define i32 @foo(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 { entry: diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll index 07bcdcc5c669cf224334fdeec852e031ba36e7b2..d21cc6926c3b131775dc2c3f056a17f83bc8e702 100644 --- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll +++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll @@ -10,8 +10,8 @@ ; } ; } -; CHECK: Multiple stores to invariant address were not found in loop. -; CHECK-NOT: Multiple stores to invariant address were found in loop. +; CHECK: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NOT: Non vectorizable stores to invariant address were found in loop. define i32 @foo(i32* nocapture readonly %var1, i32* nocapture %var2, i32 %itr) #0 { diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll index 8d7452471f5c9d93e1a4d0117da8d2bba8d7c9db..b25d79b3d0394bb66be3515afd0f962b0704ccec 100644 --- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll +++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll @@ -10,7 +10,7 @@ ; } ; } -; CHECK: Multiple stores to invariant address were not found in loop. +; CHECK: Non vectorizable stores to invariant address were not found in loop. define void @foo(i32* nocapture %var1, i32* nocapture %var2, i32 %itr) #0 { entry: diff --git a/test/Analysis/ScalarEvolution/pr28705.ll b/test/Analysis/ScalarEvolution/pr28705.ll index 8fbc08e3ca63ec85eadf29e2d4fb98b36d031996..9a8487a6c662172a3743de91dd51a4ccf67a9dd7 100644 --- a/test/Analysis/ScalarEvolution/pr28705.ll +++ b/test/Analysis/ScalarEvolution/pr28705.ll @@ -1,11 +1,11 @@ ; PR28705 ; RUN: opt < %s -indvars -S | FileCheck %s -; Check IndVarSimplify replaces the exitval use of the induction var "%inc.i.i" -; with "%.sroa.speculated + 1". +; Check IndVarSimplify doesn't replace external use of the induction var +; "%inc.i.i" with "%.sroa.speculated + 1" because it is not profitable. ; ; CHECK-LABEL: @foo( -; CHECK: %[[EXIT:.+]] = sub i32 %.sroa.speculated, -1 +; CHECK: %[[EXIT:.+]] = phi i32 [ %inc.i.i, %for.body650 ] ; CHECK: %DB.sroa.9.0.lcssa = phi i32 [ 1, %entry ], [ %[[EXIT]], %loopexit ] ; define void @foo(i32 %sub.ptr.div.i, i8* %ref.i1174) local_unnamed_addr { diff --git a/test/Analysis/ScalarEvolution/solve-quadratic.ll b/test/Analysis/ScalarEvolution/solve-quadratic.ll index cc4f807108dc8187f82b5824b7b8f18e2bb39022..5d5e02efd0e4a39e511b7acf51125a6dbc4ae8e8 100644 --- a/test/Analysis/ScalarEvolution/solve-quadratic.ll +++ b/test/Analysis/ScalarEvolution/solve-quadratic.ll @@ -41,13 +41,13 @@ ; {14,+,14,+,14} -> X=0, Y=14, Z=14 ; ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test01' -; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {-2,+,-2,+,-2}<%loop> -; CHECK: GetQuadraticEquation: addrec coeff bw: 4 -; CHECK: GetQuadraticEquation: equation -2x^2 + -2x + -4, coeff bw: 5, multiplied by 2 -; CHECK: SolveQuadraticAddRecExact: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving -2x^2 + -2x + -4, rw:5 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 2x^2 + 2x + -28, rw:5 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 4 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {-2,+,-2,+,-2}<%loop> +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 4 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation -2x^2 + -2x + -4, coeff bw: 5, multiplied by 2 +; CHECK: {{.*}}SolveQuadraticAddRecExact{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving -2x^2 + -2x + -4, rw:5 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 2x^2 + 2x + -28, rw:5 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 4 ; CHECK: Loop %loop: Unpredictable backedge-taken count define signext i32 @test01() { entry: @@ -73,25 +73,25 @@ exit: ; {-72,+,-36,+,1} -> X=-37, Y=-35, Z=1 ; ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test02': -; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {0,+,-36,+,1}<%loop> -; CHECK: GetQuadraticEquation: addrec coeff bw: 32 -; CHECK: GetQuadraticEquation: equation 1x^2 + -73x + 0, coeff bw: 33, multiplied by 2 -; CHECK: SolveQuadraticAddRecRange: solving for signed overflow -; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -73x + 4294967154, rw:32 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -73x + -142, rw:32 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 75 -; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -73x + 4294967154, rw:33 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -73x + -4294967438, rw:33 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 65573 -; CHECK: SolveQuadraticAddRecRange: solving for signed overflow -; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -73x + -146, rw:32 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -73x + -146, rw:32 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 75 -; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -73x + -146, rw:33 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -73x + -146, rw:33 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 75 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {0,+,-36,+,1}<%loop> +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 32 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 1x^2 + -73x + 0, coeff bw: 33, multiplied by 2 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -73x + 4294967154, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -73x + -142, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 75 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -73x + 4294967154, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -73x + -4294967438, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 65573 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -73x + -146, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -73x + -146, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 75 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -73x + -146, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -73x + -146, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 75 ; CHECK: Loop %loop: backedge-taken count is 75 define signext i32 @test02() { entry: @@ -117,13 +117,13 @@ exit: ; {17,+,-1,+,2} -> X=-3, Y=20, Z=2 ; ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test03': -; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {1,+,-1,+,2}<%loop> -; CHECK: GetQuadraticEquation: addrec coeff bw: 4 -; CHECK: GetQuadraticEquation: equation 2x^2 + -4x + 2, coeff bw: 5, multiplied by 2 -; CHECK: SolveQuadraticAddRecExact: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving 2x^2 + -4x + 2, rw:5 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 2x^2 + -4x + 2, rw:5 -; CHECK: SolveQuadraticEquationWrap: solution (root): 1 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {1,+,-1,+,2}<%loop> +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 4 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 2x^2 + -4x + 2, coeff bw: 5, multiplied by 2 +; CHECK: {{.*}}SolveQuadraticAddRecExact{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 2x^2 + -4x + 2, rw:5 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 2x^2 + -4x + 2, rw:5 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (root): 1 ; CHECK: Loop %loop: backedge-taken count is 1 define signext i32 @test03() { entry: @@ -174,32 +174,32 @@ exit: ; ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test04': -; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {0,+,3,+,4}<%loop> -; CHECK: GetQuadraticEquation: addrec coeff bw: 16 -; CHECK: GetQuadraticEquation: equation 4x^2 + 2x + 0, coeff bw: 17, multiplied by 2 -; CHECK: SolveQuadraticAddRecRange: solving for signed overflow -; CHECK: SolveQuadraticEquationWrap: solving 4x^2 + 2x + 2, rw:16 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 4x^2 + 2x + -65534, rw:16 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 128 -; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving 4x^2 + 2x + 2, rw:17 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 4x^2 + 2x + -131070, rw:17 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 181 -; CHECK: SolveQuadraticAddRecRange: solving for signed overflow -; CHECK: SolveQuadraticEquationWrap: solving 4x^2 + 2x + 2, rw:16 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 4x^2 + 2x + -65534, rw:16 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 128 -; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving 4x^2 + 2x + 2, rw:17 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 4x^2 + 2x + -131070, rw:17 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 181 -; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {1,+,3,+,4}<%loop> -; CHECK: GetQuadraticEquation: addrec coeff bw: 16 -; CHECK: GetQuadraticEquation: equation 4x^2 + 2x + 2, coeff bw: 17, multiplied by 2 -; CHECK: SolveQuadraticAddRecExact: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving 4x^2 + 2x + 2, rw:17 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 4x^2 + 2x + -131070, rw:17 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 181 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {0,+,3,+,4}<%loop> +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 16 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 4x^2 + 2x + 0, coeff bw: 17, multiplied by 2 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 4x^2 + 2x + 2, rw:16 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 4x^2 + 2x + -65534, rw:16 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 128 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 4x^2 + 2x + 2, rw:17 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 4x^2 + 2x + -131070, rw:17 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 181 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 4x^2 + 2x + 2, rw:16 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 4x^2 + 2x + -65534, rw:16 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 128 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 4x^2 + 2x + 2, rw:17 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 4x^2 + 2x + -131070, rw:17 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 181 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {1,+,3,+,4}<%loop> +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 16 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 4x^2 + 2x + 2, coeff bw: 17, multiplied by 2 +; CHECK: {{.*}}SolveQuadraticAddRecExact{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 4x^2 + 2x + 2, rw:17 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 4x^2 + 2x + -131070, rw:17 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 181 ; CHECK: Loop %loop: Unpredictable backedge-taken count. define signext i32 @test04() { entry: @@ -224,25 +224,25 @@ exit: ; A case with signed arithmetic, but unsigned comparison. ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test05': -; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {0,+,-1,+,-1}<%loop> -; CHECK: GetQuadraticEquation: addrec coeff bw: 32 -; CHECK: GetQuadraticEquation: equation -1x^2 + -1x + 0, coeff bw: 33, multiplied by 2 -; CHECK: SolveQuadraticAddRecRange: solving for signed overflow -; CHECK: SolveQuadraticEquationWrap: solving -1x^2 + -1x + 4, rw:32 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + 1x + -4, rw:32 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 2 -; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving -1x^2 + -1x + 4, rw:33 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + 1x + -4, rw:33 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 2 -; CHECK: SolveQuadraticAddRecRange: solving for signed overflow -; CHECK: SolveQuadraticEquationWrap: solving -1x^2 + -1x + -2, rw:32 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + 1x + -4294967294, rw:32 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 65536 -; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving -1x^2 + -1x + -2, rw:33 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + 1x + -8589934590, rw:33 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 92682 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {0,+,-1,+,-1}<%loop> +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 32 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation -1x^2 + -1x + 0, coeff bw: 33, multiplied by 2 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving -1x^2 + -1x + 4, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + 1x + -4, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 2 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving -1x^2 + -1x + 4, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + 1x + -4, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 2 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving -1x^2 + -1x + -2, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + 1x + -4294967294, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 65536 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving -1x^2 + -1x + -2, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + 1x + -8589934590, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 92682 ; CHECK: Loop %loop: backedge-taken count is 2 define signext i32 @test05() { @@ -268,25 +268,25 @@ exit: ; A test that used to crash with one of the earlier versions of the code. ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test06': -; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {0,+,-99999,+,1}<%loop> -; CHECK: GetQuadraticEquation: addrec coeff bw: 32 -; CHECK: GetQuadraticEquation: equation 1x^2 + -199999x + 0, coeff bw: 33, multiplied by 2 -; CHECK: SolveQuadraticAddRecRange: solving for signed overflow -; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -199999x + -4294967294, rw:32 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -199999x + 2, rw:32 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 1 -; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -199999x + -4294967294, rw:33 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -199999x + 4294967298, rw:33 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 24469 -; CHECK: SolveQuadraticAddRecRange: solving for signed overflow -; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -199999x + -12, rw:32 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -199999x + 4294967284, rw:32 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 24469 -; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving 1x^2 + -199999x + -12, rw:33 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 1x^2 + -199999x + 8589934580, rw:33 -; CHECK: SolveQuadraticEquationWrap: solution (wrap): 62450 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {0,+,-99999,+,1}<%loop> +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 32 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 1x^2 + -199999x + 0, coeff bw: 33, multiplied by 2 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -199999x + -4294967294, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -199999x + 2, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 1 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -199999x + -4294967294, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -199999x + 4294967298, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 24469 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -199999x + -12, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -199999x + 4294967284, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 24469 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 1x^2 + -199999x + -12, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 1x^2 + -199999x + 8589934580, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solution (wrap): 62450 ; CHECK: Loop %loop: backedge-taken count is 24469 define signext i32 @test06() { entry: @@ -315,32 +315,32 @@ exit: ; solves the equation exactly, or changes the sign of it between n and n+1. ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test07': -; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {0,+,40811489,+,532052752}<%loop> -; CHECK: GetQuadraticEquation: addrec coeff bw: 32 -; CHECK: GetQuadraticEquation: equation 532052752x^2 + -450429774x + 0, coeff bw: 33, multiplied by 2 -; CHECK: SolveQuadraticAddRecRange: solving for signed overflow -; CHECK: SolveQuadraticEquationWrap: solving 532052752x^2 + -450429774x + 71188414, rw:32 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:32 -; CHECK: SolveQuadraticEquationWrap: no valid solution -; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving 532052752x^2 + -450429774x + 71188414, rw:33 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33 -; CHECK: SolveQuadraticEquationWrap: no valid solution -; CHECK: SolveQuadraticAddRecRange: solving for signed overflow -; CHECK: SolveQuadraticEquationWrap: solving 532052752x^2 + -450429774x + 71188414, rw:32 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:32 -; CHECK: SolveQuadraticEquationWrap: no valid solution -; CHECK: SolveQuadraticAddRecRange: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving 532052752x^2 + -450429774x + 71188414, rw:33 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33 -; CHECK: SolveQuadraticEquationWrap: no valid solution -; CHECK: GetQuadraticEquation: analyzing quadratic addrec: {35594207,+,40811489,+,532052752}<%loop> -; CHECK: GetQuadraticEquation: addrec coeff bw: 32 -; CHECK: GetQuadraticEquation: equation 532052752x^2 + -450429774x + 71188414, coeff bw: 33, multiplied by 2 -; CHECK: SolveQuadraticAddRecExact: solving for unsigned overflow -; CHECK: SolveQuadraticEquationWrap: solving 532052752x^2 + -450429774x + 71188414, rw:33 -; CHECK: SolveQuadraticEquationWrap: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33 -; CHECK: SolveQuadraticEquationWrap: no valid solution +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {0,+,40811489,+,532052752}<%loop> +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 32 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 532052752x^2 + -450429774x + 0, coeff bw: 33, multiplied by 2 +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 532052752x^2 + -450429774x + 71188414, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: no valid solution +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 532052752x^2 + -450429774x + 71188414, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: no valid solution +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for signed overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 532052752x^2 + -450429774x + 71188414, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:32 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: no valid solution +; CHECK: {{.*}}SolveQuadraticAddRecRange{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 532052752x^2 + -450429774x + 71188414, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: no valid solution +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {35594207,+,40811489,+,532052752}<%loop> +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 32 +; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 532052752x^2 + -450429774x + 71188414, coeff bw: 33, multiplied by 2 +; CHECK: {{.*}}SolveQuadraticAddRecExact{{.*}}: solving for unsigned overflow +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: solving 532052752x^2 + -450429774x + 71188414, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: updated coefficients 532052752x^2 + -450429774x + 71188414, rw:33 +; CHECK: {{.*}}SolveQuadraticEquationWrap{{.*}}: no valid solution ; CHECK: Loop %loop: Unpredictable backedge-taken count. define signext i32 @test07() { entry: diff --git a/test/Analysis/StackSafetyAnalysis/Inputs/ipa-alias.ll b/test/Analysis/StackSafetyAnalysis/Inputs/ipa-alias.ll new file mode 100644 index 0000000000000000000000000000000000000000..bd290a8321aa382f9753648ce9cf17a95759a88c --- /dev/null +++ b/test/Analysis/StackSafetyAnalysis/Inputs/ipa-alias.ll @@ -0,0 +1,18 @@ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@InterposableAliasWrite1 = linkonce dso_local alias void(i8*), void(i8*)* @Write1 + +@PreemptableAliasWrite1 = dso_preemptable alias void(i8*), void(i8*)* @Write1 +@AliasToPreemptableAliasWrite1 = dso_local alias void(i8*), void(i8*)* @PreemptableAliasWrite1 + +@AliasWrite1 = dso_local alias void(i8*), void(i8*)* @Write1 + +@BitcastAliasWrite1 = dso_local alias void(i32*), bitcast (void(i8*)* @Write1 to void(i32*)*) +@AliasToBitcastAliasWrite1 = dso_local alias void(i8*), bitcast (void(i32*)* @BitcastAliasWrite1 to void(i8*)*) + +define dso_local void @Write1(i8* %p) { +entry: + store i8 0, i8* %p, align 1 + ret void +} diff --git a/test/Analysis/StackSafetyAnalysis/Inputs/ipa.ll b/test/Analysis/StackSafetyAnalysis/Inputs/ipa.ll new file mode 100644 index 0000000000000000000000000000000000000000..fb789345692d93535db92b96a8aa17e570992e0a --- /dev/null +++ b/test/Analysis/StackSafetyAnalysis/Inputs/ipa.ll @@ -0,0 +1,118 @@ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @Write1(i8* %p) { +entry: + store i8 0, i8* %p, align 1 + ret void +} + +define dso_local void @Write4(i8* %p) { +entry: + %0 = bitcast i8* %p to i32* + store i32 0, i32* %0, align 1 + ret void +} + +define dso_local void @Write4_2(i8* %p, i8* %q) { +entry: + %0 = bitcast i8* %p to i32* + store i32 0, i32* %0, align 1 + %1 = bitcast i8* %q to i32* + store i32 0, i32* %1, align 1 + ret void +} + +define dso_local void @Write8(i8* %p) { +entry: + %0 = bitcast i8* %p to i64* + store i64 0, i64* %0, align 1 + ret void +} + +define dso_local i8* @WriteAndReturn8(i8* %p) { +entry: + store i8 0, i8* %p, align 1 + ret i8* %p +} + +declare dso_local void @ExternalCall(i8* %p) + +define dso_preemptable void @PreemptableWrite1(i8* %p) { +entry: + store i8 0, i8* %p, align 1 + ret void +} + +define linkonce dso_local void @InterposableWrite1(i8* %p) { +entry: + store i8 0, i8* %p, align 1 + ret void +} + +define dso_local i8* @ReturnDependent(i8* %p) { +entry: + %p2 = getelementptr i8, i8* %p, i64 2 + ret i8* %p2 +} + +; access range [2, 6) +define dso_local void @Rec0(i8* %p) { +entry: + %p1 = getelementptr i8, i8* %p, i64 2 + call void @Write4(i8* %p1) + ret void +} + +; access range [3, 7) +define dso_local void @Rec1(i8* %p) { +entry: + %p1 = getelementptr i8, i8* %p, i64 1 + call void @Rec0(i8* %p1) + ret void +} + +; access range [-2, 2) +define dso_local void @Rec2(i8* %p) { +entry: + %p1 = getelementptr i8, i8* %p, i64 -5 + call void @Rec1(i8* %p1) + ret void +} + +; Recursive function that passes %acc unchanged => access range [0, 4). +define dso_local void @RecursiveNoOffset(i32* %p, i32 %size, i32* %acc) { +entry: + %cmp = icmp eq i32 %size, 0 + br i1 %cmp, label %return, label %if.end + +if.end: + %0 = load i32, i32* %p, align 4 + %1 = load i32, i32* %acc, align 4 + %add = add nsw i32 %1, %0 + store i32 %add, i32* %acc, align 4 + %add.ptr = getelementptr inbounds i32, i32* %p, i64 1 + %sub = add nsw i32 %size, -1 + tail call void @RecursiveNoOffset(i32* %add.ptr, i32 %sub, i32* %acc) + ret void + +return: + ret void +} + +; Recursive function that advances %acc on each iteration => access range unlimited. +define dso_local void @RecursiveWithOffset(i32 %size, i32* %acc) { +entry: + %cmp = icmp eq i32 %size, 0 + br i1 %cmp, label %return, label %if.end + +if.end: + store i32 0, i32* %acc, align 4 + %acc2 = getelementptr inbounds i32, i32* %acc, i64 1 + %sub = add nsw i32 %size, -1 + tail call void @RecursiveWithOffset(i32 %sub, i32* %acc2) + ret void + +return: + ret void +} diff --git a/test/Analysis/StackSafetyAnalysis/ipa-alias.ll b/test/Analysis/StackSafetyAnalysis/ipa-alias.ll new file mode 100644 index 0000000000000000000000000000000000000000..d77e7a68925a28e41cb71dc00b7a5ec57064031c --- /dev/null +++ b/test/Analysis/StackSafetyAnalysis/ipa-alias.ll @@ -0,0 +1,133 @@ +; Test IPA over a single combined file +; RUN: llvm-as %s -o %t0.bc +; RUN: llvm-as %S/Inputs/ipa-alias.ll -o %t1.bc +; RUN: llvm-link %t0.bc %t1.bc -o %t.combined.bc +; RUN: opt -S -analyze -stack-safety-local %t.combined.bc | FileCheck %s --check-prefixes=CHECK,LOCAL +; RUN: opt -S -passes="print" -disable-output %t.combined.bc 2>&1 | FileCheck %s --check-prefixes=CHECK,LOCAL +; RUN: opt -S -analyze -stack-safety %t.combined.bc | FileCheck %s --check-prefixes=CHECK,GLOBAL +; RUN: opt -S -passes="print-stack-safety" -disable-output %t.combined.bc 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare void @PreemptableAliasWrite1(i8* %p) +declare void @AliasToPreemptableAliasWrite1(i8* %p) + +declare void @InterposableAliasWrite1(i8* %p) +; Aliases to interposable aliases are not allowed + +declare void @AliasWrite1(i8* %p) + +declare void @BitcastAliasWrite1(i32* %p) +declare void @AliasToBitcastAliasWrite1(i8* %p) + +; Call to dso_preemptable alias to a dso_local aliasee +define void @PreemptableAliasCall() { +; CHECK-LABEL: @PreemptableAliasCall dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x1[1]: empty-set, @PreemptableAliasWrite1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x1[1]: full-set, @PreemptableAliasWrite1(arg0, [0,1)){{$}} +; LOCAL-NEXT: x2[1]: empty-set, @AliasToPreemptableAliasWrite1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x2[1]: [0,1), @AliasToPreemptableAliasWrite1(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x1 = alloca i8 + call void @PreemptableAliasWrite1(i8* %x1) + + %x2 = alloca i8 +; Alias to a preemptable alias is not preemptable + call void @AliasToPreemptableAliasWrite1(i8* %x2) + ret void +} + +; Call to an interposable alias to a non-interposable aliasee +define void @InterposableAliasCall() { +; CHECK-LABEL: @InterposableAliasCall dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[1]: empty-set, @InterposableAliasWrite1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x[1]: full-set, @InterposableAliasWrite1(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i8 +; ThinLTO can resolve the prevailing implementation for interposable definitions. + call void @InterposableAliasWrite1(i8* %x) + ret void +} + +; Call to a dso_local/non-interposable alias/aliasee +define void @AliasCall() { +; CHECK-LABEL: @AliasCall dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[1]: empty-set, @AliasWrite1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x[1]: [0,1), @AliasWrite1(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i8 + call void @AliasWrite1(i8* %x) + ret void +} + +; Call to a bitcasted dso_local/non-interposable alias/aliasee +define void @BitcastAliasCall() { +; CHECK-LABEL: @BitcastAliasCall dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x1[4]: empty-set, @BitcastAliasWrite1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x1[4]: [0,1), @BitcastAliasWrite1(arg0, [0,1)){{$}} +; LOCAL-NEXT: x2[1]: empty-set, @AliasToBitcastAliasWrite1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x2[1]: [0,1), @AliasToBitcastAliasWrite1(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x1 = alloca i32 + call void @BitcastAliasWrite1(i32* %x1) + %x2 = alloca i8 + call void @AliasToBitcastAliasWrite1(i8* %x2) + ret void +} + +; The rest is from Inputs/ipa-alias.ll + +; CHECK-LABEL: @Write1{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: p[]: [0,1){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; GLOBAL-LABEL: @InterposableAliasWrite1 interposable{{$}} +; GLOBAL-NEXT: args uses: +; GLOBAL-NEXT: []: [0,1), @Write1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: allocas uses: +; GLOBAL-NOT: ]: + +; GLOBAL-LABEL: @PreemptableAliasWrite1 dso_preemptable{{$}} +; GLOBAL-NEXT: args uses: +; GLOBAL-NEXT: []: [0,1), @Write1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: allocas uses: +; GLOBAL-NOT: ]: + +; GLOBAL-LABEL: @AliasToPreemptableAliasWrite1{{$}} +; GLOBAL-NEXT: args uses: +; GLOBAL-NEXT: []: [0,1), @Write1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: allocas uses: +; GLOBAL-NOT: ]: + +; GLOBAL-LABEL: @AliasWrite1{{$}} +; GLOBAL-NEXT: args uses: +; GLOBAL-NEXT: []: [0,1), @Write1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: allocas uses: +; GLOBAL-NOT: ]: + +; GLOBAL-LABEL: @BitcastAliasWrite1{{$}} +; GLOBAL-NEXT: args uses: +; GLOBAL-NEXT: []: [0,1), @Write1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: allocas uses: +; GLOBAL-NOT: ]: + +; GLOBAL-LABEL: @AliasToBitcastAliasWrite1{{$}} +; GLOBAL-NEXT: args uses: +; GLOBAL-NEXT: []: [0,1), @Write1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: allocas uses: +; GLOBAL-NOT: ]: diff --git a/test/Analysis/StackSafetyAnalysis/ipa.ll b/test/Analysis/StackSafetyAnalysis/ipa.ll new file mode 100644 index 0000000000000000000000000000000000000000..6791dd0866b85bb1bf7d49b48d539e091406575f --- /dev/null +++ b/test/Analysis/StackSafetyAnalysis/ipa.ll @@ -0,0 +1,448 @@ +; RUN: llvm-as %s -o %t0.bc +; RUN: llvm-as %S/Inputs/ipa.ll -o %t1.bc +; RUN: llvm-link -disable-lazy-loading %t0.bc %t1.bc -o %t.combined.bc +; RUN: opt -S -analyze -stack-safety-local %t.combined.bc | FileCheck %s --check-prefixes=CHECK,LOCAL +; RUN: opt -S -passes="print" -disable-output %t.combined.bc 2>&1 | FileCheck %s --check-prefixes=CHECK,LOCAL +; RUN: opt -S -analyze -stack-safety %t.combined.bc | FileCheck %s --check-prefixes=CHECK,GLOBAL +; RUN: opt -S -passes="print-stack-safety" -disable-output %t.combined.bc 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare void @Write1(i8* %p) +declare void @Write4(i8* %p) +declare void @Write4_2(i8* %p, i8* %q) +declare void @Write8(i8* %p) +declare dso_local i8* @WriteAndReturn8(i8* %p) +declare dso_local void @ExternalCall(i8* %p) +declare void @PreemptableWrite1(i8* %p) +declare void @InterposableWrite1(i8* %p) +declare i8* @ReturnDependent(i8* %p) +declare void @Rec2(i8* %p) +declare void @RecursiveNoOffset(i32* %p, i32 %size, i32* %acc) +declare void @RecursiveWithOffset(i32 %size, i32* %acc) + +; Basic out-of-bounds. +define void @f1() { +; CHECK-LABEL: @f1 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[4]: empty-set, @Write8(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x[4]: [0,8), @Write8(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void @Write8(i8* %x1) + ret void +} + +; Basic in-bounds. +define void @f2() { +; CHECK-LABEL: @f2 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[4]: empty-set, @Write1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x[4]: [0,1), @Write1(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void @Write1(i8* %x1) + ret void +} + +; Another basic in-bounds. +define void @f3() { +; CHECK-LABEL: @f3 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[4]: empty-set, @Write4(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x[4]: [0,4), @Write4(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void @Write4(i8* %x1) + ret void +} + +; In-bounds with offset. +define void @f4() { +; CHECK-LABEL: @f4 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[4]: empty-set, @Write1(arg0, [1,2)){{$}} +; GLOBAL-NEXT: x[4]: [1,2), @Write1(arg0, [1,2)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 1 + call void @Write1(i8* %x2) + ret void +} + +; Out-of-bounds with offset. +define void @f5() { +; CHECK-LABEL: @f5 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: empty-set, @Write4(arg0, [1,2)){{$}} +; GLOBAL-NEXT: [1,5), @Write4(arg0, [1,2)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 1 + call void @Write4(i8* %x2) + ret void +} + +; External call. +define void @f6() { +; CHECK-LABEL: @f6 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[4]: empty-set, @ExternalCall(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x[4]: full-set, @ExternalCall(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void @ExternalCall(i8* %x1) + ret void +} + +; Call to dso_preemptable function +define void @PreemptableCall() { +; CHECK-LABEL: @PreemptableCall dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[4]: empty-set, @PreemptableWrite1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x[4]: full-set, @PreemptableWrite1(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void @PreemptableWrite1(i8* %x1) + ret void +} + +; Call to function with interposable linkage +define void @InterposableCall() { +; CHECK-LABEL: @InterposableCall dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[4]: empty-set, @InterposableWrite1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x[4]: full-set, @InterposableWrite1(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void @InterposableWrite1(i8* %x1) + ret void +} + +; Call to function with private linkage +define void @PrivateCall() { +; CHECK-LABEL: @PrivateCall dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[4]: empty-set, @PrivateWrite1(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x[4]: [0,1), @PrivateWrite1(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void @PrivateWrite1(i8* %x1) + ret void +} + +define private void @PrivateWrite1(i8* %p) { +; CHECK-LABEL: @PrivateWrite1{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: p[]: [0,1){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: +entry: + store i8 0, i8* %p, align 1 + ret void +} + +; Caller returns a dependent value. +; FIXME: alloca considered unsafe even if the return value is unused. +define void @f7() { +; CHECK-LABEL: @f7 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[4]: empty-set, @ReturnDependent(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x[4]: full-set, @ReturnDependent(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %x2 = call i8* @ReturnDependent(i8* %x1) + ret void +} + +define void @f8left() { +; CHECK-LABEL: @f8left dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[8]: empty-set, @Rec2(arg0, [2,3)){{$}} +; GLOBAL-NEXT: x[8]: [0,4), @Rec2(arg0, [2,3)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x1 = bitcast i64* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 2 +; 2 + [-2, 2) = [0, 4) => OK + call void @Rec2(i8* %x2) + ret void +} + +define void @f8right() { +; CHECK-LABEL: @f8right dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[8]: empty-set, @Rec2(arg0, [6,7)){{$}} +; GLOBAL-NEXT: x[8]: [4,8), @Rec2(arg0, [6,7)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x1 = bitcast i64* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 6 +; 6 + [-2, 2) = [4, 8) => OK + call void @Rec2(i8* %x2) + ret void +} + +define void @f8oobleft() { +; CHECK-LABEL: @f8oobleft dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[8]: empty-set, @Rec2(arg0, [1,2)){{$}} +; GLOBAL-NEXT: x[8]: [-1,3), @Rec2(arg0, [1,2)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x1 = bitcast i64* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 1 +; 1 + [-2, 2) = [-1, 3) => NOT OK + call void @Rec2(i8* %x2) + ret void +} + +define void @f8oobright() { +; CHECK-LABEL: @f8oobright dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[8]: empty-set, @Rec2(arg0, [7,8)){{$}} +; GLOBAL-NEXT: x[8]: [5,9), @Rec2(arg0, [7,8)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x1 = bitcast i64* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 7 +; 7 + [-2, 2) = [5, 9) => NOT OK + call void @Rec2(i8* %x2) + ret void +} + +define void @TwoArguments() { +; CHECK-LABEL: @TwoArguments dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[8]: empty-set, @Write4_2(arg1, [0,1)), @Write4_2(arg0, [4,5)){{$}} +; GLOBAL-NEXT: x[8]: [0,8), @Write4_2(arg1, [0,1)), @Write4_2(arg0, [4,5)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x1 = bitcast i64* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 4 + call void @Write4_2(i8* %x2, i8* %x1) + ret void +} + +define void @TwoArgumentsOOBOne() { +; CHECK-LABEL: @TwoArgumentsOOBOne dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[8]: empty-set, @Write4_2(arg1, [0,1)), @Write4_2(arg0, [5,6)){{$}} +; GLOBAL-NEXT: x[8]: [0,9), @Write4_2(arg1, [0,1)), @Write4_2(arg0, [5,6)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x1 = bitcast i64* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 5 + call void @Write4_2(i8* %x2, i8* %x1) + ret void +} + +define void @TwoArgumentsOOBOther() { +; CHECK-LABEL: @TwoArgumentsOOBOther dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[8]: empty-set, @Write4_2(arg1, [-1,0)), @Write4_2(arg0, [4,5)){{$}} +; GLOBAL-NEXT: x[8]: [-1,8), @Write4_2(arg1, [-1,0)), @Write4_2(arg0, [4,5)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x0 = bitcast i64* %x to i8* + %x1 = getelementptr i8, i8* %x0, i64 -1 + %x2 = getelementptr i8, i8* %x0, i64 4 + call void @Write4_2(i8* %x2, i8* %x1) + ret void +} + +define void @TwoArgumentsOOBBoth() { +; CHECK-LABEL: @TwoArgumentsOOBBoth dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[8]: empty-set, @Write4_2(arg1, [-1,0)), @Write4_2(arg0, [5,6)){{$}} +; GLOBAL-NEXT: x[8]: [-1,9), @Write4_2(arg1, [-1,0)), @Write4_2(arg0, [5,6)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x0 = bitcast i64* %x to i8* + %x1 = getelementptr i8, i8* %x0, i64 -1 + %x2 = getelementptr i8, i8* %x0, i64 5 + call void @Write4_2(i8* %x2, i8* %x1) + ret void +} + +define i32 @TestRecursiveNoOffset(i32* %p, i32 %size) { +; CHECK-LABEL: @TestRecursiveNoOffset dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; LOCAL-NEXT: p[]: empty-set, @RecursiveNoOffset(arg0, [0,1)){{$}} +; GLOBAL-NEXT: p[]: full-set, @RecursiveNoOffset(arg0, [0,1)){{$}} +; CHECK-NEXT: size[]: empty-set, @RecursiveNoOffset(arg1, [0,1)){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: sum[4]: [0,4), @RecursiveNoOffset(arg2, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %sum = alloca i32, align 4 + %0 = bitcast i32* %sum to i8* + store i32 0, i32* %sum, align 4 + call void @RecursiveNoOffset(i32* %p, i32 %size, i32* %sum) + %1 = load i32, i32* %sum, align 4 + ret i32 %1 +} + +define void @TestRecursiveWithOffset(i32 %size) { +; CHECK-LABEL: @TestRecursiveWithOffset dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: size[]: empty-set, @RecursiveWithOffset(arg0, [0,1)){{$}} +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: sum[64]: empty-set, @RecursiveWithOffset(arg1, [0,1)){{$}} +; GLOBAL-NEXT: sum[64]: full-set, @RecursiveWithOffset(arg1, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %sum = alloca i32, i64 16, align 4 + call void @RecursiveWithOffset(i32 %size, i32* %sum) + ret void +} + +; FIXME: IPA should detect that access is safe +define void @TestUpdateArg() { +; CHECK-LABEL: @TestUpdateArg dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[16]: empty-set, @WriteAndReturn8(arg0, [0,1)){{$}} +; GLOBAL-NEXT: x[16]: full-set, @WriteAndReturn8(arg0, [0,1)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i8, i64 16, align 4 + %0 = call i8* @WriteAndReturn8(i8* %x) + ret void +} + +; The rest is from Inputs/ipa.ll + +; CHECK-LABEL: @Write1{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: p[]: [0,1){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @Write4{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: p[]: [0,4){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @Write4_2{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: p[]: [0,4){{$}} +; CHECK-NEXT: q[]: [0,4){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @Write8{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: p[]: [0,8){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @WriteAndReturn8{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: p[]: full-set{{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @PreemptableWrite1 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: p[]: [0,1){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @InterposableWrite1 interposable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: p[]: [0,1){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @ReturnDependent{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: p[]: full-set{{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @Rec0{{$}} +; CHECK-NEXT: args uses: +; LOCAL-NEXT: p[]: empty-set, @Write4(arg0, [2,3)){{$}} +; GLOBAL-NEXT: p[]: [2,6), @Write4(arg0, [2,3)){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @Rec1{{$}} +; CHECK-NEXT: args uses: +; LOCAL-NEXT: p[]: empty-set, @Rec0(arg0, [1,2)){{$}} +; GLOBAL-NEXT: p[]: [3,7), @Rec0(arg0, [1,2)){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @Rec2{{$}} +; CHECK-NEXT: args uses: +; LOCAL-NEXT: p[]: empty-set, @Rec1(arg0, [-5,-4)){{$}} +; GLOBAL-NEXT: p[]: [-2,2), @Rec1(arg0, [-5,-4)){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @RecursiveNoOffset{{$}} +; CHECK-NEXT: args uses: +; LOCAL-NEXT: p[]: [0,4), @RecursiveNoOffset(arg0, [4,5)){{$}} +; GLOBAL-NEXT: p[]: full-set, @RecursiveNoOffset(arg0, [4,5)){{$}} +; CHECK-NEXT: size[]: empty-set, @RecursiveNoOffset(arg1, [4294967295,4294967296)){{$}} +; CHECK-NEXT: acc[]: [0,4), @RecursiveNoOffset(arg2, [0,1)){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + +; CHECK-LABEL: @RecursiveWithOffset{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: size[]: empty-set, @RecursiveWithOffset(arg0, [4294967295,4294967296)){{$}} +; LOCAL-NEXT: acc[]: [0,4), @RecursiveWithOffset(arg1, [4,5)){{$}} +; GLOBAL-NEXT: acc[]: full-set, @RecursiveWithOffset(arg1, [4,5)){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: diff --git a/test/Analysis/StackSafetyAnalysis/local.ll b/test/Analysis/StackSafetyAnalysis/local.ll new file mode 100644 index 0000000000000000000000000000000000000000..814b97d29638493eeba1b728da03a55b3c0bae06 --- /dev/null +++ b/test/Analysis/StackSafetyAnalysis/local.ll @@ -0,0 +1,351 @@ +; RUN: opt -S -analyze -stack-safety-local < %s | FileCheck %s --check-prefixes=CHECK,LOCAL +; RUN: opt -S -passes="print" -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,LOCAL +; RUN: opt -S -analyze -stack-safety < %s | FileCheck %s --check-prefixes=CHECK,GLOBAL +; RUN: opt -S -passes="print-stack-safety" -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@sink = global i8* null, align 8 + +declare void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 %len, i1 %isvolatile) +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i1 %isvolatile) +declare void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i1 %isvolatile) + +; Address leaked. +define void @LeakAddress() { +; CHECK-LABEL: @LeakAddress dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: full-set{{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + store i8* %x1, i8** @sink, align 8 + ret void +} + +define void @StoreInBounds() { +; CHECK-LABEL: @StoreInBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [0,1){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + store i8 0, i8* %x1, align 1 + ret void +} + +define void @StoreInBounds2() { +; CHECK-LABEL: @StoreInBounds2 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [0,4){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + store i32 0, i32* %x, align 4 + ret void +} + +define void @StoreInBounds3() { +; CHECK-LABEL: @StoreInBounds3 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [2,3){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 2 + store i8 0, i8* %x2, align 1 + ret void +} + +; FIXME: ScalarEvolution does not look through ptrtoint/inttoptr. +define void @StoreInBounds4() { +; CHECK-LABEL: @StoreInBounds4 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [2,-1){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = ptrtoint i32* %x to i64 + %x2 = add i64 %x1, 2 + %x3 = inttoptr i64 %x2 to i8* + store i8 0, i8* %x3, align 1 + ret void +} + +define void @StoreOutOfBounds() { +; CHECK-LABEL: @StoreOutOfBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [2,6){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 2 + %x3 = bitcast i8* %x2 to i32* + store i32 0, i32* %x3, align 1 + ret void +} + +; There is no difference in load vs store handling. +define void @LoadInBounds() { +; CHECK-LABEL: @LoadInBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [0,1){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %v = load i8, i8* %x1, align 1 + ret void +} + +define void @LoadOutOfBounds() { +; CHECK-LABEL: @LoadOutOfBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [2,6){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 2 + %x3 = bitcast i8* %x2 to i32* + %v = load i32, i32* %x3, align 1 + ret void +} + +; Leak through ret. +define i8* @Ret() { +; CHECK-LABEL: @Ret dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: full-set{{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 2 + ret i8* %x2 +} + +declare void @Foo(i16* %p) + +define void @DirectCall() { +; CHECK-LABEL: @DirectCall dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[8]: empty-set, @Foo(arg0, [2,3)){{$}} +; GLOBAL-NEXT: x[8]: full-set, @Foo(arg0, [2,3)){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x1 = bitcast i64* %x to i16* + %x2 = getelementptr i16, i16* %x1, i64 1 + call void @Foo(i16* %x2); + ret void +} + +; Indirect calls can not be analyzed (yet). +; FIXME: %p[]: full-set looks invalid +define void @IndirectCall(void (i8*)* %p) { +; CHECK-LABEL: @IndirectCall dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: p[]: full-set{{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: full-set{{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void %p(i8* %x1); + ret void +} + +define void @NonConstantOffset(i1 zeroext %z) { +; CHECK-LABEL: @NonConstantOffset dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: z[]: full-set{{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [0,4){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %idx = select i1 %z, i64 1, i64 2 + %x2 = getelementptr i8, i8* %x1, i64 %idx + store i8 0, i8* %x2, align 1 + ret void +} + +define void @NonConstantOffsetOOB(i1 zeroext %z) { +; CHECK-LABEL: @NonConstantOffsetOOB dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: z[]: full-set{{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [0,6){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %idx = select i1 %z, i64 1, i64 4 + %x2 = getelementptr i8, i8* %x1, i64 %idx + store i8 0, i8* %x2, align 1 + ret void +} + +define void @ArrayAlloca() { +; CHECK-LABEL: @ArrayAlloca dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[40]: [36,40){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, i32 10, align 4 + %x1 = bitcast i32* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 36 + %x3 = bitcast i8* %x2 to i32* + store i32 0, i32* %x3, align 1 + ret void +} + +define void @ArrayAllocaOOB() { +; CHECK-LABEL: @ArrayAllocaOOB dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[40]: [37,41){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, i32 10, align 4 + %x1 = bitcast i32* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 37 + %x3 = bitcast i8* %x2 to i32* + store i32 0, i32* %x3, align 1 + ret void +} + +define void @DynamicAllocaUnused(i64 %size) { +; CHECK-LABEL: @DynamicAllocaUnused dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: size[]: empty-set{{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[0]: empty-set{{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, i64 %size, align 16 + ret void +} + +; Dynamic alloca with unknown size. +define void @DynamicAlloca(i64 %size) { +; CHECK-LABEL: @DynamicAlloca dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: size[]: [0,-12){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[0]: [0,4){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, i64 %size, align 16 + store i32 0, i32* %x, align 1 + ret void +} + +; Dynamic alloca with limited size. +; FIXME: could be proved safe. Implement. +define void @DynamicAllocaFiniteSizeRange(i1 zeroext %z) { +; CHECK-LABEL: @DynamicAllocaFiniteSizeRange dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: z[]: [0,-12){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[0]: [0,4){{$}} +; CHECK-NOT: ]: +entry: + %size = select i1 %z, i64 3, i64 5 + %x = alloca i32, i64 %size, align 16 + store i32 0, i32* %x, align 1 + ret void +} + +define signext i8 @SimpleLoop() { +; CHECK-LABEL: @SimpleLoop dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[10]: [0,10){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca [10 x i8], align 1 + %0 = getelementptr inbounds [10 x i8], [10 x i8]* %x, i64 0, i64 0 + %lftr.limit = getelementptr inbounds [10 x i8], [10 x i8]* %x, i64 0, i64 10 + br label %for.body + +for.body: + %sum.010 = phi i8 [ 0, %entry ], [ %add, %for.body ] + %p.09 = phi i8* [ %0, %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i8, i8* %p.09, i64 1 + %1 = load volatile i8, i8* %p.09, align 1 + %add = add i8 %1, %sum.010 + %exitcond = icmp eq i8* %incdec.ptr, %lftr.limit + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i8 %add +} + +; OOB in a loop. +define signext i8 @SimpleLoopOOB() { +; CHECK-LABEL: @SimpleLoopOOB dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[10]: [0,11){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca [10 x i8], align 1 + %0 = getelementptr inbounds [10 x i8], [10 x i8]* %x, i64 0, i64 0 + ; 11 iterations + %lftr.limit = getelementptr inbounds [10 x i8], [10 x i8]* %x, i64 0, i64 11 + br label %for.body + +for.body: + %sum.010 = phi i8 [ 0, %entry ], [ %add, %for.body ] + %p.09 = phi i8* [ %0, %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i8, i8* %p.09, i64 1 + %1 = load volatile i8, i8* %p.09, align 1 + %add = add i8 %1, %sum.010 + %exitcond = icmp eq i8* %incdec.ptr, %lftr.limit + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i8 %add +} + +; FIXME: we don't understand that %sz in the memset call is limited to 128 by the preceding check. +define dso_local void @SizeCheck(i32 %sz) { +; CHECK-LABEL: @SizeCheck{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: sz[]: [0,1){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x1[128]: full-set{{$}} +; CHECK-NOT: ]: +entry: + %x1 = alloca [128 x i8], align 16 + %x1.sub = getelementptr inbounds [128 x i8], [128 x i8]* %x1, i64 0, i64 0 + %cmp = icmp slt i32 %sz, 129 + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void @llvm.memset.p0i8.i32(i8* nonnull align 16 %x1.sub, i8 0, i32 %sz, i1 false) + br label %if.end + +if.end: + ret void +} diff --git a/test/Analysis/StackSafetyAnalysis/memintrin.ll b/test/Analysis/StackSafetyAnalysis/memintrin.ll new file mode 100644 index 0000000000000000000000000000000000000000..2eea9ea74bdf9fce190d63b5bbf4d111bd9367ee --- /dev/null +++ b/test/Analysis/StackSafetyAnalysis/memintrin.ll @@ -0,0 +1,202 @@ +; RUN: opt -S -analyze -stack-safety-local < %s | FileCheck %s +; RUN: opt -S -passes="print" -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt -S -analyze -stack-safety < %s | FileCheck %s +; RUN: opt -S -passes="print-stack-safety" -disable-output < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 %len, i1 %isvolatile) +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i1 %isvolatile) +declare void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i1 %isvolatile) + +define void @MemsetInBounds() { +; CHECK-LABEL: MemsetInBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [0,4){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 4, i1 false) + ret void +} + +; Volatile does not matter for access bounds. +define void @VolatileMemsetInBounds() { +; CHECK-LABEL: VolatileMemsetInBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [0,4){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 4, i1 true) + ret void +} + +define void @MemsetOutOfBounds() { +; CHECK-LABEL: MemsetOutOfBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [0,5){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 5, i1 false) + ret void +} + +define void @MemsetNonConst(i32 %size) { +; CHECK-LABEL: MemsetNonConst dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: size[]: [0,1){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: full-set{{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 %size, i1 false) + ret void +} + +; FIXME: memintrinsics should look at size range when possible +; Right now we refuse any non-constant size. +define void @MemsetNonConstInBounds(i1 zeroext %z) { +; CHECK-LABEL: MemsetNonConstInBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: z[]: [0,1){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: full-set{{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %size = select i1 %z, i32 3, i32 4 + call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 %size, i1 false) + ret void +} + +define void @MemcpyInBounds() { +; CHECK-LABEL: MemcpyInBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [0,4){{$}} +; CHECK-NEXT: y[4]: [0,4){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %y = alloca i32, align 4 + %x1 = bitcast i32* %x to i8* + %y1 = bitcast i32* %y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %y1, i32 4, i1 false) + ret void +} + +define void @MemcpySrcOutOfBounds() { +; CHECK-LABEL: MemcpySrcOutOfBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[8]: [0,5){{$}} +; CHECK-NEXT: y[4]: [0,5){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %y = alloca i32, align 4 + %x1 = bitcast i64* %x to i8* + %y1 = bitcast i32* %y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %y1, i32 5, i1 false) + ret void +} + +define void @MemcpyDstOutOfBounds() { +; CHECK-LABEL: MemcpyDstOutOfBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [0,5){{$}} +; CHECK-NEXT: y[8]: [0,5){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %y = alloca i64, align 4 + %x1 = bitcast i32* %x to i8* + %y1 = bitcast i64* %y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %y1, i32 5, i1 false) + ret void +} + +define void @MemcpyBothOutOfBounds() { +; CHECK-LABEL: MemcpyBothOutOfBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[4]: [0,9){{$}} +; CHECK-NEXT: y[8]: [0,9){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i32, align 4 + %y = alloca i64, align 4 + %x1 = bitcast i32* %x to i8* + %y1 = bitcast i64* %y to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %y1, i32 9, i1 false) + ret void +} + +define void @MemcpySelfInBounds() { +; CHECK-LABEL: MemcpySelfInBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[8]: [0,8){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x1 = bitcast i64* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 5 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %x2, i32 3, i1 false) + ret void +} + +define void @MemcpySelfSrcOutOfBounds() { +; CHECK-LABEL: MemcpySelfSrcOutOfBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[8]: [0,9){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x1 = bitcast i64* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 5 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x1, i8* %x2, i32 4, i1 false) + ret void +} + +define void @MemcpySelfDstOutOfBounds() { +; CHECK-LABEL: MemcpySelfDstOutOfBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[8]: [0,9){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x1 = bitcast i64* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 5 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x2, i8* %x1, i32 4, i1 false) + ret void +} + +define void @MemmoveSelfBothOutOfBounds() { +; CHECK-LABEL: MemmoveSelfBothOutOfBounds dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; CHECK-NEXT: x[8]: [0,14){{$}} +; CHECK-NOT: ]: +entry: + %x = alloca i64, align 4 + %x1 = bitcast i64* %x to i8* + %x2 = getelementptr i8, i8* %x1, i64 5 + call void @llvm.memmove.p0i8.p0i8.i32(i8* %x1, i8* %x2, i32 9, i1 false) + ret void +} diff --git a/test/Analysis/StackSafetyAnalysis/scev-udiv.ll b/test/Analysis/StackSafetyAnalysis/scev-udiv.ll new file mode 100644 index 0000000000000000000000000000000000000000..95a663a11fcc232ae28232b10e170b4220feff67 --- /dev/null +++ b/test/Analysis/StackSafetyAnalysis/scev-udiv.ll @@ -0,0 +1,65 @@ +; RUN: opt -S -analyze -stack-safety-local < %s | FileCheck %s --check-prefixes=CHECK,LOCAL +; RUN: opt -S -passes="print" -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,LOCAL +; RUN: opt -S -analyze -stack-safety < %s | FileCheck %s --check-prefixes=CHECK,GLOBAL +; RUN: opt -S -passes="print-stack-safety" -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL + +; Regression test that exercises a case when a AllocaOffsetRewritten SCEV +; could return an empty-set range. This could occur with udiv SCEVs where the +; RHS was re-written to 0. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare void @ExternalFn(i64) + +define void @Test1() { +; CHECK-LABEL: @Test1 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[1]: empty-set, @Divide1(arg0, full-set){{$}} +; GLOBAL-NEXT: x[1]: full-set, @Divide1(arg0, full-set){{$}} +; CHECK-NOT: ]: + %x = alloca i8 + %int = ptrtoint i8* %x to i64 + call void @Divide1(i64 %int) + ret void +} + +define dso_local void @Divide1(i64 %arg) { +; CHECK-LABEL: @Divide1{{$}} +; CHECK-NEXT: args uses: +; LOCAL-NEXT: arg[]: empty-set, @ExternalFn(arg0, full-set){{$}} +; GLOBAL-NEXT: arg[]: full-set, @ExternalFn(arg0, full-set){{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + %quotient = udiv i64 undef, %arg + call void @ExternalFn(i64 %quotient) + unreachable +} + +define void @Test2(i64 %arg) { +; CHECK-LABEL: @Test2 dso_preemptable{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: arg[]: empty-set{{$}} +; CHECK-NEXT: allocas uses: +; LOCAL-NEXT: x[1]: empty-set, @Divide2(arg0, full-set){{$}} +; GLOBAL-NEXT: x[1]: full-set, @Divide2(arg0, full-set){{$}} +; CHECK-NOT: ]: + %x = alloca i8 + %int = ptrtoint i8* %x to i64 + call void @Divide2(i64 %int) + ret void +} + +define dso_local void @Divide2(i64 %arg) { +; CHECK-LABEL: @Divide2{{$}} +; CHECK-NEXT: args uses: +; CHECK-NEXT: arg[]: full-set{{$}} +; CHECK-NEXT: allocas uses: +; CHECK-NOT: ]: + %x = inttoptr i64 %arg to i8* + %quotient = udiv i64 undef, %arg + %arrayidx = getelementptr i8, i8* %x, i64 %quotient + load i8, i8* %arrayidx + unreachable +} diff --git a/test/Assembler/2004-03-07-FunctionAddressAlignment.ll b/test/Assembler/2004-03-07-FunctionAddressAlignment.ll deleted file mode 100644 index 7fa0802db405c9852be6867c60785bf849fcc49a..0000000000000000000000000000000000000000 --- a/test/Assembler/2004-03-07-FunctionAddressAlignment.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llvm-as < %s | llvm-dis | not grep ptrtoint -; RUN: verify-uselistorder %s -; All of these should be eliminable - - -define i32 @foo() { - ret i32 and (i32 ptrtoint (i32()* @foo to i32), i32 1) -} - -define i32 @foo2() { - ret i32 and (i32 1, i32 ptrtoint (i32()* @foo2 to i32)) -} - -define i1 @foo3() { - ret i1 icmp ne (i1()* @foo3, i1()* null) -} diff --git a/test/Assembler/debug-info-source-invalid.ll b/test/Assembler/debug-info-source-invalid.ll new file mode 100644 index 0000000000000000000000000000000000000000..d746e9e25fc3d12556d231d521cf82a03c04461a --- /dev/null +++ b/test/Assembler/debug-info-source-invalid.ll @@ -0,0 +1,27 @@ +; RUN: llvm-as < %s 2>&1 >/dev/null | FileCheck %s + +; Ensure we reject debug info where the DIFiles of a DICompileUnit mix source +; and no-source. + +define dso_local void @foo() !dbg !5 { + ret void +} + +define dso_local void @bar() !dbg !6 { + ret void +} + +!llvm.dbg.cu = !{!4} +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 2, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} + +!2 = !DIFile(filename: "foo.c", directory: "dir", source: "void foo() { }\0A") +; CHECK: inconsistent use of embedded source +; CHECK: warning: ignoring invalid debug info +!3 = !DIFile(filename: "bar.h", directory: "dir") + +!4 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2) +!5 = distinct !DISubprogram(name: "foo", file: !2, unit: !4) +!6 = distinct !DISubprogram(name: "bar", file: !3, unit: !4) diff --git a/test/Assembler/debug-info-source.ll b/test/Assembler/debug-info-source.ll new file mode 100644 index 0000000000000000000000000000000000000000..381603ef35c38399164625dde6e0c2ebac2d0706 --- /dev/null +++ b/test/Assembler/debug-info-source.ll @@ -0,0 +1,41 @@ +; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s +; RUN: verify-uselistorder %s + +; Ensure we accept debug info where DIFiles within a DICompileUnit either all +; have source, or none have source. + +define dso_local void @foo() !dbg !6 { + ret void +} + +define dso_local void @bar() !dbg !7 { + ret void +} + +define dso_local void @baz() !dbg !9 { + ret void +} + +define dso_local void @qux() !dbg !11 { + ret void +} + +!llvm.dbg.cu = !{!0, !2} +!llvm.module.flags = !{!4, !5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1) +; CHECK: !1 = !DIFile(filename: "foo.c", directory: "dir", source: "void foo() { }\0A") +!1 = !DIFile(filename: "foo.c", directory: "dir", source: "void foo() { }\0A") +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3) +; CHECK: !3 = !DIFile(filename: "qux.h", directory: "dir") +!3 = !DIFile(filename: "qux.h", directory: "dir") +!4 = !{i32 2, !"Dwarf Version", i32 5} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = distinct !DISubprogram(name: "foo", file: !1, unit: !0) +!7 = distinct !DISubprogram(name: "bar", file: !8, unit: !0) +; CHECK: !8 = !DIFile(filename: "bar.h", directory: "dir", source: "void bar() { }\0A") +!8 = !DIFile(filename: "bar.h", directory: "dir", source: "void bar() { }\0A") +!9 = distinct !DISubprogram(name: "baz", file: !10, unit: !2) +; CHECK: !10 = !DIFile(filename: "baz.c", directory: "dir") +!10 = !DIFile(filename: "baz.c", directory: "dir") +!11 = distinct !DISubprogram(name: "qux", file: !3, unit: !2) diff --git a/test/Assembler/disubprogram.ll b/test/Assembler/disubprogram.ll index a407f3da8bd8d5747fb35e9a355e280e23ffe8c4..df97efb417c7a85051ba8402ef4f48919a5fa065 100644 --- a/test/Assembler/disubprogram.ll +++ b/test/Assembler/disubprogram.ll @@ -6,8 +6,8 @@ define void @_Z3foov() !dbg !9 { ret void } -; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14} -!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14} +; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15} +!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15} !0 = !{null} !1 = distinct !DICompositeType(tag: DW_TAG_structure_type) @@ -17,7 +17,7 @@ define void @_Z3foov() !dbg !9 { !5 = distinct !{} !6 = distinct !{} -; CHECK: !7 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !8) +; CHECK: !7 = distinct !DISubprogram(scope: null, spFlags: DISPFlagDefinition, unit: !8) !7 = distinct !DISubprogram(unit: !8) !8 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", @@ -25,10 +25,10 @@ define void @_Z3foov() !dbg !9 { isOptimized: true, flags: "-O2", splitDebugFilename: "abc.debug", emissionKind: 2) -; CHECK: !9 = !DISubprogram(scope: null, isLocal: false, isDefinition: false, isOptimized: false) +; CHECK: !9 = !DISubprogram(scope: null, spFlags: 0) !9 = !DISubprogram(isDefinition: false) -; CHECK: !10 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, file: !2, line: 7, type: !3, isLocal: true, isDefinition: true, scopeLine: 8, containingType: !4, virtuality: DW_VIRTUALITY_pure_virtual, virtualIndex: 10, thisAdjustment: 3, flags: DIFlagPrototyped | DIFlagNoReturn, isOptimized: true, unit: !8, templateParams: !5, declaration: !9, retainedNodes: !6) +; CHECK: !10 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, file: !2, line: 7, type: !3, scopeLine: 8, containingType: !4, virtualIndex: 10, thisAdjustment: 3, flags: DIFlagPrototyped | DIFlagNoReturn, spFlags: DISPFlagPureVirtual | DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !8, templateParams: !5, declaration: !9, retainedNodes: !6) !10 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, file: !2, line: 7, type: !3, isLocal: true, isDefinition: true, scopeLine: 8, @@ -63,12 +63,21 @@ define void @_Z3foov() !dbg !9 { !13 = !{!4} ; CHECK: !13 = !{!4} -; CHECK: !14 = distinct !DISubprogram(name: "foo", scope: !1, file: !2, line: 1, type: !3, isLocal: true, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !8, thrownTypes: !13) +; CHECK: !14 = distinct !DISubprogram(name: "foo", scope: !1, file: !2, line: 1, type: !3, scopeLine: 2, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !8, thrownTypes: !13) !14 = distinct !DISubprogram(name: "foo", scope: !1, file: !2, line: 1, type: !3, isLocal: true, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !8, thrownTypes: !13) -!15 = !{i32 1, !"Debug Info Version", i32 3} -!llvm.module.flags = !{!15} +; CHECK: !15 = distinct !DISubprogram({{.*}}, flags: DIFlagPrototyped, spFlags: DISPFlagPureVirtual | DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, +!15 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, + file: !2, line: 7, type: !3, scopeLine: 8, + containingType: !4, virtualIndex: 0, + flags: DIFlagPrototyped, + spFlags: DISPFlagPureVirtual | DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, + unit: !8, templateParams: !5, declaration: !9, + retainedNodes: !6) + +!16 = !{i32 1, !"Debug Info Version", i32 3} +!llvm.module.flags = !{!16} !llvm.dbg.cu = !{!8} diff --git a/test/Assembler/fast-math-flags.ll b/test/Assembler/fast-math-flags.ll index edff26e6d685f98a9af7671898428db56c6be760..1af2320f0a108658d90ee21c94e57970a6c9c8f9 100644 --- a/test/Assembler/fast-math-flags.ll +++ b/test/Assembler/fast-math-flags.ll @@ -38,8 +38,12 @@ entry: %e = frem float %x, %y ; CHECK: %e_vec = frem <3 x float> %vec, %vec %e_vec = frem <3 x float> %vec, %vec -; CHECK: ret float %e - ret float %e +; CHECK: %f = fneg float %x + %f = fneg float %x +; CHECK: %f_vec = fneg <3 x float> %vec + %f_vec = fneg <3 x float> %vec +; CHECK: ret float %f + ret float %f } ; CHECK: no_nan @@ -72,8 +76,12 @@ entry: %e = frem nnan float %x, %y ; CHECK: %e_vec = frem nnan <3 x float> %vec, %vec %e_vec = frem nnan <3 x float> %vec, %vec -; CHECK: ret float %e - ret float %e +; CHECK: %f = fneg nnan float %x + %f = fneg nnan float %x +; CHECK: %f_vec = fneg nnan <3 x float> %vec + %f_vec = fneg nnan <3 x float> %vec +; CHECK: ret float %f + ret float %f } ; CHECK: @contract( @@ -174,6 +182,10 @@ entry: %e = frem nnan nsz float %x, %y ; CHECK: %e_vec = frem nnan <3 x float> %vec, %vec %e_vec = frem nnan <3 x float> %vec, %vec -; CHECK: ret float %e - ret float %e +; CHECK: %f = fneg nnan nsz float %x + %f = fneg nnan nsz float %x +; CHECK: %f_vec = fneg fast <3 x float> %vec + %f_vec = fneg fast <3 x float> %vec +; CHECK: ret float %f + ret float %f } diff --git a/test/Assembler/invalid-disubprogram-uniqued-definition.ll b/test/Assembler/invalid-disubprogram-uniqued-definition.ll index c146883d6649f7567e0d94ff433a8d25550fbe90..6641e6dc21d72a07dfe9827c0f465285b08e68dd 100644 --- a/test/Assembler/invalid-disubprogram-uniqued-definition.ll +++ b/test/Assembler/invalid-disubprogram-uniqued-definition.ll @@ -1,4 +1,4 @@ ; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s -; CHECK: :[[@LINE+1]]:6: error: missing 'distinct', required for !DISubprogram when 'isDefinition' +; CHECK: :[[@LINE+1]]:6: error: missing 'distinct', required for !DISubprogram that is a Definition !0 = !DISubprogram(isDefinition: true) diff --git a/test/Assembler/thinlto-summary.ll b/test/Assembler/thinlto-summary.ll index 64af835ae2b101bde9c5ea62673670588060d3cc..4d1c0dd74d2bb0289ceb01f5f58a532e76ad5c3f 100644 --- a/test/Assembler/thinlto-summary.ll +++ b/test/Assembler/thinlto-summary.ll @@ -9,7 +9,7 @@ ; Check a function that makes several calls with various profile hotness, and a ; reference (also tests forward references to function and variables in calls ; and refs). -^2 = gv: (guid: 1, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 10, calls: ((callee: ^15, hotness: hot), (callee: ^17, hotness: cold), (callee: ^16, hotness: none)), refs: (^13)))) +^2 = gv: (guid: 1, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 10, calls: ((callee: ^15, hotness: hot), (callee: ^17, hotness: cold), (callee: ^16, hotness: none)), refs: (readonly ^13, ^14)))) ; Function with a call that has relative block frequency instead of profile ; hotness. @@ -24,16 +24,16 @@ ^8 = gv: (guid: 7, summaries: (function: (module: ^0, flags: (linkage: linkonce_odr, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1))) ^9 = gv: (guid: 8, summaries: (function: (module: ^0, flags: (linkage: weak_odr, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1))) ^10 = gv: (guid: 9, summaries: (function: (module: ^0, flags: (linkage: weak, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1))) -^11 = gv: (guid: 10, summaries: (variable: (module: ^0, flags: (linkage: common, notEligibleToImport: 0, live: 0, dsoLocal: 0)))) +^11 = gv: (guid: 10, summaries: (variable: (module: ^0, flags: (linkage: common, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 0)))) ; Test appending globel variable with reference (tests backward reference on ; refs). -^12 = gv: (guid: 11, summaries: (variable: (module: ^0, flags: (linkage: appending, notEligibleToImport: 0, live: 0, dsoLocal: 0), refs: (^4)))) +^12 = gv: (guid: 11, summaries: (variable: (module: ^0, flags: (linkage: appending, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 0), refs: (^4)))) ; Test a referenced global variable. -^13 = gv: (guid: 12, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0)))) +^13 = gv: (guid: 12, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 1)))) ; Test a dsoLocal variable. -^14 = gv: (guid: 13, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1)))) +^14 = gv: (guid: 13, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1), varFlags: (readonly: 0)))) ; Functions with various flag combinations (notEligibleToImport, Live, ; combinations of optional function flags). @@ -67,7 +67,7 @@ ; Make sure we get back from llvm-dis essentially what we put in via llvm-as. ; CHECK: ^0 = module: (path: "thinlto-summary1.o", hash: (1369602428, 2747878711, 259090915, 2507395659, 1141468049)) ; CHECK: ^1 = module: (path: "thinlto-summary2.o", hash: (2998369023, 4283347029, 1195487472, 2757298015, 1852134156)) -; CHECK: ^2 = gv: (guid: 1, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 10, calls: ((callee: ^15, hotness: hot), (callee: ^17, hotness: cold), (callee: ^16, hotness: none)), refs: (^13)))) +; CHECK: ^2 = gv: (guid: 1, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 10, calls: ((callee: ^15, hotness: hot), (callee: ^17, hotness: cold), (callee: ^16, hotness: none)), refs: (^14, readonly ^13)))) ; CHECK: ^3 = gv: (guid: 2, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 10, calls: ((callee: ^15))))) ; CHECK: ^4 = gv: (guid: 3, summaries: (function: (module: ^0, flags: (linkage: internal, notEligibleToImport: 0, live: 0, dsoLocal: 1), insts: 1))) ; CHECK: ^5 = gv: (guid: 4, summaries: (alias: (module: ^0, flags: (linkage: private, notEligibleToImport: 0, live: 0, dsoLocal: 1), aliasee: ^14))) @@ -76,10 +76,10 @@ ; CHECK: ^8 = gv: (guid: 7, summaries: (function: (module: ^0, flags: (linkage: linkonce_odr, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1))) ; CHECK: ^9 = gv: (guid: 8, summaries: (function: (module: ^0, flags: (linkage: weak_odr, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1))) ; CHECK: ^10 = gv: (guid: 9, summaries: (function: (module: ^0, flags: (linkage: weak, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1))) -; CHECK: ^11 = gv: (guid: 10, summaries: (variable: (module: ^0, flags: (linkage: common, notEligibleToImport: 0, live: 0, dsoLocal: 0)))) -; CHECK: ^12 = gv: (guid: 11, summaries: (variable: (module: ^0, flags: (linkage: appending, notEligibleToImport: 0, live: 0, dsoLocal: 0), refs: (^4)))) -; CHECK: ^13 = gv: (guid: 12, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0)))) -; CHECK: ^14 = gv: (guid: 13, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1)))) +; CHECK: ^11 = gv: (guid: 10, summaries: (variable: (module: ^0, flags: (linkage: common, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 0)))) +; CHECK: ^12 = gv: (guid: 11, summaries: (variable: (module: ^0, flags: (linkage: appending, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 0), refs: (^4)))) +; CHECK: ^13 = gv: (guid: 12, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), varFlags: (readonly: 1)))) +; CHECK: ^14 = gv: (guid: 13, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1), varFlags: (readonly: 0)))) ; CHECK: ^15 = gv: (guid: 14, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 1, live: 1, dsoLocal: 0), insts: 1))) ; CHECK: ^16 = gv: (guid: 15, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 1, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 0)))) ; CHECK: ^17 = gv: (guid: 16, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 0, readOnly: 1, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0), calls: ((callee: ^15))))) diff --git a/test/Bindings/llvm-c/debug_info.ll b/test/Bindings/llvm-c/debug_info.ll index 46fb0dd16ced19b4f9dbe89de087b6f0305a892e..84635f041a31976bc5b5bd5359efacfe522a7d4f 100644 --- a/test/Bindings/llvm-c/debug_info.ll +++ b/test/Bindings/llvm-c/debug_info.ll @@ -43,7 +43,7 @@ ; CHECK-NEXT: !17 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyStruct", scope: !18, file: !1, size: 192, elements: !19, runtimeLang: DW_LANG_C89, identifier: "MyStruct") ; CHECK-NEXT: !18 = !DINamespace(name: "NameSpace", scope: !6) ; CHECK-NEXT: !19 = !{!11, !11, !11} -; CHECK-NEXT: !20 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !21, isLocal: true, isDefinition: true, scopeLine: 42, isOptimized: false, unit: !0, retainedNodes: !26) +; CHECK-NEXT: !20 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !21, scopeLine: 42, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !26) ; CHECK-NEXT: !21 = !DISubroutineType(types: !22) ; CHECK-NEXT: !22 = !{!11, !11, !23} ; CHECK-NEXT: !23 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, size: 640, flags: DIFlagVector, elements: !24) diff --git a/test/Bitcode/DISubprogram-distinct-definitions.ll b/test/Bitcode/DISubprogram-distinct-definitions.ll index bcb9a4ec4da6670f9d72b4e2647fa316afd763b0..9d141d52de1c8d7306806633e504242309379ed9 100644 --- a/test/Bitcode/DISubprogram-distinct-definitions.ll +++ b/test/Bitcode/DISubprogram-distinct-definitions.ll @@ -9,6 +9,6 @@ define void @f() !dbg !3 { ret void } !0 = distinct !DICompileUnit(language: 12, file: !1, subprograms: !{!3}) !1 = !DIFile(filename: "path/to/file", directory: "/path/to/dir") -; CHECK: = distinct !DISubprogram({{.*}}, isDefinition: true +; CHECK: = distinct !DISubprogram({{.*}} DISPFlagDefinition !3 = !DISubprogram(name: "foo", isDefinition: true) !4 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/test/Bitcode/DISubprogram-v4.ll b/test/Bitcode/DISubprogram-v4.ll new file mode 100644 index 0000000000000000000000000000000000000000..ce5521002cafb989d606a7f152748a8d517f67a1 --- /dev/null +++ b/test/Bitcode/DISubprogram-v4.ll @@ -0,0 +1,86 @@ +; The .bc file was generated from this source using llvm-as from r347766. +; A 7.0 release version should work to recreate it if necessary. +; RUN: llvm-dis < %s.bc | FileCheck %s + +; CHECK: define void @_Z3foov() !dbg !9 +define void @_Z3foov() !dbg !9 { + ret void +} + +; CHECK: !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18} +!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18} + +!0 = !{null} +!1 = distinct !DICompositeType(tag: DW_TAG_structure_type) +!2 = !DIFile(filename: "path/to/file", directory: "/path/to/dir") +!3 = !DISubroutineType(types: !0) +!4 = distinct !DICompositeType(tag: DW_TAG_structure_type) +!5 = distinct !{} +!6 = distinct !{} + +; CHECK: !7 = distinct !DISubprogram(scope: null, spFlags: DISPFlagDefinition, unit: !8) +!7 = distinct !DISubprogram(unit: !8) + +!8 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", + file: !2, isOptimized: true, flags: "-O2") + +; CHECK: !9 = !DISubprogram(scope: null, spFlags: 0) +!9 = !DISubprogram(isDefinition: false) + +; CHECK: !10 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagPureVirtual | DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, +!10 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, + file: !2, line: 7, type: !3, containingType: !4, + isLocal: true, isDefinition: true, isOptimized: true, + virtuality: DW_VIRTUALITY_pure_virtual, + unit: !8) + +; CHECK: !11 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagVirtual | DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, +!11 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, + file: !2, line: 7, type: !3, containingType: !4, + isLocal: true, isDefinition: true, isOptimized: true, + virtuality: DW_VIRTUALITY_virtual, + unit: !8) + +; CHECK: !12 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, +!12 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, + file: !2, line: 7, type: !3, containingType: !4, + isLocal: true, isDefinition: true, isOptimized: true, + virtuality: DW_VIRTUALITY_none, + unit: !8) + +; CHECK: !13 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagDefinition | DISPFlagOptimized, +!13 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, + file: !2, line: 7, type: !3, containingType: !4, + isLocal: false, isDefinition: true, isOptimized: true, + unit: !8) + +; CHECK: !14 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagLocalToUnit | DISPFlagOptimized) +!14 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, + file: !2, line: 7, type: !3, containingType: !4, + isLocal: true, isDefinition: false, isOptimized: true) + +; CHECK: !15 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, +!15 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, + file: !2, line: 7, type: !3, containingType: !4, + isLocal: true, isDefinition: true, isOptimized: false, + unit: !8) + +; CHECK: !16 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagLocalToUnit) +!16 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, + file: !2, line: 7, type: !3, containingType: !4, + isLocal: true, isDefinition: false, isOptimized: false) + +; CHECK: !17 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagDefinition, +!17 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, + file: !2, line: 7, type: !3, containingType: !4, + isLocal: false, isDefinition: true, isOptimized: false, + unit: !8) + +; CHECK: !18 = distinct !DISubprogram({{.*}}, spFlags: DISPFlagOptimized) +!18 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1, + file: !2, line: 7, type: !3, containingType: !4, + isLocal: false, isDefinition: false, isOptimized: true) + +!19 = !{i32 1, !"Debug Info Version", i32 3} +!llvm.module.flags = !{!19} +!llvm.dbg.cu = !{!8} diff --git a/test/Bitcode/DISubprogram-v4.ll.bc b/test/Bitcode/DISubprogram-v4.ll.bc new file mode 100644 index 0000000000000000000000000000000000000000..392df4aae74d3fb0df0c2196191496a4eb4045dd Binary files /dev/null and b/test/Bitcode/DISubprogram-v4.ll.bc differ diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll index 8c0471a1134efd85165254b48e9bdb63ad994b8f..1d57f759391af70c5daa041a3ab660ed1b2293d5 100644 --- a/test/Bitcode/compatibility.ll +++ b/test/Bitcode/compatibility.ll @@ -762,7 +762,27 @@ define void @atomics(i32* %word) { } ;; Fast Math Flags -define void @fastmathflags(float %op1, float %op2) { +define void @fastmathflags_unop(float %op1) { + %f.nnan = fneg nnan float %op1 + ; CHECK: %f.nnan = fneg nnan float %op1 + %f.ninf = fneg ninf float %op1 + ; CHECK: %f.ninf = fneg ninf float %op1 + %f.nsz = fneg nsz float %op1 + ; CHECK: %f.nsz = fneg nsz float %op1 + %f.arcp = fneg arcp float %op1 + ; CHECK: %f.arcp = fneg arcp float %op1 + %f.contract = fneg contract float %op1 + ; CHECK: %f.contract = fneg contract float %op1 + %f.afn = fneg afn float %op1 + ; CHECK: %f.afn = fneg afn float %op1 + %f.reassoc = fneg reassoc float %op1 + ; CHECK: %f.reassoc = fneg reassoc float %op1 + %f.fast = fneg fast float %op1 + ; CHECK: %f.fast = fneg fast float %op1 + ret void +} + +define void @fastmathflags_binops(float %op1, float %op2) { %f.nnan = fadd nnan float %op1, %op2 ; CHECK: %f.nnan = fadd nnan float %op1, %op2 %f.ninf = fadd ninf float %op1, %op2 @@ -997,6 +1017,13 @@ continue: ret i32 0 } +; Instructions -- Unary Operations +define void @instructions.unops(double %op1) { + fneg double %op1 + ; CHECK: fneg double %op1 + ret void +} + ; Instructions -- Binary Operations define void @instructions.binops(i8 %op1, i8 %op2) { ; nuw x nsw diff --git a/test/Bitcode/function-encoding-rel-operands.ll b/test/Bitcode/function-encoding-rel-operands.ll index 1307dd4833770c3b5cc3816a605ec5c34760f720..9486e04a0d5aa856547affb94cc0f783c5c9027d 100644 --- a/test/Bitcode/function-encoding-rel-operands.ll +++ b/test/Bitcode/function-encoding-rel-operands.ll @@ -3,6 +3,15 @@ ; RUN: llvm-as < %s | llvm-bcanalyzer -dump | FileCheck %s ; RUN: verify-uselistorder < %s +; CHECK: FUNCTION_BLOCK +; CHECK: INST_UNOP {{.*}}op0=1 +; CHECK: INST_RET {{.*}}op0=1 +define double @test_float_unops(double %a) nounwind { + %1 = fneg double %a + ret double %1 +} + + ; CHECK: FUNCTION_BLOCK ; CHECK: INST_BINOP {{.*}}op0=1 op1=1 ; CHECK: INST_BINOP {{.*}}op0=1 op1=1 diff --git a/test/Bitcode/summary_version.ll b/test/Bitcode/summary_version.ll index b285da7a6f4e6dcd74f06d53a6931676680be278..fc3b3bd4877718f6cda3f2c1575fd9b12373119a 100644 --- a/test/Bitcode/summary_version.ll +++ b/test/Bitcode/summary_version.ll @@ -2,7 +2,7 @@ ; RUN: opt -module-summary %s -o - | llvm-bcanalyzer -dump | FileCheck %s ; CHECK: +; CHECK: diff --git a/test/Bitcode/thinlto-alias.ll b/test/Bitcode/thinlto-alias.ll index 05de932faeebed7a5a2136520dcff4891854dd46..835d720c69e5d76564c025cca1f31e235b81e034 100644 --- a/test/Bitcode/thinlto-alias.ll +++ b/test/Bitcode/thinlto-alias.ll @@ -20,7 +20,7 @@ ; CHECK-NEXT: +; CHECK-NEXT: ; CHECK-NEXT: ; CHECK: ; COMBINED-NEXT: -; COMBINED-NEXT: +; COMBINED-NEXT: ; COMBINED-NEXT: +; CHECK-NEXT: ; CHECK-NEXT: ; CHECK-NEXT: diff --git a/test/Bitcode/thinlto-function-summary-callgraph-cast.ll b/test/Bitcode/thinlto-function-summary-callgraph-cast.ll index 45801c9a74dd20a9961c477f5840c441a2ceda12..79644403d38cf4edcabc29a52e06d96d8a452194 100644 --- a/test/Bitcode/thinlto-function-summary-callgraph-cast.ll +++ b/test/Bitcode/thinlto-function-summary-callgraph-cast.ll @@ -6,9 +6,9 @@ ; CHECK: +; CHECK-NEXT: ; "another_caller" has only references but no calls. -; CHECK-NEXT: +; CHECK-NEXT: ; CHECK-NEXT: ; CHECK-NEXT: diff --git a/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll b/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll index bb3e8e978356b9d50868c88d572eaf2fb4d47deb..e332224343e96b52ff8d0e94396ef1065e469565 100644 --- a/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll +++ b/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll @@ -17,7 +17,7 @@ ; CHECK: +; CHECK-NEXT: ; CHECK-NEXT: ; CHECK: +; COMBINED-NEXT: ; COMBINED-NEXT: ; ModuleID = 'thinlto-function-summary-callgraph.ll' diff --git a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll index fc3c5c90ab97a96f57a1446d50034e85654beb43..31c99c189ac014ae85e6bd850b9421b2d4ddef36 100644 --- a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll +++ b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll @@ -48,7 +48,7 @@ ; CHECK-NEXT: ; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123 -; CHECK-NEXT: +; CHECK-NEXT: ; CHECK-NEXT: ; CHECK: +; COMBINED-NEXT: ; COMBINED_NEXT: diff --git a/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll b/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll index d84517865a8ccbbe060408da64d6ee76e3881116..6c14465222569d449442e225b0c922c95d740e7c 100644 --- a/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll +++ b/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll @@ -13,7 +13,7 @@ ; CHECK: ; CHECK: ; op4=none1 op6=hot1 op8=cold1 op10=none2 op12=hot2 op14=cold2 op16=none3 op18=hot3 op20=cold3 op22=123 -; CHECK-NEXT: +; CHECK-NEXT: ; CHECK-NEXT: ; CHECK: +; COMBINED-NEXT: ; COMBINED_NEXT: diff --git a/test/Bitcode/thinlto-function-summary-callgraph.ll b/test/Bitcode/thinlto-function-summary-callgraph.ll index 8025eee5929ba9a6770c3aaf41ece19d584c21ae..a605b7ec2219b8c79bf73980057b71b3e5b066a9 100644 --- a/test/Bitcode/thinlto-function-summary-callgraph.ll +++ b/test/Bitcode/thinlto-function-summary-callgraph.ll @@ -17,7 +17,7 @@ ; CHECK-NEXT: ; CHECK: +; COMBINED-NEXT: ; COMBINED-NEXT: ; ModuleID = 'thinlto-function-summary-callgraph.ll' diff --git a/test/Bitcode/thinlto-function-summary-refgraph.ll b/test/Bitcode/thinlto-function-summary-refgraph.ll index 848598fa6860477a2d4ee8fdaf175920f402cb16..e8a2a770f61df0550b597a8352050a550ce96c1a 100644 --- a/test/Bitcode/thinlto-function-summary-refgraph.ll +++ b/test/Bitcode/thinlto-function-summary-refgraph.ll @@ -41,27 +41,27 @@ ; CHECK: +; CHECK-DAG: ; Function W contains a call to func3 as well as a reference to globalvar: ; op0=W op4=globalvar op5=func3 -; CHECK-DAG: +; CHECK-DAG: ; Function X contains call to foo, as well as address reference to foo ; which is in the same instruction as the call: ; op0=X op4=foo op5=foo -; CHECK-DAG: +; CHECK-DAG: ; Function Y contains call to func2, and ensures we don't incorrectly add ; a reference to it when reached while earlier analyzing the phi using its ; return value: ; op0=Y op4=func2 -; CHECK-DAG: +; CHECK-DAG: ; Function Z contains call to func2, and ensures we don't incorrectly add ; a reference to it when reached while analyzing subsequent use of its return ; value: ; op0=Z op4=func2 -; CHECK-DAG: +; CHECK-DAG: ; Variable bar initialization contains address reference to func: ; op0=bar op2=func -; CHECK-DAG: +; CHECK-DAG: ; CHECK: ; CHECK: + +; Ensure synthetic entry count flag is set on distributed index +; when option used to enable synthetic count propagation +; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \ +; RUN: -r %t.o,glob,plx -thinlto-synthesize-entry-counts \ +; RUN: -compute-dead=false +; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=HASSYNTHETIC +; HASSYNTHETIC: + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@glob = global i32 0 diff --git a/test/BugPoint/compile-custom.ll b/test/BugPoint/compile-custom.ll index cb82a01383e1120ee291a5c7f96810be092779ad..847d1184f016b6360355f59527dc87de89da46bb 100644 --- a/test/BugPoint/compile-custom.ll +++ b/test/BugPoint/compile-custom.ll @@ -1,4 +1,4 @@ -; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext --compile-custom --compile-command="%/s.py arg1 arg2" --opt-command opt --output-prefix %t %s | FileCheck %s +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext --compile-custom --compile-command="%python %/s.py arg1 arg2" --output-prefix %t %s | FileCheck %s ; REQUIRES: loadable_module ; Test that arguments are correctly passed in --compile-command. The output diff --git a/test/BugPoint/crash-narrowfunctiontest.ll b/test/BugPoint/crash-narrowfunctiontest.ll index ef78eb2b0232fb30531f6ac86807d10711e71917..d080d9dd4b0cac54d64b4fe42e9853ebddc2c2e2 100644 --- a/test/BugPoint/crash-narrowfunctiontest.ll +++ b/test/BugPoint/crash-narrowfunctiontest.ll @@ -1,6 +1,6 @@ ; Test that bugpoint can narrow down the testcase to the important function ; -; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls --opt-command opt -silence-passes > /dev/null +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null ; REQUIRES: loadable_module define i32 @foo() { ret i32 1 } diff --git a/test/BugPoint/func-attrs-keyval.ll b/test/BugPoint/func-attrs-keyval.ll new file mode 100644 index 0000000000000000000000000000000000000000..830d0964cde1a3523dcc6b8bea852268fb59b0f8 --- /dev/null +++ b/test/BugPoint/func-attrs-keyval.ll @@ -0,0 +1,11 @@ +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashfuncattr -silence-passes +; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s +; REQUIRES: loadable_module + +; CHECK: f() #[[ATTRS:[0-9]+]] +define void @f() #0 { + ret void +} + +; CHECK: attributes #[[ATTRS]] = { "bugpoint-crash"="sure" } +attributes #0 = { "bugpoint-crash"="sure" noreturn "no-frame-pointer-elim-non-leaf" } diff --git a/test/BugPoint/func-attrs.ll b/test/BugPoint/func-attrs.ll new file mode 100644 index 0000000000000000000000000000000000000000..3941e73c217c83b882c8a06f543518364ea2824c --- /dev/null +++ b/test/BugPoint/func-attrs.ll @@ -0,0 +1,11 @@ +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashfuncattr -silence-passes +; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s +; REQUIRES: loadable_module + +; CHECK: f() #[[ATTRS:[0-9]+]] +define void @f() #0 { + ret void +} + +; CHECK: attributes #[[ATTRS]] = { "bugpoint-crash" } +attributes #0 = { noinline "bugpoint-crash" "no-frame-pointer-elim-non-leaf" } diff --git a/test/BugPoint/invalid-debuginfo.ll b/test/BugPoint/invalid-debuginfo.ll index f703667982c02a79a04c8b227023ea89fa796033..2005a13b67578de17d186fd7f4163c21bf3cec91 100644 --- a/test/BugPoint/invalid-debuginfo.ll +++ b/test/BugPoint/invalid-debuginfo.ll @@ -1,4 +1,4 @@ -; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crash-too-many-cus -silence-passes --opt-command opt 2>&1 | FileCheck %s +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crash-too-many-cus -silence-passes 2>&1 | FileCheck %s ; REQUIRES: loadable_module ; CHECK: DICompileUnit not listed in llvm.dbg.cu diff --git a/test/BugPoint/metadata.ll b/test/BugPoint/metadata.ll index e5986494e16feb0184b8dea64d256b417efe316f..ac77b9e5a7d1772f6418dfae461a956418315fad 100644 --- a/test/BugPoint/metadata.ll +++ b/test/BugPoint/metadata.ll @@ -1,11 +1,11 @@ ; REQUIRES: loadable_module -; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes -disable-namedmd-remove -disable-strip-debuginfo -disable-strip-debug-types --opt-command opt > /dev/null +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes -disable-namedmd-remove -disable-strip-debuginfo -disable-strip-debug-types > /dev/null ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s ; -; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t-nodebug -bugpoint-crashcalls -silence-passes -disable-namedmd-remove --opt-command opt > /dev/null +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t-nodebug -bugpoint-crashcalls -silence-passes -disable-namedmd-remove > /dev/null ; RUN: llvm-dis %t-nodebug-reduced-simplified.bc -o - | FileCheck %s --check-prefix=NODEBUG ; -; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t-notype -bugpoint-crashcalls -silence-passes -disable-namedmd-remove -disable-strip-debuginfo --opt-command opt > /dev/null +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t-notype -bugpoint-crashcalls -silence-passes -disable-namedmd-remove -disable-strip-debuginfo > /dev/null ; RUN: llvm-dis %t-notype-reduced-simplified.bc -o - | FileCheck %s --check-prefix=NOTYPE ; ; Bugpoint should keep the call's metadata attached to the call. diff --git a/test/BugPoint/named-md.ll b/test/BugPoint/named-md.ll index 21b65af6f73eb697e22d8b13cf3c9f8eea3438e5..1ed34435fbebba40ac8b3c3c59bdf934c86cd7e4 100644 --- a/test/BugPoint/named-md.ll +++ b/test/BugPoint/named-md.ll @@ -1,4 +1,4 @@ -; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crash-too-many-cus -silence-passes -disable-strip-debuginfo --opt-command opt > /dev/null +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crash-too-many-cus -silence-passes -disable-strip-debuginfo > /dev/null ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s ; RUN-DISABLE: bugpoint -disable-namedmd-remove -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crash-too-many-cus -silence-passes > /dev/null ; RUN-DISABLE: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s diff --git a/test/BugPoint/remove_arguments_test.ll b/test/BugPoint/remove_arguments_test.ll index d5967675ef868ae77af27eac060285f83b0c2609..72be4fe559363d61da57e2a76f20d9114136880b 100644 --- a/test/BugPoint/remove_arguments_test.ll +++ b/test/BugPoint/remove_arguments_test.ll @@ -1,4 +1,4 @@ -; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes --opt-command opt +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s ; REQUIRES: loadable_module diff --git a/test/BugPoint/replace-funcs-with-null.ll b/test/BugPoint/replace-funcs-with-null.ll index b6976c9a2bbb77fe12d74cfa19c2709cc29224f2..622f9eb67a29faa7ee22256804f678f8ef5e9f2d 100644 --- a/test/BugPoint/replace-funcs-with-null.ll +++ b/test/BugPoint/replace-funcs-with-null.ll @@ -1,6 +1,6 @@ ; Test that bugpoint can reduce the set of functions by replacing them with null. ; -; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -replace-funcs-with-null -bugpoint-crash-decl-funcs -silence-passes -safe-run-llc --opt-command opt +; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -replace-funcs-with-null -bugpoint-crash-decl-funcs -silence-passes -safe-run-llc ; REQUIRES: loadable_module @foo2 = alias i32 (), i32 ()* @foo diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll index 62abf3d81d58009ef1eaf4e1a5f533fcfa23f47b..ad01264f1a43f19362aec25d92e7682a25feb39c 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll @@ -3,6 +3,9 @@ ; CHECK: name: test_stack_guard +; CHECK: frameInfo: +; CHECK: stackProtector: '%stack.0.StackGuardSlot' + ; CHECK: stack: ; CHECK: - { id: 0, name: StackGuardSlot, type: default, offset: 0, size: 8, alignment: 8, ; CHECK-NOT: id: 1 diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index 2997c5350ebc1cce4dfee1d22f3685d8dc1184a6..74de033447fcf5cc4ba95aea45da8cc81213a8f4 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -1173,12 +1173,12 @@ define void @test_constant_float(float* %addr) { ; CHECK: [[BOOLADDR:%[0-9]+]]:_(p0) = COPY $x2 ; CHECK: [[LHS:%[0-9]+]]:_(s32) = G_LOAD [[LHSADDR]](p0) ; CHECK: [[RHS:%[0-9]+]]:_(s32) = G_LOAD [[RHSADDR]](p0) -; CHECK: [[TST:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[LHS]](s32), [[RHS]] +; CHECK: [[TST:%[0-9]+]]:_(s1) = nnan ninf nsz arcp contract afn reassoc G_FCMP floatpred(oge), [[LHS]](s32), [[RHS]] ; CHECK: G_STORE [[TST]](s1), [[BOOLADDR]](p0) define void @float_comparison(float* %a.addr, float* %b.addr, i1* %bool.addr) { %a = load float, float* %a.addr %b = load float, float* %b.addr - %res = fcmp oge float %a, %b + %res = fcmp nnan ninf nsz arcp contract afn reassoc oge float %a, %b store i1 %res, i1* %bool.addr ret void } @@ -1338,9 +1338,9 @@ define float @test_pow_intrin(float %l, float %r) { ; CHECK-LABEL: name: test_pow_intrin ; CHECK: [[LHS:%[0-9]+]]:_(s32) = COPY $s0 ; CHECK: [[RHS:%[0-9]+]]:_(s32) = COPY $s1 -; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FPOW [[LHS]], [[RHS]] +; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FPOW [[LHS]], [[RHS]] ; CHECK: $s0 = COPY [[RES]] - %res = call float @llvm.pow.f32(float %l, float %r) + %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.pow.f32(float %l, float %r) ret float %res } @@ -1350,9 +1350,9 @@ define float @test_fma_intrin(float %a, float %b, float %c) { ; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0 ; CHECK: [[B:%[0-9]+]]:_(s32) = COPY $s1 ; CHECK: [[C:%[0-9]+]]:_(s32) = COPY $s2 -; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FMA [[A]], [[B]], [[C]] +; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FMA [[A]], [[B]], [[C]] ; CHECK: $s0 = COPY [[RES]] - %res = call float @llvm.fma.f32(float %a, float %b, float %c) + %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.fma.f32(float %a, float %b, float %c) ret float %res } @@ -1360,9 +1360,9 @@ declare float @llvm.exp.f32(float) define float @test_exp_intrin(float %a) { ; CHECK-LABEL: name: test_exp_intrin ; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0 -; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FEXP [[A]] +; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FEXP [[A]] ; CHECK: $s0 = COPY [[RES]] - %res = call float @llvm.exp.f32(float %a) + %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.exp.f32(float %a) ret float %res } @@ -1370,9 +1370,9 @@ declare float @llvm.exp2.f32(float) define float @test_exp2_intrin(float %a) { ; CHECK-LABEL: name: test_exp2_intrin ; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0 -; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FEXP2 [[A]] +; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FEXP2 [[A]] ; CHECK: $s0 = COPY [[RES]] - %res = call float @llvm.exp2.f32(float %a) + %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.exp2.f32(float %a) ret float %res } @@ -1380,9 +1380,9 @@ declare float @llvm.log.f32(float) define float @test_log_intrin(float %a) { ; CHECK-LABEL: name: test_log_intrin ; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0 -; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FLOG [[A]] +; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FLOG [[A]] ; CHECK: $s0 = COPY [[RES]] - %res = call float @llvm.log.f32(float %a) + %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.log.f32(float %a) ret float %res } @@ -1396,13 +1396,23 @@ define float @test_log2_intrin(float %a) { ret float %res } +declare float @llvm.log10.f32(float) +define float @test_log10_intrin(float %a) { +; CHECK-LABEL: name: test_log10_intrin +; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0 +; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FLOG10 [[A]] +; CHECK: $s0 = COPY [[RES]] + %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.log10.f32(float %a) + ret float %res +} + declare float @llvm.fabs.f32(float) define float @test_fabs_intrin(float %a) { ; CHECK-LABEL: name: test_fabs_intrin ; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0 -; CHECK: [[RES:%[0-9]+]]:_(s32) = G_FABS [[A]] +; CHECK: [[RES:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FABS [[A]] ; CHECK: $s0 = COPY [[RES]] - %res = call float @llvm.fabs.f32(float %a) + %res = call nnan ninf nsz arcp contract afn reassoc float @llvm.fabs.f32(float %a) ret float %res } @@ -1561,7 +1571,7 @@ define i32 @test_singleelementvector(i32 %elt){ define <2 x i32> @test_constantaggzerovector_v2i32() { ; CHECK-LABEL: name: test_constantaggzerovector_v2i32 ; CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 -; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32) +; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ZERO]](s32), [[ZERO]](s32) ; CHECK: $d0 = COPY [[VEC]](<2 x s32>) ret <2 x i32> zeroinitializer } @@ -1569,7 +1579,7 @@ define <2 x i32> @test_constantaggzerovector_v2i32() { define <2 x float> @test_constantaggzerovector_v2f32() { ; CHECK-LABEL: name: test_constantaggzerovector_v2f32 ; CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 -; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32) +; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ZERO]](s32), [[ZERO]](s32) ; CHECK: $d0 = COPY [[VEC]](<2 x s32>) ret <2 x float> zeroinitializer } @@ -1577,7 +1587,7 @@ define <2 x float> @test_constantaggzerovector_v2f32() { define i32 @test_constantaggzerovector_v3i32() { ; CHECK-LABEL: name: test_constantaggzerovector_v3i32 ; CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 -; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32), [[ZERO]](s32) +; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ZERO]](s32), [[ZERO]](s32), [[ZERO]](s32) ; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>) %elt = extractelement <3 x i32> zeroinitializer, i32 1 ret i32 %elt @@ -1587,7 +1597,7 @@ define <2 x i32> @test_constantdatavector_v2i32() { ; CHECK-LABEL: name: test_constantdatavector_v2i32 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 -; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32) +; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32) ; CHECK: $d0 = COPY [[VEC]](<2 x s32>) ret <2 x i32> } @@ -1597,7 +1607,7 @@ define i32 @test_constantdatavector_v3i32() { ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 -; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32), [[C3]](s32) +; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32), [[C3]](s32) ; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>) %elt = extractelement <3 x i32> , i32 1 ret i32 %elt @@ -1609,7 +1619,7 @@ define <4 x i32> @test_constantdatavector_v4i32() { ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 -; CHECK: [[VEC:%[0-9]+]]:_(<4 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32) +; CHECK: [[VEC:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32) ; CHECK: $q0 = COPY [[VEC]](<4 x s32>) ret <4 x i32> } @@ -1618,7 +1628,7 @@ define <2 x double> @test_constantdatavector_v2f64() { ; CHECK-LABEL: name: test_constantdatavector_v2f64 ; CHECK: [[FC1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 ; CHECK: [[FC2:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00 -; CHECK: [[VEC:%[0-9]+]]:_(<2 x s64>) = G_MERGE_VALUES [[FC1]](s64), [[FC2]](s64) +; CHECK: [[VEC:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FC1]](s64), [[FC2]](s64) ; CHECK: $q0 = COPY [[VEC]](<2 x s64>) ret <2 x double> } @@ -1662,7 +1672,7 @@ define <2 x i32> @test_shufflevector_s32_v2s32(i32 %arg) { ; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $w0 ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 -; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32) +; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C0]](s32), [[C0]](s32) ; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], [[MASK]](<2 x s32>) ; CHECK: $d0 = COPY [[VEC]](<2 x s32>) %vec = insertelement <1 x i32> undef, i32 %arg, i32 0 @@ -1688,7 +1698,7 @@ define <2 x i32> @test_shufflevector_v2s32_v2s32(<2 x i32> %arg) { ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 -; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32) +; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C0]](s32) ; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], [[MASK]](<2 x s32>) ; CHECK: $d0 = COPY [[VEC]](<2 x s32>) %res = shufflevector <2 x i32> %arg, <2 x i32> undef, <2 x i32> @@ -1701,7 +1711,7 @@ define i32 @test_shufflevector_v2s32_v3s32(<2 x i32> %arg) { ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 -; CHECK-DAG: [[MASK:%[0-9]+]]:_(<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32), [[C1]](s32) +; CHECK-DAG: [[MASK:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C0]](s32), [[C1]](s32) ; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], [[MASK]](<3 x s32>) ; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>) %vec = shufflevector <2 x i32> %arg, <2 x i32> undef, <3 x i32> @@ -1717,7 +1727,7 @@ define <4 x i32> @test_shufflevector_v2s32_v4s32(<2 x i32> %arg1, <2 x i32> %arg ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 -; CHECK: [[MASK:%[0-9]+]]:_(<4 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32) +; CHECK: [[MASK:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C0]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32) ; CHECK: [[VEC:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[ARG1]](<2 x s32>), [[ARG2]], [[MASK]](<4 x s32>) ; CHECK: $q0 = COPY [[VEC]](<4 x s32>) %res = shufflevector <2 x i32> %arg1, <2 x i32> %arg2, <4 x i32> @@ -1730,7 +1740,7 @@ define <2 x i32> @test_shufflevector_v4s32_v2s32(<4 x i32> %arg) { ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-DAG: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 -; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C3]](s32) +; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C3]](s32) ; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<4 x s32>), [[UNDEF]], [[MASK]](<2 x s32>) ; CHECK: $d0 = COPY [[VEC]](<2 x s32>) %res = shufflevector <4 x i32> %arg, <4 x i32> undef, <2 x i32> @@ -1758,7 +1768,7 @@ define <16 x i8> @test_shufflevector_v8s8_v16s8(<8 x i8> %arg1, <8 x i8> %arg2) ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 14 ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 -; CHECK: [[MASK:%[0-9]+]]:_(<16 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C8]](s32), [[C1]](s32), [[C9]](s32), [[C2]](s32), [[C10]](s32), [[C3]](s32), [[C11]](s32), [[C4]](s32), [[C12]](s32), [[C5]](s32), [[C13]](s32), [[C6]](s32), [[C14]](s32), [[C7]](s32), [[C15]](s32) +; CHECK: [[MASK:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[C0]](s32), [[C8]](s32), [[C1]](s32), [[C9]](s32), [[C2]](s32), [[C10]](s32), [[C3]](s32), [[C11]](s32), [[C4]](s32), [[C12]](s32), [[C5]](s32), [[C13]](s32), [[C6]](s32), [[C14]](s32), [[C7]](s32), [[C15]](s32) ; CHECK: [[VEC:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[ARG1]](<8 x s8>), [[ARG2]], [[MASK]](<16 x s32>) ; CHECK: $q0 = COPY [[VEC]](<16 x s8>) %res = shufflevector <8 x i8> %arg1, <8 x i8> %arg2, <16 x i32> @@ -1768,7 +1778,7 @@ define <16 x i8> @test_shufflevector_v8s8_v16s8(<8 x i8> %arg1, <8 x i8> %arg2) ; CHECK-LABEL: test_constant_vector ; CHECK: [[UNDEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF ; CHECK: [[F:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 -; CHECK: [[M:%[0-9]+]]:_(<4 x s16>) = G_MERGE_VALUES [[UNDEF]](s16), [[UNDEF]](s16), [[UNDEF]](s16), [[F]](s16) +; CHECK: [[M:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UNDEF]](s16), [[UNDEF]](s16), [[UNDEF]](s16), [[F]](s16) ; CHECK: $d0 = COPY [[M]](<4 x s16>) define <4 x half> @test_constant_vector() { ret <4 x half> diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-stackprotect-check.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-stackprotect-check.ll new file mode 100644 index 0000000000000000000000000000000000000000..3559c8284a46ff3a9544dcb519db5d43722ae95f --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-stackprotect-check.ll @@ -0,0 +1,50 @@ +; RUN: llc -O0 -stop-before=irtranslator -global-isel %s -o - | FileCheck %s +; RUN: llc -O0 -stop-after=irtranslator -verify-machineinstrs -global-isel %s -o - | FileCheck --check-prefixes CHECK,CHECK-MIR %s + +; Check that when using GlobalISel, the StackProtector pass currently inserts +; both prologue and epilogue instrumentation because GlobalISel does not have +; the same epilogue insertion/optimization as SelectionDAG. + +target triple = "aarch64-none-unknown-eabi" + +define void @foo() ssp { +; CHECK-LABEL: entry: +; CHECK-NEXT: %StackGuardSlot = alloca i8* +; CHECK-NEXT: %0 = call i8* @llvm.stackguard() +; CHECK-NEXT: call void @llvm.stackprotector(i8* %0, i8** %StackGuardSlot) +; CHECK-NEXT: %buf = alloca [8 x i8], align 1 +; CHECK-NEXT: %1 = call i8* @llvm.stackguard() +; CHECK-NEXT: %2 = load volatile i8*, i8** %StackGuardSlot +; CHECK-NEXT: %3 = icmp eq i8* %1, %2 +; CHECK-NEXT: br i1 %3, label %SP_return, label %CallStackCheckFailBlk, !prof !0 +; +; CHECK: SP_return: +; CHECK-NEXT: ret void +; +; CHECK: CallStackCheckFailBlk: +; CHECK-NEXT: call void @__stack_chk_fail() +; CHECK-NEXT: unreachable + +; CHECK-MIR: bb.1.entry: +; CHECK-MIR: %0:_(p0) = G_FRAME_INDEX %stack.0.StackGuardSlot +; CHECK-MIR-NEXT: %1:gpr64sp(p0) = LOAD_STACK_GUARD :: (dereferenceable invariant load 8 from @__stack_chk_guard) +; CHECK-MIR-NEXT: %2:gpr64sp(p0) = LOAD_STACK_GUARD :: (dereferenceable invariant load 8 from @__stack_chk_guard) +; CHECK-MIR-NEXT: G_STORE %2(p0), %0(p0) :: (volatile store 8 into %stack.0.StackGuardSlot) +; CHECK-MIR-NEXT: %3:_(p0) = G_FRAME_INDEX %stack.1.buf +; CHECK-MIR-NEXT: %4:gpr64sp(p0) = LOAD_STACK_GUARD :: (dereferenceable invariant load 8 from @__stack_chk_guard) +; CHECK-MIR-NEXT: %5:_(p0) = G_LOAD %0(p0) :: (volatile load 8 from %ir.StackGuardSlot) +; CHECK-MIR-NEXT: %6:_(s1) = G_ICMP intpred(eq), %4(p0), %5 +; CHECK-MIR-NEXT: G_BRCOND %6(s1), %bb.2 +; CHECK-MIR-NEXT: G_BR %bb.3 +; +; CHECK-MIR: bb.2.SP_return: +; CHECK-MIR-NEXT: RET_ReallyLR +; +; CHECK-MIR: bb.3.CallStackCheckFailBlk: +; CHECK-MIR-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp +; CHECK-MIR-NEXT: BL @__stack_chk_fail, csr_aarch64_aapcs, implicit-def $lr, implicit $sp +; CHECK-MIR-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp +entry: + %buf = alloca [8 x i8], align 1 + ret void +} diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-volatile-load-pr36018.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-volatile-load-pr36018.ll index 9bda39c9fca7fa5305b2686f5110d67e780db485..099a9e71e42dfa565aabbb48e6afef1a5f50e7bf 100644 --- a/test/CodeGen/AArch64/GlobalISel/irtranslator-volatile-load-pr36018.ll +++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-volatile-load-pr36018.ll @@ -3,7 +3,7 @@ @g = global i16 0, align 2 declare void @bar(i32) -; Check that only one load is generated. We fall back to +; Check that only one load is generated for an extending volatile load. define hidden void @foo() { ; CHECK-NOT: ldrh ; CHECK: ldrsh diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir index fe6079c0db407e686c273d2e737d75c69dbb33e4..c3773f50cb0d0204a7a0bbbeec5d46637d490372 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir @@ -95,8 +95,8 @@ body: | %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 %3:_(<2 x s64>) = COPY $q3 - %4:_(<4 x s64>) = G_MERGE_VALUES %0(<2 x s64>), %1(<2 x s64>) - %5:_(<4 x s64>) = G_MERGE_VALUES %2(<2 x s64>), %3(<2 x s64>) + %4:_(<4 x s64>) = G_CONCAT_VECTORS %0, %1 + %5:_(<4 x s64>) = G_CONCAT_VECTORS %2, %3 %6:_(<4 x s64>) = G_ADD %4, %5 %7:_(<2 x s64>), %8:_(<2 x s64>) = G_UNMERGE_VALUES %6(<4 x s64>) $q0 = COPY %7(<2 x s64>) @@ -122,12 +122,11 @@ body: | %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 %3:_(<2 x s64>) = COPY $q3 - %4:_(<6 x s64>) = G_MERGE_VALUES %0(<2 x s64>), %1(<2 x s64>), %2(<2 x s64>) - %5:_(<6 x s64>) = G_MERGE_VALUES %1(<2 x s64>), %2(<2 x s64>), %3(<2 x s64>) + %4:_(<6 x s64>) = G_CONCAT_VECTORS %0(<2 x s64>), %1(<2 x s64>), %2(<2 x s64>) + %5:_(<6 x s64>) = G_CONCAT_VECTORS %1(<2 x s64>), %2(<2 x s64>), %3(<2 x s64>) %6:_(<6 x s64>) = G_ADD %4, %5 %7:_(<2 x s64>), %8:_(<2 x s64>), %9:_(<2 x s64>) = G_UNMERGE_VALUES %6(<6 x s64>) $q0 = COPY %7(<2 x s64>) $q1 = COPY %8(<2 x s64>) $q2 = COPY %9(<2 x s64>) - ... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir b/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir new file mode 100644 index 0000000000000000000000000000000000000000..226a469682e7a853eb4ffdb1a93def69cb353d62 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-build-vector.mir @@ -0,0 +1,41 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -o - | FileCheck %s + +--- +name: legal_v4s32 +body: | + bb.0: + liveins: $w0, $w1, $w2, $w3 + ; CHECK-LABEL: name: legal_v4s32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: $q0 = COPY [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: RET_ReallyLR + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %2:_(s32) = COPY $w2 + %3:_(s32) = COPY $w3 + %4:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + $q0 = COPY %4(<4 x s32>) + RET_ReallyLR +... +--- +name: legal_v2s64 +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: legal_v2s64 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64) + ; CHECK: $q0 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; CHECK: RET_ReallyLR + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64) + $q0 = COPY %2(<2 x s64>) + RET_ReallyLR +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir index b1be33cbeb5c24939a241e407ba36e9f776554aa..a8926832e096ca586fb2a631d4197c955cbca50a 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir @@ -51,10 +51,8 @@ body: | ; CHECK: $w0 = COPY [[ASHR2]](s32) ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[TRUNC10:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C6]](s32) - ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC10]], [[COPY5]] - ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[AND3]](s32) - ; CHECK: $w0 = COPY [[COPY6]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC10]], [[C6]] + ; CHECK: $w0 = COPY [[AND3]](s32) ; CHECK: [[TRUNC11:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK: $w0 = COPY [[TRUNC11]](s32) ; CHECK: [[TRUNC12:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) @@ -118,3 +116,59 @@ body: | $w0 = COPY %27(s32) ... +--- +name: test_anyext_anyext +body: | + bb.0: + liveins: $w0 + + ; CHECK-LABEL: name: test_anyext_anyext + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: $w0 = COPY [[COPY1]](s32) + %0:_(s32) = COPY $w0 + %1:_(s1) = G_TRUNC %0(s32) + %2:_(s8) = G_ANYEXT %1(s1) + %3:_(s32) = G_ANYEXT %2(s8) + $w0 = COPY %3(s32) + +... +--- +name: test_anyext_sext +body: | + bb.0: + liveins: $w0 + + ; CHECK-LABEL: name: test_anyext_sext + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]] + ; CHECK: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]] + ; CHECK: $w0 = COPY [[ASHR]](s32) + %0:_(s32) = COPY $w0 + %1:_(s1) = G_TRUNC %0(s32) + %2:_(s8) = G_SEXT %1(s1) + %3:_(s32) = G_ANYEXT %2(s8) + $w0 = COPY %3(s32) + +... +--- +name: test_anyext_zext +body: | + bb.0: + liveins: $w0 + + ; CHECK-LABEL: name: test_anyext_zext + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; CHECK: $w0 = COPY [[AND]](s32) + %0:_(s32) = COPY $w0 + %1:_(s1) = G_TRUNC %0(s32) + %2:_(s8) = G_ZEXT %1(s1) + %3:_(s32) = G_ANYEXT %2(s8) + $w0 = COPY %3(s32) + +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir b/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir new file mode 100644 index 0000000000000000000000000000000000000000..ecba4f226301ec0e5025e8a43486fbeb40568503 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir @@ -0,0 +1,21 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -o - | FileCheck %s + +--- +name: test_eve_1 +body: | + bb.0: + liveins: $q0 + ; CHECK-LABEL: name: test_eve_1 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[C]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s64>), [[SEXT]](s64) + ; CHECK: $x0 = COPY [[EVEC]](s64) + ; CHECK: RET_ReallyLR + %0:_(<2 x s64>) = COPY $q0 + %1:_(s32) = G_CONSTANT i32 1 + %2:_(s64) = G_EXTRACT_VECTOR_ELT %0(<2 x s64>), %1(s32) + $x0 = COPY %2(s64) + RET_ReallyLR +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir deleted file mode 100644 index 7f42f6e6c336942dbcd1924f889992a5a2d54b1a..0000000000000000000000000000000000000000 --- a/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir +++ /dev/null @@ -1,39 +0,0 @@ -# RUN: llc -march=aarch64 -o - -run-pass=legalizer -global-isel-abort=0 -debug-only=legalizer 2>&1 %s | FileCheck %s -# REQUIRES: asserts - -# CHECK: Legalize Machine IR for: load_v4s32 -# CHECK-NEXT: %{{[0-9]+}}:_(<4 x s32>) = G_LOAD %{{[0-9]+}}:_(p0) -# CHECK-NEXT: Reduce number of elements ---- -name: load_v4s32 -legalized: false -tracksRegLiveness: true -body: | - bb.1: - liveins: $x0 - - %0:_(p0) = COPY $x0 - %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load 16, align 4) - %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<4 x s32>) - $w0 = COPY %5(s32) - -... - -# Make sure we are able to scalarize v2s64. -# CHECK: Legalize Machine IR for: load_v2s64 -# CHECK-NEXT: %{{[0-9]+}}:_(<2 x s64>) = G_LOAD %{{[0-9]+}}:_(p0) -# CHECK-NEXT: Reduce number of elements ---- -name: load_v2s64 -legalized: false -tracksRegLiveness: true -body: | - bb.1: - liveins: $x0 - - %0:_(p0) = COPY $x0 - %1:_(<2 x s64>) = G_LOAD %0(p0) :: (load 16) - %2:_(s64), %3:_(s64) = G_UNMERGE_VALUES %1(<2 x s64>) - $x0 = COPY %3(s64) - -... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-store-fewerElts.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-store-fewerElts.mir new file mode 100644 index 0000000000000000000000000000000000000000..27791881ef3b18cb09010e8e914d6ad2e1776a23 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-store-fewerElts.mir @@ -0,0 +1,54 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=aarch64 -o - -run-pass=legalizer %s | FileCheck %s +--- +name: load_v4s32 +legalized: false +tracksRegLiveness: true +body: | + bb.1: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: load_v4s32 + ; CHECK: liveins: $x0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load 8, align 16) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP]](p0) :: (load 8) + ; CHECK: G_STORE [[LOAD]](<2 x s32>), [[COPY1]](p0) :: (store 8, align 16) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64) + ; CHECK: G_STORE [[LOAD1]](<2 x s32>), [[GEP1]](p0) :: (store 8) + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(<4 x s32>) = G_LOAD %0(p0) :: (load 16) + G_STORE %2(<4 x s32>), %1(p0) :: (store 16) + +... +--- +name: load_v2s64 +legalized: false +tracksRegLiveness: true +body: | + bb.1: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: load_v2s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load 8, align 16) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP]](p0) :: (load 8) + ; CHECK: G_STORE [[LOAD]](s64), [[COPY1]](p0) :: (store 8, align 16) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64) + ; CHECK: G_STORE [[LOAD1]](s64), [[GEP1]](p0) :: (store 8) + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(<2 x s64>) = G_LOAD %0(p0) :: (load 16) + G_STORE %2(<2 x s64>), %1(p0) :: (store 16) + +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir b/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir deleted file mode 100644 index f0a45bf3cf0cb251577f6c8a826bf6643467fbaf..0000000000000000000000000000000000000000 --- a/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir +++ /dev/null @@ -1,33 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s - ---- | - target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" - target triple = "aarch64--" - define void @test_legalize_merge_v3s64() { - ret void - } -... ---- -name: test_legalize_merge_v3s64 -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $w0 - ; CHECK-LABEL: name: test_legalize_merge_v3s64 - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 - ; CHECK: [[MV:%[0-9]+]]:_(<3 x s64>) = G_MERGE_VALUES [[COPY]](s64), [[COPY]](s64), [[COPY]](s64) - ; CHECK: $x0 = COPY [[COPY]](s64) - ; CHECK: $noreg = PATCHABLE_RET [[MV]](<3 x s64>) - %0(s64) = COPY $x0 - %1(<3 x s64>) = G_MERGE_VALUES %0(s64), %0(s64), %0(s64) - %2(s64), %3(s64), %4(s64) = G_UNMERGE_VALUES %1(<3 x s64>) - $x0 = COPY %3(s64) - $noreg = PATCHABLE_RET %1(<3 x s64>) -... diff --git a/test/CodeGen/AArch64/GlobalISel/legalizer-combiner.mir b/test/CodeGen/AArch64/GlobalISel/legalizer-combiner.mir new file mode 100644 index 0000000000000000000000000000000000000000..b07445041450f4df07ef3ddbc403b140e36d121f --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalizer-combiner.mir @@ -0,0 +1,25 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "aarch64--" + define void @test_unmerge() { + entry: + ret void + } +... + +--- +name: test_unmerge +body: | + bb.1: + liveins: $w0 + ; CHECK-LABEL: name: test_unmerge + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: $w0 = COPY [[COPY]](s32) + %0:_(s32) = COPY $w0 + %1:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %0(s32), %0(s32), %0(s32) + %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<4 x s32>) + $w0 = COPY %2(s32) +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index ca059cf15443d26c666dba2c0bd07ec3b8c48b5a..0c75cd3ce3b60ef9686079fa371d88b7a5eeae3c 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -69,6 +69,15 @@ # DEBUG-NEXT: G_MERGE_VALUES (opcode {{[0-9]+}}): 2 type indices # DEBUG: .. type index coverage check SKIPPED: user-defined predicate detected # +# DEBUG-NEXT: G_BUILD_VECTOR (opcode {{[0-9]+}}): 2 type indices +# DEBUG: .. type index coverage check SKIPPED: user-defined predicate detected +# +# DEBUG-NEXT: G_BUILD_VECTOR_TRUNC (opcode {{[0-9]+}}): 2 type indices +# DEBUG: .. type index coverage check SKIPPED: no rules defined +# +# DEBUG-NEXT: G_CONCAT_VECTORS (opcode {{[0-9]+}}): 2 type indices +# DEBUG: .. type index coverage check SKIPPED: no rules defined +# # DEBUG-NEXT: G_PTRTOINT (opcode {{[0-9]+}}): 2 type indices # DEBUG: .. the first uncovered type index: 2, OK # @@ -258,6 +267,9 @@ # DEBUG-NEXT: G_FLOG2 (opcode {{[0-9]+}}): 1 type index # DEBUG: .. type index coverage check SKIPPED: no rules defined # +# DEBUG-NEXT: G_FLOG10 (opcode {{[0-9]+}}): 1 type index +# DEBUG: .. type index coverage check SKIPPED: no rules defined +# # DEBUG-NEXT: G_FNEG (opcode {{[0-9]+}}): 1 type index # DEBUG: .. type index coverage check SKIPPED: no rules defined # diff --git a/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir b/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir index 8a78ed962e38b94ef2fb49808fbca2ffde1d37f6..c41405c5a0dd968b4f6ee1cce39e2b1d7b070015 100644 --- a/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir +++ b/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir @@ -1,4 +1,9 @@ # RUN: llc -O0 -run-pass=aarch64-prelegalizer-combiner -global-isel %s -o - | FileCheck %s +# RUN: llc -O0 -run-pass=aarch64-prelegalizer-combiner -global-isel %s -o - \ +# RUN: -debug-only=aarch64-prelegalizer-combiner,gi-combiner 2>&1 >/dev/null \ +# RUN: | FileCheck %s --check-prefix=CHECK-WORKLIST + +# REQUIRES: asserts --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -102,6 +107,23 @@ body: | ; CHECK: $w1 = COPY [[T6]](s32) $w0 = COPY %3 $w1 = COPY %9 + +# Check that we report the correct modifications to the observer. This acts as +# a test of the debug output and a test. +# +# CHECK-WORKLIST-LABEL: Generic MI Combiner for: multiple_copies +# CHECK-WORKLIST: Try combining [[IN0:%[0-9]+]]:_(s8) = G_LOAD [[IN1:%[0-9]+]]:_(p0) :: (load 1 from %ir.addr) +# CHECK-WORKLIST: Preferred use is: [[IN2:%[0-9]+]]:_(s32) = G_SEXT [[IN0]]:_(s8) +# CHECK-WORKLIST-DAG: Changing: [[IN0]]:_(s8) = G_LOAD [[IN1]]:_(p0) :: (load 1 from %ir.addr) +# CHECK-WORKLIST-DAG: Changing: [[IN3:%[0-9]+]]:_(s8) = G_ADD [[IN0]]:_, [[IN4:%[0-9]+]]:_ +# CHECK-WORKLIST-DAG: Changed: [[IN3]]:_(s8) = G_ADD [[NEW1:%[0-9]+]]:_, [[IN4]]:_ +# CHECK-WORKLIST-DAG: Changing: [[IN5:%[0-9]+]]:_(s8) = G_SUB [[IN0]]:_, [[IN6:%[0-9]+]]:_ +# CHECK-WORKLIST-DAG: Changed: [[IN5]]:_(s8) = G_SUB [[NEW2:%[0-9]+]]:_, [[IN6]]:_ +# CHECK-WORKLIST-DAG: Erased: [[IN2]]:_(s32) = G_SEXT [[IN0]]:_(s8) +# CHECK-WORKLIST-DAG: Changed: [[IN2]]:_(s32) = G_SEXTLOAD [[IN1]]:_(p0) :: (load 1 from %ir.addr) +# CHECK-WORKLIST-DAG: Created: [[NEW1]]:_(s8) = G_TRUNC [[IN2]]:_(s32) +# CHECK-WORKLIST-DAG: Created: [[NEW2]]:_(s8) = G_TRUNC [[IN2]]:_(s32) +# CHECK-WORKLIST: Try combining ... --- @@ -157,37 +179,52 @@ body: | %0:_(p0) = COPY $x0 %1:_(s32) = COPY $w1 %2:_(s8) = G_LOAD %0 :: (load 1 from %ir.addr) - %3:_(s32) = G_SEXT %2 - %4:_(s32) = G_CONSTANT i32 1 - %5:_(s1) = G_ICMP intpred(ne), %1:_(s32), %4:_ - G_BRCOND %5:_(s1), %bb.1 + %3:_(s32) = G_CONSTANT i32 1 + %4:_(s1) = G_ICMP intpred(ne), %1:_(s32), %3:_ + G_BRCOND %4:_(s1), %bb.1 G_BR %bb.2.else bb.1.if: ; CHECK: bb.1.if: successors: %bb.3(0x80000000) - %10:_(s8) = G_CONSTANT i8 1 + %5:_(s8) = G_CONSTANT i8 1 ; CHECK: [[T1:%[0-9]+]]:_(s8) = G_TRUNC [[T0]](s32) - %6:_(s8) = G_ADD %2, %10 + %6:_(s8) = G_ADD %2, %5 ; CHECK: [[T2:%[0-9]+]]:_(s8) = G_ADD [[T1]], {{.*}} G_BR %bb.3.exit bb.2.else: ; CHECK: bb.2.else: successors: %bb.3(0x80000000) - %11:_(s8) = G_CONSTANT i8 1 + %7:_(s8) = G_CONSTANT i8 1 ; CHECK: [[T3:%[0-9]+]]:_(s8) = G_TRUNC [[T0]](s32) - %7:_(s8) = G_SUB %2, %11 + %8:_(s8) = G_SUB %2, %7 ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_SUB [[T3]], {{.*}} G_BR %bb.3.exit bb.3.exit: ; CHECK: bb.3.exit: - %8:_(s8) = G_PHI %6:_(s8), %bb.1, %7:_(s8), %bb.2 + %9:_(s8) = G_PHI %6:_(s8), %bb.1, %8:_(s8), %bb.2 ; CHECK: [[T5:%[0-9]+]]:_(s8) = G_PHI [[T2]](s8), %bb.1, [[T4]](s8) - %9:_(s32) = G_ZEXT %8 + %10:_(s32) = G_SEXT %2 + %11:_(s32) = G_ZEXT %9 ; CHECK: [[T6:%[0-9]+]]:_(s32) = G_ZEXT [[T5]](s8) ; CHECK: $w0 = COPY [[T0]](s32) ; CHECK: $w1 = COPY [[T6]](s32) - $w0 = COPY %3 - $w1 = COPY %9 + $w0 = COPY %10 + $w1 = COPY %11 +# CHECK-WORKLIST-LABEL: Generic MI Combiner for: sink_to_phi_nondominating +# CHECK-WORKLIST: Try combining [[IN0:%[0-9]+]]:_(s8) = G_LOAD [[IN1:%[0-9]+]]:_(p0) :: (load 1 from %ir.addr) +# CHECK-WORKLIST: Preferred use is: [[IN2:%[0-9]+]]:_(s32) = G_SEXT [[IN0]]:_(s8) +# CHECK-WORKLIST-DAG: Changing: [[IN0]]:_(s8) = G_LOAD [[IN1]]:_(p0) :: (load 1 from %ir.addr) +# CHECK-WORKLIST-DAG: Creating: G_TRUNC +# CHECK-WORKLIST-DAG: Changing: [[IN3:%[0-9]+]]:_(s8) = G_ADD [[IN0]]:_, [[IN4:%[0-9]+]]:_ +# CHECK-WORKLIST-DAG: Changed: [[IN3]]:_(s8) = G_ADD [[OUT1:%[0-9]+]]:_, [[IN4]]:_ +# CHECK-WORKLIST-DAG: Creating: G_TRUNC +# CHECK-WORKLIST-DAG: Changing: [[IN5:%[0-9]+]]:_(s8) = G_SUB [[IN0]]:_, [[IN6:%[0-9]+]]:_ +# CHECK-WORKLIST-DAG: Changed: [[IN5]]:_(s8) = G_SUB [[OUT2:%[0-9]+]]:_, [[IN6]]:_ +# CHECK-WORKLIST-DAG: Erased: [[IN2]]:_(s32) = G_SEXT [[IN0]]:_(s8) +# CHECK-WORKLIST-DAG: Changed: [[IN2]]:_(s32) = G_SEXTLOAD [[IN1]]:_(p0) :: (load 1 from %ir.addr) +# CHECK-WORKLIST-DAG: Created: [[OUT1]]:_(s8) = G_TRUNC [[IN2]]:_(s32) +# CHECK-WORKLIST-DAG: Created: [[OUT2]]:_(s8) = G_TRUNC [[IN2]]:_(s32) +# CHECK-WORKLIST: Try combining ... --- diff --git a/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir b/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir new file mode 100644 index 0000000000000000000000000000000000000000..5f3abcc627dde6a29de25345bed46a871fd00b07 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir @@ -0,0 +1,301 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64" + + define <4 x float> @test_f32(float %a, float %b, float %c, float %d) { + ret <4 x float> undef + } + + define <2 x double> @test_f64(double %a, double %b) { + ret <2 x double> undef + } + + define <4 x i32> @test_i32(i32 %a, i32 %b, i32 %c, i32 %d) { + ret <4 x i32> undef + } + + define <2 x i64> @test_i64(i64 %a, i64 %b) { + ret <2 x i64> undef + } + +... +--- +name: test_f32 +alignment: 2 +exposesReturnsTwice: false +legalized: true +regBankSelected: true +selected: false +failedISel: false +tracksRegLiveness: true +registers: + - { id: 0, class: fpr, preferred-register: '' } + - { id: 1, class: fpr, preferred-register: '' } + - { id: 2, class: fpr, preferred-register: '' } + - { id: 3, class: fpr, preferred-register: '' } + - { id: 4, class: fpr, preferred-register: '' } + - { id: 5, class: _, preferred-register: '' } + - { id: 6, class: _, preferred-register: '' } + - { id: 7, class: _, preferred-register: '' } + - { id: 8, class: _, preferred-register: '' } + - { id: 9, class: _, preferred-register: '' } + - { id: 10, class: _, preferred-register: '' } + - { id: 11, class: _, preferred-register: '' } + - { id: 12, class: _, preferred-register: '' } + - { id: 13, class: gpr, preferred-register: '' } + - { id: 14, class: gpr, preferred-register: '' } + - { id: 15, class: gpr, preferred-register: '' } + - { id: 16, class: gpr, preferred-register: '' } +liveins: +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0 (%ir-block.0): + liveins: $s0, $s1, $s2, $s3 + + ; CHECK-LABEL: name: test_f32 + ; CHECK: liveins: $s0, $s1, $s2, $s3 + ; CHECK: [[COPY:%[0-9]+]]:fpr32 = COPY $s0 + ; CHECK: [[COPY1:%[0-9]+]]:fpr32 = COPY $s1 + ; CHECK: [[COPY2:%[0-9]+]]:fpr32 = COPY $s2 + ; CHECK: [[COPY3:%[0-9]+]]:fpr32 = COPY $s3 + ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.ssub + ; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.ssub + ; CHECK: [[INSvi32lane:%[0-9]+]]:fpr128 = INSvi32lane [[INSERT_SUBREG]], 1, [[INSERT_SUBREG1]], 0 + ; CHECK: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK: [[INSERT_SUBREG2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[COPY2]], %subreg.ssub + ; CHECK: [[INSvi32lane1:%[0-9]+]]:fpr128 = INSvi32lane [[INSvi32lane]], 2, [[INSERT_SUBREG2]], 0 + ; CHECK: [[DEF3:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK: [[INSERT_SUBREG3:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF3]], [[COPY3]], %subreg.ssub + ; CHECK: [[INSvi32lane2:%[0-9]+]]:fpr128 = INSvi32lane [[INSvi32lane1]], 3, [[INSERT_SUBREG3]], 0 + ; CHECK: $q0 = COPY [[INSvi32lane2]] + ; CHECK: RET_ReallyLR implicit $q0 + %0:fpr(s32) = COPY $s0 + %1:fpr(s32) = COPY $s1 + %2:fpr(s32) = COPY $s2 + %3:fpr(s32) = COPY $s3 + %4:fpr(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + $q0 = COPY %4(<4 x s32>) + RET_ReallyLR implicit $q0 + +... +--- +name: test_f64 +alignment: 2 +exposesReturnsTwice: false +legalized: true +regBankSelected: true +selected: false +failedISel: false +tracksRegLiveness: true +registers: + - { id: 0, class: fpr, preferred-register: '' } + - { id: 1, class: fpr, preferred-register: '' } + - { id: 2, class: fpr, preferred-register: '' } + - { id: 3, class: fpr, preferred-register: '' } + - { id: 4, class: fpr, preferred-register: '' } + - { id: 5, class: _, preferred-register: '' } + - { id: 6, class: _, preferred-register: '' } + - { id: 7, class: _, preferred-register: '' } + - { id: 8, class: _, preferred-register: '' } + - { id: 9, class: gpr, preferred-register: '' } + - { id: 10, class: gpr, preferred-register: '' } +liveins: +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0 (%ir-block.0): + liveins: $d0, $d1, $d2, $d3 + + ; CHECK-LABEL: name: test_f64 + ; CHECK: liveins: $d0, $d1, $d2, $d3 + ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1 + ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub + ; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.dsub + ; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG]], 1, [[INSERT_SUBREG1]], 0 + ; CHECK: $q0 = COPY [[INSvi64lane]] + ; CHECK: RET_ReallyLR implicit $q0 + %0:fpr(s64) = COPY $d0 + %1:fpr(s64) = COPY $d1 + %4:fpr(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64) + $q0 = COPY %4(<2 x s64>) + RET_ReallyLR implicit $q0 + +... +--- +name: test_i32 +alignment: 2 +exposesReturnsTwice: false +legalized: true +regBankSelected: true +selected: false +failedISel: false +tracksRegLiveness: true +registers: + - { id: 0, class: gpr, preferred-register: '' } + - { id: 1, class: gpr, preferred-register: '' } + - { id: 2, class: gpr, preferred-register: '' } + - { id: 3, class: gpr, preferred-register: '' } + - { id: 4, class: fpr, preferred-register: '' } + - { id: 5, class: _, preferred-register: '' } + - { id: 6, class: _, preferred-register: '' } + - { id: 7, class: _, preferred-register: '' } + - { id: 8, class: _, preferred-register: '' } + - { id: 9, class: _, preferred-register: '' } + - { id: 10, class: _, preferred-register: '' } + - { id: 11, class: _, preferred-register: '' } + - { id: 12, class: _, preferred-register: '' } +liveins: +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0 (%ir-block.0): + liveins: $w0, $w1, $w2, $w3 + + ; CHECK-LABEL: name: test_i32 + ; CHECK: liveins: $w0, $w1, $w2, $w3 + ; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:gpr32 = COPY $w2 + ; CHECK: [[COPY3:%[0-9]+]]:gpr32 = COPY $w3 + ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.ssub + ; CHECK: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, [[COPY1]] + ; CHECK: [[INSvi32gpr1:%[0-9]+]]:fpr128 = INSvi32gpr [[INSvi32gpr]], 2, [[COPY2]] + ; CHECK: [[INSvi32gpr2:%[0-9]+]]:fpr128 = INSvi32gpr [[INSvi32gpr1]], 3, [[COPY3]] + ; CHECK: $q0 = COPY [[INSvi32gpr2]] + ; CHECK: RET_ReallyLR implicit $q0 + %0:gpr(s32) = COPY $w0 + %1:gpr(s32) = COPY $w1 + %2:gpr(s32) = COPY $w2 + %3:gpr(s32) = COPY $w3 + %4:fpr(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + $q0 = COPY %4(<4 x s32>) + RET_ReallyLR implicit $q0 + +... +--- +name: test_i64 +alignment: 2 +exposesReturnsTwice: false +legalized: true +regBankSelected: true +selected: false +failedISel: false +tracksRegLiveness: true +registers: + - { id: 0, class: gpr, preferred-register: '' } + - { id: 1, class: gpr, preferred-register: '' } + - { id: 2, class: gpr, preferred-register: '' } + - { id: 3, class: gpr, preferred-register: '' } + - { id: 4, class: fpr, preferred-register: '' } + - { id: 5, class: _, preferred-register: '' } + - { id: 6, class: _, preferred-register: '' } + - { id: 7, class: _, preferred-register: '' } + - { id: 8, class: _, preferred-register: '' } +liveins: +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0 (%ir-block.0): + liveins: $x0, $x1, $x2, $x3 + + ; CHECK-LABEL: name: test_i64 + ; CHECK: liveins: $x0, $x1, $x2, $x3 + ; CHECK: [[COPY:%[0-9]+]]:gpr64all = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub + ; CHECK: [[INSvi64gpr:%[0-9]+]]:fpr128 = INSvi64gpr [[INSERT_SUBREG]], 1, [[COPY1]] + ; CHECK: $q0 = COPY [[INSvi64gpr]] + ; CHECK: RET_ReallyLR implicit $q0 + %0:gpr(s64) = COPY $x0 + %1:gpr(s64) = COPY $x1 + %4:fpr(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64) + $q0 = COPY %4(<2 x s64>) + RET_ReallyLR implicit $q0 + +... diff --git a/test/CodeGen/AArch64/O0-pipeline.ll b/test/CodeGen/AArch64/O0-pipeline.ll index d85d126883c3260f746a59d3f4d63837d91dd999..6d0aa91272bd058c71155db579e6d9160a3ea4fe 100644 --- a/test/CodeGen/AArch64/O0-pipeline.ll +++ b/test/CodeGen/AArch64/O0-pipeline.ll @@ -50,6 +50,7 @@ ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization ; CHECK-NEXT: Post-RA pseudo instruction expansion pass ; CHECK-NEXT: AArch64 pseudo instruction expansion pass +; CHECK-NEXT: AArch64 speculation hardening pass ; CHECK-NEXT: Analyze Machine Code For Garbage Collection ; CHECK-NEXT: Branch relaxation pass ; CHECK-NEXT: AArch64 Branch Targets diff --git a/test/CodeGen/AArch64/O3-pipeline.ll b/test/CodeGen/AArch64/O3-pipeline.ll index dc2316987d33f1c9efbf9e7b776b321ad6ea789a..98cef01b6a906f5c0a113a4bc423d3b01ffeb536 100644 --- a/test/CodeGen/AArch64/O3-pipeline.ll +++ b/test/CodeGen/AArch64/O3-pipeline.ll @@ -49,6 +49,11 @@ ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Memory SSA +; CHECK-NEXT: Interleaved Load Combine Pass +; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: CodeGen Prepare @@ -141,6 +146,7 @@ ; CHECK-NEXT: Post-RA pseudo instruction expansion pass ; CHECK-NEXT: AArch64 pseudo instruction expansion pass ; CHECK-NEXT: AArch64 load / store optimization pass +; CHECK-NEXT: AArch64 speculation hardening pass ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Falkor HW Prefetch Fix Late Phase @@ -149,6 +155,7 @@ ; CHECK-NEXT: Machine Block Frequency Analysis ; CHECK-NEXT: MachinePostDominator Tree Construction ; CHECK-NEXT: Branch Probability Basic Block Placement +; CHECK-NEXT: AArch64 load / store optimization pass ; CHECK-NEXT: Branch relaxation pass ; CHECK-NEXT: AArch64 Branch Targets ; CHECK-NEXT: AArch64 Compress Jump Tables diff --git a/test/CodeGen/AArch64/aarch64-interleaved-ld-combine.ll b/test/CodeGen/AArch64/aarch64-interleaved-ld-combine.ll new file mode 100644 index 0000000000000000000000000000000000000000..970422164a49dc854b7e7ec57b7cd8a89a31cf52 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-interleaved-ld-combine.ll @@ -0,0 +1,406 @@ +; RUN: llc < %s | FileCheck --check-prefix AS %s +; RUN: opt -S -interleaved-load-combine < %s | FileCheck %s + +; ModuleID = 'aarch64_interleaved-ld-combine.bc' +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "arm64--linux-gnu" + +; This should be lowered into LD4 +define void @aarch64_ilc_const(<4 x float>* %ptr) { +entry: + +;;; Check LLVM transformation +; CHECK-LABEL: @aarch64_ilc_const( +; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2 +; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <16 x float>* +; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, <16 x float>* [[CAST]], align 16 +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK: ret void + +;;; Check if it gets lowerd +; AS-LABEL: aarch64_ilc_const +; AS: ld4 +; AS: ret + + %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2 + %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3 + %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4 + %gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5 + %ld1 = load <4 x float>, <4 x float>* %gep1, align 16 + %ld2 = load <4 x float>, <4 x float>* %gep2, align 16 + %ld3 = load <4 x float>, <4 x float>* %gep3, align 16 + %ld4 = load <4 x float>, <4 x float>* %gep4, align 16 + %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> + %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> + %m0_3 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> + %m4_7 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> + %m8_11 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> + %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> + + store <4 x float> %m0_3, <4 x float>* %gep1, align 16 + store <4 x float> %m4_7, <4 x float>* %gep2, align 16 + store <4 x float> %m8_11, <4 x float>* %gep3, align 16 + store <4 x float> %m12_15, <4 x float>* %gep4, align 16 + ret void +} + +; This should be lowered into LD4 +define void @aarch64_ilc_idx(<4 x float>* %ptr, i64 %idx) { +entry: + +;;; Check LLVM transformation +; CHECK-LABEL: @aarch64_ilc_idx( +; CHECK-DAG: [[ADD:%.+]] = add i64 %idx, 16 +; CHECK-DAG: [[LSHR:%.+]] = lshr i64 [[ADD]], 2 +; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 [[LSHR]] +; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <16 x float>* +; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, <16 x float>* [[CAST]], align 16 +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK: ret void + +; AS-LABEL: aarch64_ilc_idx +; AS-DAG: lsl [[LSL:x[0-9]+]], x1, #2 +; AS-DAG: add [[ADD:x[0-9]+]], [[LSL]], #64 +; AS-DAG: and [[AND:x[0-9]+]], [[ADD]], #0xfffffffffffffff0 +; AS-DAG: add [[ADR:x[0-9]+]], x0, [[AND]] +; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, {{\[}}[[ADR]]{{\]}} +; AS-DAG: str q[[V0]] +; AS-DAG: str q[[V1]] +; AS-DAG: str q[[V2]] +; AS-DAG: str q[[V3]] +; AS: ret + + %a2 = add i64 %idx, 20 + %idx2 = lshr i64 %a2, 2 + %a3 = add i64 %idx, 24 + %a1 = add i64 %idx, 16 + %idx1 = lshr i64 %a1, 2 + %idx3 = lshr i64 %a3, 2 + %a4 = add i64 %idx, 28 + %idx4 = lshr i64 %a4, 2 + + %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx2 + %gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx4 + %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx1 + %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx3 + %ld1 = load <4 x float>, <4 x float>* %gep1, align 16 + %ld2 = load <4 x float>, <4 x float>* %gep2, align 16 + %ld3 = load <4 x float>, <4 x float>* %gep3, align 16 + %ld4 = load <4 x float>, <4 x float>* %gep4, align 16 + %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> + %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> + %m0_3 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> + %m4_7 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> + %m8_11 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> + %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> + + store <4 x float> %m0_3, <4 x float>* %gep1, align 16 + store <4 x float> %m4_7, <4 x float>* %gep2, align 16 + store <4 x float> %m8_11, <4 x float>* %gep3, align 16 + store <4 x float> %m12_15, <4 x float>* %gep4, align 16 + ret void +} + +; This should be lowered into LD4, a offset of has to be taken into account +%struct.ilc = type <{ float, [0 x <4 x float>] }> +define void @aarch64_ilc_struct(%struct.ilc* %ptr, i64 %idx) { +entry: + +;;; Check LLVM transformation +; CHECK-LABEL: @aarch64_ilc_struct( +; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2 +; CHECK-DAG: [[GEP:%.+]] = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 [[LSHR]] +; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <16 x float>* +; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, <16 x float>* [[CAST]], align 4 +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32> +; CHECK: ret void + +; AS-LABEL: aarch64_ilc_struct +; AS-DAG: lsl [[LSL:x[0-9]+]], x1, #2 +; AS-DAG: add [[ADD:x[0-9]+]], x0, #4 +; AS-DAG: and [[AND:x[0-9]+]], [[LSL]], #0xfffffffffffffff0 +; AS-DAG: add [[ADR:x[0-9]+]], [[ADD]], [[AND]] +; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, {{\[}}[[ADR]]{{\]}} +; AS-DAG: str q[[V0]] +; AS-DAG: str q[[V1]] +; AS-DAG: str q[[V2]] +; AS-DAG: str q[[V3]] +; AS: ret + + %a1 = add i64 %idx, 4 + %idx2 = lshr i64 %a1, 2 + %a2 = add i64 %idx, 8 + %idx3 = lshr i64 %a2, 2 + %a3 = add i64 %idx, 12 + %idx4 = lshr i64 %a3, 2 + + %gep2 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx2 + %gep3 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx3 + %gep4 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx4 + %idx1 = lshr i64 %idx, 2 + %gep1 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx1 + %ld1 = load <4 x float>, <4 x float>* %gep1, align 4 + %ld2 = load <4 x float>, <4 x float>* %gep2, align 4 + %ld3 = load <4 x float>, <4 x float>* %gep3, align 4 + %ld4 = load <4 x float>, <4 x float>* %gep4, align 4 + %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> + %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> + %m0_3 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> + %m4_7 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32> + %m8_11 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> + %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32> + + store <4 x float> %m0_3, <4 x float>* %gep1, align 16 + store <4 x float> %m4_7, <4 x float>* %gep2, align 16 + store <4 x float> %m8_11, <4 x float>* %gep3, align 16 + store <4 x float> %m12_15, <4 x float>* %gep4, align 16 + ret void +} + +; This should be lowered into LD2 +define void @aarch64_ilc_idx_ld2(<4 x float>* %ptr, i64 %idx) { +entry: +; CHECK-LABEL: @aarch64_ilc_idx_ld2( +; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2 +; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 [[LSHR]] +; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <8 x float>* +; CHECK-DAG: [[LOAD:%.+]] = load <8 x float>, <8 x float>* [[CAST]], align 16 +; CHECK: %{{.* }}= shufflevector <8 x float> [[LOAD]], <8 x float> undef, <4 x i32> +; CHECK: %{{.* }}= shufflevector <8 x float> [[LOAD]], <8 x float> undef, <4 x i32> +; CHECK-DAG: ret void + +; AS-LABEL: aarch64_ilc_idx_ld2 +; AS: ld2 +; AS: ret + + %idx1 = lshr i64 %idx, 2 + %a1 = add i64 %idx, 4 + %idx2 = lshr i64 %a1, 2 + + %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx1 + %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx2 + %ld1 = load <4 x float>, <4 x float>* %gep1, align 16 + %ld2 = load <4 x float>, <4 x float>* %gep2, align 16 + %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + + store <4 x float> %m0_3, <4 x float>* %gep1 + store <4 x float> %m4_7, <4 x float>* %gep2 + ret void +} + +; This should be lowered into LD3 +define void @aarch64_ilc_idx_ld3(<4 x float>* %ptr, i64 %idx) { +entry: +; CHECK-LABEL: @aarch64_ilc_idx_ld3( +; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2 +; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 [[LSHR]] +; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <12 x float>* +; CHECK-DAG: [[LOAD:%.+]] = load <12 x float>, <12 x float>* [[CAST]], align 16 +; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> undef, <4 x i32> +; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> undef, <4 x i32> +; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> undef, <4 x i32> +; CHECK-DAG: ret void + +; AS-LABEL: aarch64_ilc_idx_ld3 +; AS: ld3 +; AS: ret + + %idx1 = lshr i64 %idx, 2 + %a1 = add i64 %idx, 4 + %idx2 = lshr i64 %a1, 2 + %a2 = add i64 %idx, 8 + %idx3 = lshr i64 %a2, 2 + + %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx1 + %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx2 + %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx3 + %ld1 = load <4 x float>, <4 x float>* %gep1, align 16 + %ld2 = load <4 x float>, <4 x float>* %gep2, align 16 + %ld3 = load <4 x float>, <4 x float>* %gep3, align 16 + + %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %sv3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %m0_3 = shufflevector <4 x float> %sv1, <4 x float> %ld3, <4 x i32> + %m4_7 = shufflevector <4 x float> %sv2, <4 x float> %ld3, <4 x i32> + %m8_11 = shufflevector <4 x float> %sv3, <4 x float> %ld3, <4 x i32> + + store <4 x float> %m0_3, <4 x float>* %gep1, align 16 + store <4 x float> %m4_7, <4 x float>* %gep2, align 16 + store <4 x float> %m8_11, <4 x float>* %gep3, align 16 + ret void +} +; %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32> + +; This must not be lowered +define void @aarch64_ilc_i32_idx(<4 x float>* %ptr, i32 %idx) { +; CHECK-LABEL: @aarch64_ilc_i32_idx( +; CHECK: %idx1 = lshr i32 %idx, 2 +; CHECK-NEXT: %a1 = add i32 %idx, 4 +; CHECK-NEXT: %idx2 = lshr i32 %a1, 2 +; CHECK-NEXT: %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx1 +; CHECK-NEXT: %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx2 +; CHECK-NEXT: %ld1 = load <4 x float>, <4 x float>* %gep1, align 16 +; CHECK-NEXT: %ld2 = load <4 x float>, <4 x float>* %gep2, align 16 +; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> +; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> +; CHECK-NEXT: store <4 x float> %m0_3, <4 x float>* %gep1, align 16 +; CHECK-NEXT: store <4 x float> %m4_7, <4 x float>* %gep2, align 16 +; CHECK-NEXT: ret void + +; AS-LABEL: aarch64_ilc_i32_idx +; AS-DAG: @function +; AS-NOT: ld2 +; AS-NOT: ld3 +; AS-NOT: ld4 +; AS-DAG: ret + +entry: + %idx1 = lshr i32 %idx, 2 + %a1 = add i32 %idx, 4 + %idx2 = lshr i32 %a1, 2 + + %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx1 + %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx2 + %ld1 = load <4 x float>, <4 x float>* %gep1, align 16 + %ld2 = load <4 x float>, <4 x float>* %gep2, align 16 + %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + + store <4 x float> %m0_3, <4 x float>* %gep1, align 16 + store <4 x float> %m4_7, <4 x float>* %gep2, align 16 + ret void +} + +; Volatile loads must not be lowered +define void @aarch64_ilc_volatile(<4 x float>* %ptr) { +; CHECK-LABEL: @aarch64_ilc_volatile( +; CHECK: %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0 +; CHECK-NEXT: %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1 +; CHECK-NEXT: %ld1 = load volatile <4 x float>, <4 x float>* %gep1, align 16 +; CHECK-NEXT: %ld2 = load <4 x float>, <4 x float>* %gep2, align 16 +; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> +; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> +; CHECK-NEXT: store <4 x float> %m0_3, <4 x float>* %gep1, align 16 +; CHECK-NEXT: store <4 x float> %m4_7, <4 x float>* %gep2, align 16 +; CHECK-NEXT: ret void + +; AS-LABEL: aarch64_ilc_volatile +; AS-DAG: @function +; AS-NOT: ld2 +; AS-NOT: ld3 +; AS-NOT: ld4 +; AS-DAG: ret + +entry: + %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0 + %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1 + %ld1 = load volatile <4 x float>, <4 x float>* %gep1, align 16 + %ld2 = load <4 x float>, <4 x float>* %gep2, align 16 + %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + store <4 x float> %m0_3, <4 x float>* %gep1, align 16 + store <4 x float> %m4_7, <4 x float>* %gep2, align 16 + ret void +} + +; This must not be lowered +define void @aarch64_ilc_depmem(<4 x float>* %ptr, i32 %idx) { +entry: +; CHECK-LABEL: @aarch64_ilc_depmem( +; CHECK: %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0 +; CHECK-NEXT: %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1 +; CHECK-NEXT: %ld1 = load <4 x float>, <4 x float>* %gep1, align 16 +; CHECK-NEXT: store <4 x float> %ld1, <4 x float>* %gep2, align 16 +; CHECK-NEXT: %ld2 = load <4 x float>, <4 x float>* %gep2, align 16 +; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> +; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> +; CHECK-NEXT: store <4 x float> %m0_3, <4 x float>* %gep1, align 16 +; CHECK-NEXT: store <4 x float> %m4_7, <4 x float>* %gep2, align 16 +; CHECK-NEXT: ret void + +; AS-LABEL: aarch64_ilc_depmem +; AS-DAG: @function +; AS-NOT: ld2 +; AS-NOT: ld3 +; AS-NOT: ld4 +; AS-DAG: ret + + %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0 + %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1 + %ld1 = load <4 x float>, <4 x float>* %gep1, align 16 + store <4 x float> %ld1, <4 x float>* %gep2, align 16 + %ld2 = load <4 x float>, <4 x float>* %gep2, align 16 + %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32> + + store <4 x float> %m0_3, <4 x float>* %gep1, align 16 + store <4 x float> %m4_7, <4 x float>* %gep2, align 16 + ret void +} + +; This cannot be converted - insertion position cannot be determined +define void @aarch64_no_insertion_pos(float* %ptr) { +entry: +; CHECK-LABEL: @aarch64_no_insertion_pos( +; CHECK: %p0 = getelementptr inbounds float, float* %ptr, i32 0 +; CHECK-NEXT: %p1 = getelementptr inbounds float, float* %ptr, i32 4 +; CHECK-NEXT: %b0 = bitcast float* %p0 to <5 x float>* +; CHECK-NEXT: %b1 = bitcast float* %p1 to <5 x float>* +; CHECK-NEXT: %l0 = load <5 x float>, <5 x float>* %b0 +; CHECK-NEXT: %l1 = load <5 x float>, <5 x float>* %b1 +; CHECK-NEXT: %s0 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> +; CHECK-NEXT: %s1 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> +; CHECK-NEXT: ret void + + %p0 = getelementptr inbounds float, float* %ptr, i32 0 + %p1 = getelementptr inbounds float, float* %ptr, i32 4 + %b0 = bitcast float* %p0 to <5 x float>* + %b1 = bitcast float* %p1 to <5 x float>* + %l0 = load <5 x float>, <5 x float>* %b0 + %l1 = load <5 x float>, <5 x float>* %b1 + %s0 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> + %s1 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32> + ret void +} + +; This cannot be converted - the insertion position does not dominate all +; uses +define void @aarch64_insertpos_does_not_dominate(float* %ptr) { +entry: +; CHECK-LABEL: @aarch64_insertpos_does_not_dominate( +; CHECK: %p0 = getelementptr inbounds float, float* %ptr, i32 0 +; CHECK-NEXT: %p1 = getelementptr inbounds float, float* %ptr, i32 1 +; CHECK-NEXT: %b0 = bitcast float* %p0 to <7 x float>* +; CHECK-NEXT: %b1 = bitcast float* %p1 to <7 x float>* +; CHECK-NEXT: %l1 = load <7 x float>, <7 x float>* %b1 +; CHECK-NEXT: %s1 = shufflevector <7 x float> %l1, <7 x float> undef, <4 x i32> +; CHECK-NEXT: %l0 = load <7 x float>, <7 x float>* %b0 +; CHECK-NEXT: %s0 = shufflevector <7 x float> %l0, <7 x float> undef, <4 x i32> +; CHECK-NEXT: ret void + %p0 = getelementptr inbounds float, float* %ptr, i32 0 + %p1 = getelementptr inbounds float, float* %ptr, i32 1 + %b0 = bitcast float* %p0 to <7 x float>* + %b1 = bitcast float* %p1 to <7 x float>* + %l1 = load <7 x float>, <7 x float>* %b1 + %s1 = shufflevector <7 x float> %l1, <7 x float> undef, <4 x i32> + %l0 = load <7 x float>, <7 x float>* %b0 + %s0 = shufflevector <7 x float> %l0, <7 x float> undef, <4 x i32> + ret void +} diff --git a/test/CodeGen/AArch64/aarch64-smull.ll b/test/CodeGen/AArch64/aarch64-smull.ll index 1c8d13a00b2a2f82cc2c04cfc91293a44f051bb6..8922ae971c2bc84e961a6796fe6149a940e3adea 100644 --- a/test/CodeGen/AArch64/aarch64-smull.ll +++ b/test/CodeGen/AArch64/aarch64-smull.ll @@ -291,15 +291,16 @@ define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { ret <2 x i64> %tmp4 } -define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) { +define i16 @smullWithInconsistentExtensions(<8 x i8> %x, <8 x i8> %y) { ; If one operand has a zero-extend and the other a sign-extend, smull ; cannot be used. ; CHECK-LABEL: smullWithInconsistentExtensions: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - %1 = sext <8 x i8> %vec to <8 x i16> - %2 = mul <8 x i16> %1, - %3 = extractelement <8 x i16> %2, i32 0 - ret i16 %3 + %s = sext <8 x i8> %x to <8 x i16> + %z = zext <8 x i8> %y to <8 x i16> + %m = mul <8 x i16> %s, %z + %r = extractelement <8 x i16> %m, i32 0 + ret i16 %r } define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind { diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll index 92c320a82102f211fef1af29a9412fa1fad4678d..b0a4256552726b203636ce49f78f41a55f8e3b83 100644 --- a/test/CodeGen/AArch64/arm64-abi-varargs.ll +++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -7,14 +7,13 @@ define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, ; CHECK-LABEL: fn9: ; 9th fixed argument ; CHECK: ldr {{w[0-9]+}}, [sp, #64] -; CHECK: add [[ARGS:x[0-9]+]], sp, #72 -; CHECK: add {{x[0-9]+}}, [[ARGS]], #8 +; CHECK-DAG: add [[ARGS:x[0-9]+]], sp, #72 ; First vararg -; CHECK: ldr {{w[0-9]+}}, [sp, #72] +; CHECK-DAG: ldr {{w[0-9]+}}, [sp, #72] ; Second vararg -; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8 +; CHECK-DAG: ldr {{w[0-9]+}}, [sp, #80] ; Third vararg -; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8 +; CHECK-DAG: ldr {{w[0-9]+}}, [sp, #88] %1 = alloca i32, align 4 %2 = alloca i32, align 4 %3 = alloca i32, align 4 diff --git a/test/CodeGen/AArch64/arm64-build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll index d7fe9c6d68bfd086bf1e95a96e9129ff06cd62e6..d268f761c9a5db51749806c6aac2923855217328 100644 --- a/test/CodeGen/AArch64/arm64-build-vector.ll +++ b/test/CodeGen/AArch64/arm64-build-vector.ll @@ -53,3 +53,25 @@ define void @widen_f16_build_vector(half* %addr) { store <2 x half> , <2 x half>* %1, align 2 ret void } + +; Check that a single element vector is constructed with a mov +define <1 x i64> @single_element_vector_i64(<1 x i64> %arg) { +; CHECK-LABEL: single_element_vector_i64 +; CHECK: orr w[[GREG:[0-9]+]], wzr, #0x1 +; CHECK: fmov d[[DREG:[0-9]+]], x[[GREG]] +; CHECK: add d0, d0, d[[DREG]] +; CHECK: ret +entry: + %add = add <1 x i64> %arg, + ret <1 x i64> %add +} + +define <1 x double> @single_element_vector_double(<1 x double> %arg) { +; CHECK-LABEL: single_element_vector_double +; CHECK: fmov d[[DREG:[0-9]+]], #1.00000000 +; CHECK: fadd d0, d0, d[[DREG]] +; CHECK: ret +entry: + %add = fadd <1 x double> %arg, + ret <1 x double> %add +} diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll index b18e638a3a947b59dd1fe20ff332085d425ea8a1..6b497e8f7bfda1ac98b9118ff76f5ab23120b104 100644 --- a/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -526,8 +526,8 @@ define i32 @select_or_olt_one(double %v0, double %v1, double %v2, double %v3, i3 ; CHECK-LABEL: select_or_one_olt: ; CHECK-LABEL: ; %bb.0: ; CHECK-NEXT: fcmp d0, d1 -; CHECK-NEXT: fccmp d0, d1, #1, ne -; CHECK-NEXT: fccmp d2, d3, #8, vs +; CHECK-NEXT: fccmp d0, d1, #8, le +; CHECK-NEXT: fccmp d2, d3, #8, pl ; CHECK-NEXT: csel w0, w0, w1, mi ; CHECK-NEXT: ret define i32 @select_or_one_olt(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 { @@ -556,8 +556,8 @@ define i32 @select_or_olt_ueq(double %v0, double %v1, double %v2, double %v3, i3 ; CHECK-LABEL: select_or_ueq_olt: ; CHECK-LABEL: ; %bb.0: ; CHECK-NEXT: fcmp d0, d1 -; CHECK-NEXT: fccmp d0, d1, #8, le -; CHECK-NEXT: fccmp d2, d3, #8, mi +; CHECK-NEXT: fccmp d0, d1, #1, ne +; CHECK-NEXT: fccmp d2, d3, #8, vc ; CHECK-NEXT: csel w0, w0, w1, mi ; CHECK-NEXT: ret define i32 @select_or_ueq_olt(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 { @@ -656,4 +656,68 @@ define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, ret i32 %sel } +; This testcase resembles the core problem of http://llvm.org/PR39550 +; (an OR operation is 2 levels deep but needs to be implemented first) +; CHECK-LABEL: deep_or +; CHECK: cmp w2, #20 +; CHECK-NEXT: ccmp w2, #15, #4, ne +; CHECK-NEXT: ccmp w1, #0, #4, eq +; CHECK-NEXT: ccmp w0, #0, #4, ne +; CHECK-NEXT: csel w0, w4, w5, ne +; CHECK-NEXT: ret +define i32 @deep_or(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { + %c0 = icmp ne i32 %a0, 0 + %c1 = icmp ne i32 %a1, 0 + %c2 = icmp eq i32 %a2, 15 + %c3 = icmp eq i32 %a2, 20 + + %or = or i1 %c2, %c3 + %and0 = and i1 %or, %c1 + %and1 = and i1 %and0, %c0 + %sel = select i1 %and1, i32 %x, i32 %y + ret i32 %sel +} + +; Variation of deep_or, we still need to implement the OR first though. +; CHECK-LABEL: deep_or1 +; CHECK: cmp w2, #20 +; CHECK-NEXT: ccmp w2, #15, #4, ne +; CHECK-NEXT: ccmp w0, #0, #4, eq +; CHECK-NEXT: ccmp w1, #0, #4, ne +; CHECK-NEXT: csel w0, w4, w5, ne +; CHECK-NEXT: ret +define i32 @deep_or1(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { + %c0 = icmp ne i32 %a0, 0 + %c1 = icmp ne i32 %a1, 0 + %c2 = icmp eq i32 %a2, 15 + %c3 = icmp eq i32 %a2, 20 + + %or = or i1 %c2, %c3 + %and0 = and i1 %c0, %or + %and1 = and i1 %and0, %c1 + %sel = select i1 %and1, i32 %x, i32 %y + ret i32 %sel +} + +; Variation of deep_or, we still need to implement the OR first though. +; CHECK-LABEL: deep_or2 +; CHECK: cmp w2, #20 +; CHECK-NEXT: ccmp w2, #15, #4, ne +; CHECK-NEXT: ccmp w1, #0, #4, eq +; CHECK-NEXT: ccmp w0, #0, #4, ne +; CHECK-NEXT: csel w0, w4, w5, ne +; CHECK-NEXT: ret +define i32 @deep_or2(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { + %c0 = icmp ne i32 %a0, 0 + %c1 = icmp ne i32 %a1, 0 + %c2 = icmp eq i32 %a2, 15 + %c3 = icmp eq i32 %a2, 20 + + %or = or i1 %c2, %c3 + %and0 = and i1 %c0, %c1 + %and1 = and i1 %and0, %or + %sel = select i1 %and1, i32 %x, i32 %y + ret i32 %sel +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/test/CodeGen/AArch64/arm64-convert-v4f64.ll index b9dbfc7745f8d3909c52cd01951dfcd703e7c8e9..ba7bdc4488c71e9afc8ba339d96e8e42fcdcdbf6 100644 --- a/test/CodeGen/AArch64/arm64-convert-v4f64.ll +++ b/test/CodeGen/AArch64/arm64-convert-v4f64.ll @@ -2,30 +2,30 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) { -; CHECK: fptosi_v4f64_to_v4i16 +; CHECK-LABEL: fptosi_v4f64_to_v4i16 ; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v0.2d ; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v1.2d -; CHECK-DAG: xtn v[[MID:[0-9]+]].2s, v[[LHS]].2d -; CHECK-DAG: xtn2 v[[MID]].4s, v[[RHS]].2d -; CHECK: xtn v0.4h, v[[MID]].4s +; CHECK-DAG: xtn v[[XTN0:[0-9]+]].2s, v[[LHS]].2d +; CHECK-DAG: xtn v[[XTN1:[0-9]+]].2s, v[[RHS]].2d +; CHECK: uzp1 v0.4h, v[[XTN1]].4h, v[[XTN0]].4h %tmp1 = load <4 x double>, <4 x double>* %ptr %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16> ret <4 x i16> %tmp2 } define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) { -; CHECK: fptosi_v4f64_to_v4i8 +; CHECK-LABEL: fptosi_v4f64_to_v4i8 ; CHECK-DAG: fcvtzs v[[CONV0:[0-9]+]].2d, v0.2d ; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d ; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d ; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d -; CHECK-DAG: xtn v[[NA2:[0-9]+]].2s, v[[CONV2]].2d -; CHECK-DAG: xtn2 v[[NA2]].4s, v[[CONV3]].2d -; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d -; CHECK-DAG: xtn2 v[[NA0]].4s, v[[CONV1]].2d -; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA2]].4s -; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA0]].4s -; CHECK: xtn v0.8b, v[[TMP1]].8h +; CHECK-DAG: xtn v[[XTN0:[0-9]+]].2s, v[[CONV0]].2d +; CHECK-DAG: xtn v[[XTN1:[0-9]+]].2s, v[[CONV1]].2d +; CHECK-DAG: xtn v[[XTN2:[0-9]+]].2s, v[[CONV2]].2d +; CHECK-DAG: xtn v[[XTN3:[0-9]+]].2s, v[[CONV3]].2d +; CHECK-DAG: uzp1 v[[UZP0:[0-9]+]].4h, v[[XTN1]].4h, v[[XTN0]].4h +; CHECK-DAG: uzp1 v[[UZP1:[0-9]+]].4h, v[[XTN3]].4h, v[[XTN2]].4h +; CHECK: uzp1 v0.8b, v[[UZP1:[0-9]+]].8b, v[[UZP0:[0-9]+]].8b %tmp1 = load <8 x double>, <8 x double>* %ptr %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8> ret <8 x i8> %tmp2 @@ -54,12 +54,12 @@ define <4 x i16> @trunc_v4i64_to_v4i16(<4 x i64>* %ptr) { } define <4 x i16> @fptoui_v4f64_to_v4i16(<4 x double>* %ptr) { -; CHECK: fptoui_v4f64_to_v4i16 -; CHECK-DAG: fcvtzu v[[LHS:[0-9]+]].2d, v0.2d -; CHECK-DAG: fcvtzu v[[RHS:[0-9]+]].2d, v1.2d -; CHECK-DAG: xtn v[[MID:[0-9]+]].2s, v[[LHS]].2d -; CHECK-DAG: xtn2 v[[MID]].4s, v[[RHS]].2d -; CHECK: xtn v0.4h, v[[MID]].4s +; CHECK-LABEL: fptoui_v4f64_to_v4i16 +; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v0.2d +; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v1.2d +; CHECK-DAG: xtn v[[XTN0:[0-9]+]].2s, v[[LHS]].2d +; CHECK-DAG: xtn v[[XTN1:[0-9]+]].2s, v[[RHS]].2d +; CHECK: uzp1 v0.4h, v[[XTN1]].4h, v[[XTN0]].4h %tmp1 = load <4 x double>, <4 x double>* %ptr %tmp2 = fptoui <4 x double> %tmp1 to <4 x i16> ret <4 x i16> %tmp2 diff --git a/test/CodeGen/AArch64/arm64-ld1.ll b/test/CodeGen/AArch64/arm64-ld1.ll index 5f1caa2d67f89a2a7d2de3106972b5d1e175f494..6b119932cb7378136b188d9f2cb99d73606d22c8 100644 --- a/test/CodeGen/AArch64/arm64-ld1.ll +++ b/test/CodeGen/AArch64/arm64-ld1.ll @@ -915,7 +915,9 @@ entry: ; CHECK: ld1r_2s_from_dup ; CHECK: ld1r.2s { [[ARG1:v[0-9]+]] }, [x0] ; CHECK-NEXT: ld1r.2s { [[ARG2:v[0-9]+]] }, [x1] -; CHECK-NEXT: usubl.8h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]] +; CHECK-NEXT: ushll.8h [[ARG1]], [[ARG1]], #0 +; CHECK-NEXT: ushll.8h [[ARG2]], [[ARG2]], #0 +; CHECK-NEXT: sub.4h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]] ; CHECK-NEXT: str d[[RESREGNUM]], [x2] ; CHECK-NEXT: ret %tmp = bitcast i8* %a to i32* diff --git a/test/CodeGen/AArch64/arm64-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-memcpy-inline.ll index 951076c10b843209926dbbaf6a1c4ae85007827b..629cf37926aeb1a8009b4ef5db9dfee9a3ff5ea9 100644 --- a/test/CodeGen/AArch64/arm64-memcpy-inline.ll +++ b/test/CodeGen/AArch64/arm64-memcpy-inline.ll @@ -16,10 +16,8 @@ define i32 @t0() { entry: ; CHECK-LABEL: t0: -; CHECK: ldrb [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #10] -; CHECK: strb [[REG0]], [x[[BASEREG2:[0-9]+]], #10] -; CHECK: ldrh [[REG1:w[0-9]+]], [x[[BASEREG]], #8] -; CHECK: strh [[REG1]], [x[[BASEREG2]], #8] +; CHECK: ldur [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #7] +; CHECK: stur [[REG0]], [x[[BASEREG2:[0-9]+]], #7] ; CHECK: ldr [[REG2:x[0-9]+]], ; CHECK: str [[REG2]], call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 getelementptr inbounds (%struct.x, %struct.x* @dst, i32 0, i32 0), i8* align 8 getelementptr inbounds (%struct.x, %struct.x* @src, i32 0, i32 0), i32 11, i1 false) @@ -74,9 +72,9 @@ entry: define void @t5(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t5: -; CHECK: strb wzr, [x0, #6] -; CHECK: mov [[REG7:w[0-9]+]], #21587 -; CHECK: strh [[REG7]], [x0, #4] +; CHECK: mov [[REG7:w[0-9]+]], #21337 +; CHECK: movk [[REG7]], +; CHECK: stur [[REG7]], [x0, #3] ; CHECK: mov [[REG8:w[0-9]+]], ; CHECK: movk [[REG8]], ; CHECK: str [[REG8]], [x0] diff --git a/test/CodeGen/AArch64/arm64-memset-inline.ll b/test/CodeGen/AArch64/arm64-memset-inline.ll index 7a9f3b2fa97f90e8eb09091e679f3dc63ed575a8..460ccc25a270ab0a2fb16576075642320a0a537f 100644 --- a/test/CodeGen/AArch64/arm64-memset-inline.ll +++ b/test/CodeGen/AArch64/arm64-memset-inline.ll @@ -113,9 +113,9 @@ define void @bzero_20_stack() { define void @bzero_26_stack() { ; CHECK-LABEL: bzero_26_stack: -; CHECK: stp xzr, xzr, [sp] +; CHECK: stp xzr, xzr, [sp, #8] +; CHECK-NEXT: str xzr, [sp] ; CHECK-NEXT: strh wzr, [sp, #24] -; CHECK-NEXT: str xzr, [sp, #16] ; CHECK-NEXT: bl something %buf = alloca [26 x i8], align 1 %cast = bitcast [26 x i8]* %buf to i8* diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll index 0b6132b1be64bd3e229895de38c59e45876410db..0d4d2c7460071ef18ac71142c2c8550ebc0091a7 100644 --- a/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1,58 +1,81 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s - +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) { ; CHECK-LABEL: ins16bw: -; CHECK: mov {{v[0-9]+}}.b[15], {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.b[15], w0 +; CHECK-NEXT: ret %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 15 ret <16 x i8> %tmp3 } define <8 x i16> @ins8hw(<8 x i16> %tmp1, i16 %tmp2) { ; CHECK-LABEL: ins8hw: -; CHECK: mov {{v[0-9]+}}.h[6], {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[6], w0 +; CHECK-NEXT: ret %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 6 ret <8 x i16> %tmp3 } define <4 x i32> @ins4sw(<4 x i32> %tmp1, i32 %tmp2) { ; CHECK-LABEL: ins4sw: -; CHECK: mov {{v[0-9]+}}.s[2], {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.s[2], w0 +; CHECK-NEXT: ret %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 2 ret <4 x i32> %tmp3 } define <2 x i64> @ins2dw(<2 x i64> %tmp1, i64 %tmp2) { ; CHECK-LABEL: ins2dw: -; CHECK: mov {{v[0-9]+}}.d[1], {{x[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[1], x0 +; CHECK-NEXT: ret %tmp3 = insertelement <2 x i64> %tmp1, i64 %tmp2, i32 1 ret <2 x i64> %tmp3 } define <8 x i8> @ins8bw(<8 x i8> %tmp1, i8 %tmp2) { ; CHECK-LABEL: ins8bw: -; CHECK: mov {{v[0-9]+}}.b[5], {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.b[5], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 5 ret <8 x i8> %tmp3 } define <4 x i16> @ins4hw(<4 x i16> %tmp1, i16 %tmp2) { ; CHECK-LABEL: ins4hw: -; CHECK: mov {{v[0-9]+}}.h[3], {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[3], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 3 ret <4 x i16> %tmp3 } define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) { ; CHECK-LABEL: ins2sw: -; CHECK: mov {{v[0-9]+}}.s[1], {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[1], w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1 ret <2 x i32> %tmp3 } define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) { ; CHECK-LABEL: ins16b16: -; CHECK: mov {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2] +; CHECK: // %bb.0: +; CHECK-NEXT: mov v1.b[15], v0.b[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <16 x i8> %tmp1, i32 2 %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15 ret <16 x i8> %tmp4 @@ -60,7 +83,10 @@ define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) { define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) { ; CHECK-LABEL: ins8h8: -; CHECK: mov {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: mov v1.h[7], v0.h[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <8 x i16> %tmp1, i32 2 %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7 ret <8 x i16> %tmp4 @@ -68,7 +94,10 @@ define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) { define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) { ; CHECK-LABEL: ins4s4: -; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] +; CHECK: // %bb.0: +; CHECK-NEXT: mov v1.s[1], v0.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <4 x i32> %tmp1, i32 2 %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1 ret <4 x i32> %tmp4 @@ -76,7 +105,10 @@ define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) { define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) { ; CHECK-LABEL: ins2d2: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: +; CHECK-NEXT: mov v1.d[1], v0.d[0] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <2 x i64> %tmp1, i32 0 %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1 ret <2 x i64> %tmp4 @@ -84,7 +116,10 @@ define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) { define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) { ; CHECK-LABEL: ins4f4: -; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] +; CHECK: // %bb.0: +; CHECK-NEXT: mov v1.s[1], v0.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <4 x float> %tmp1, i32 2 %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1 ret <4 x float> %tmp4 @@ -92,7 +127,10 @@ define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) { define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) { ; CHECK-LABEL: ins2df2: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: +; CHECK-NEXT: mov v1.d[1], v0.d[0] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <2 x double> %tmp1, i32 0 %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 ret <2 x double> %tmp4 @@ -100,7 +138,11 @@ define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) { define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) { ; CHECK-LABEL: ins8b16: -; CHECK: mov {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v1.b[15], v0.b[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <8 x i8> %tmp1, i32 2 %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15 ret <16 x i8> %tmp4 @@ -108,7 +150,11 @@ define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) { define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) { ; CHECK-LABEL: ins4h8: -; CHECK: mov {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v1.h[7], v0.h[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <4 x i16> %tmp1, i32 2 %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7 ret <8 x i16> %tmp4 @@ -116,7 +162,11 @@ define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) { define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) { ; CHECK-LABEL: ins2s4: -; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v1.s[1], v0.s[1] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <2 x i32> %tmp1, i32 1 %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1 ret <4 x i32> %tmp4 @@ -124,7 +174,11 @@ define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) { define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) { ; CHECK-LABEL: ins1d2: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v1.d[1], v0.d[0] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <1 x i64> %tmp1, i32 0 %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1 ret <2 x i64> %tmp4 @@ -132,7 +186,11 @@ define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) { define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) { ; CHECK-LABEL: ins2f4: -; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v1.s[1], v0.s[1] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <2 x float> %tmp1, i32 1 %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1 ret <4 x float> %tmp4 @@ -140,7 +198,10 @@ define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) { define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) { ; CHECK-LABEL: ins1f2: -; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d +; CHECK-NEXT: ret %tmp3 = extractelement <1 x double> %tmp1, i32 0 %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 ret <2 x double> %tmp4 @@ -148,7 +209,11 @@ define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) { define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) { ; CHECK-LABEL: ins16b8: -; CHECK: mov {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v1.b[7], v0.b[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <16 x i8> %tmp1, i32 2 %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7 ret <8 x i8> %tmp4 @@ -156,7 +221,11 @@ define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) { define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) { ; CHECK-LABEL: ins8h4: -; CHECK: mov {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v1.h[3], v0.h[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <8 x i16> %tmp1, i32 2 %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3 ret <4 x i16> %tmp4 @@ -164,7 +233,11 @@ define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) { define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) { ; CHECK-LABEL: ins4s2: -; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v1.s[1], v0.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <4 x i32> %tmp1, i32 2 %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1 ret <2 x i32> %tmp4 @@ -172,7 +245,11 @@ define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) { define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) { ; CHECK-LABEL: ins2d1: -; CHECK: mov {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v1.d[0], v0.d[0] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <2 x i64> %tmp1, i32 0 %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0 ret <1 x i64> %tmp4 @@ -180,7 +257,11 @@ define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) { define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) { ; CHECK-LABEL: ins4f2: -; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v1.s[1], v0.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <4 x float> %tmp1, i32 2 %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1 ret <2 x float> %tmp4 @@ -188,7 +269,10 @@ define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) { define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) { ; CHECK-LABEL: ins2f1: -; CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2d, v0.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret %tmp3 = extractelement <2 x double> %tmp1, i32 1 %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0 ret <1 x double> %tmp4 @@ -196,7 +280,12 @@ define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) { define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) { ; CHECK-LABEL: ins8b8: -; CHECK: mov {{v[0-9]+}}.b[4], {{v[0-9]+}}.b[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v1.b[4], v0.b[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <8 x i8> %tmp1, i32 2 %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4 ret <8 x i8> %tmp4 @@ -204,7 +293,12 @@ define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) { define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) { ; CHECK-LABEL: ins4h4: -; CHECK: mov {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v1.h[3], v0.h[2] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <4 x i16> %tmp1, i32 2 %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3 ret <4 x i16> %tmp4 @@ -212,7 +306,12 @@ define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) { define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) { ; CHECK-LABEL: ins2s2: -; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v1.s[1], v0.s[0] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <2 x i32> %tmp1, i32 0 %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1 ret <2 x i32> %tmp4 @@ -220,7 +319,12 @@ define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) { define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) { ; CHECK-LABEL: ins1d1: -; CHECK: mov {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v1.d[0], v0.d[0] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <1 x i64> %tmp1, i32 0 %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0 ret <1 x i64> %tmp4 @@ -228,7 +332,12 @@ define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) { define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) { ; CHECK-LABEL: ins2f2: -; CHECK: mov {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v1.s[1], v0.s[0] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = extractelement <2 x float> %tmp1, i32 0 %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1 ret <2 x float> %tmp4 @@ -236,7 +345,8 @@ define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) { define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) { ; CHECK-LABEL: ins1df1: -; CHECK-NOT: mov {{v[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: ret %tmp3 = extractelement <1 x double> %tmp1, i32 0 %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0 ret <1 x double> %tmp4 @@ -244,7 +354,9 @@ define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) { define i32 @umovw16b(<16 x i8> %tmp1) { ; CHECK-LABEL: umovw16b: -; CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[8] +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[8] +; CHECK-NEXT: ret %tmp3 = extractelement <16 x i8> %tmp1, i32 8 %tmp4 = zext i8 %tmp3 to i32 ret i32 %tmp4 @@ -252,7 +364,9 @@ define i32 @umovw16b(<16 x i8> %tmp1) { define i32 @umovw8h(<8 x i16> %tmp1) { ; CHECK-LABEL: umovw8h: -; CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[2] +; CHECK-NEXT: ret %tmp3 = extractelement <8 x i16> %tmp1, i32 2 %tmp4 = zext i16 %tmp3 to i32 ret i32 %tmp4 @@ -260,21 +374,28 @@ define i32 @umovw8h(<8 x i16> %tmp1) { define i32 @umovw4s(<4 x i32> %tmp1) { ; CHECK-LABEL: umovw4s: -; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.s[2] +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, v0.s[2] +; CHECK-NEXT: ret %tmp3 = extractelement <4 x i32> %tmp1, i32 2 ret i32 %tmp3 } define i64 @umovx2d(<2 x i64> %tmp1) { ; CHECK-LABEL: umovx2d: -; CHECK: mov {{x[0-9]+}}, {{v[0-9]+}}.d[1] +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, v0.d[1] +; CHECK-NEXT: ret %tmp3 = extractelement <2 x i64> %tmp1, i32 1 ret i64 %tmp3 } define i32 @umovw8b(<8 x i8> %tmp1) { ; CHECK-LABEL: umovw8b: -; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.b[7] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.b[7] +; CHECK-NEXT: ret %tmp3 = extractelement <8 x i8> %tmp1, i32 7 %tmp4 = zext i8 %tmp3 to i32 ret i32 %tmp4 @@ -282,7 +403,10 @@ define i32 @umovw8b(<8 x i8> %tmp1) { define i32 @umovw4h(<4 x i16> %tmp1) { ; CHECK-LABEL: umovw4h: -; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.h[2] +; CHECK-NEXT: ret %tmp3 = extractelement <4 x i16> %tmp1, i32 2 %tmp4 = zext i16 %tmp3 to i32 ret i32 %tmp4 @@ -290,21 +414,30 @@ define i32 @umovw4h(<4 x i16> %tmp1) { define i32 @umovw2s(<2 x i32> %tmp1) { ; CHECK-LABEL: umovw2s: -; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.s[1] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov w0, v0.s[1] +; CHECK-NEXT: ret %tmp3 = extractelement <2 x i32> %tmp1, i32 1 ret i32 %tmp3 } define i64 @umovx1d(<1 x i64> %tmp1) { ; CHECK-LABEL: umovx1d: -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret %tmp3 = extractelement <1 x i64> %tmp1, i32 0 ret i64 %tmp3 } define i32 @smovw16b(<16 x i8> %tmp1) { ; CHECK-LABEL: smovw16b: -; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[8] +; CHECK: // %bb.0: +; CHECK-NEXT: smov w8, v0.b[8] +; CHECK-NEXT: add w0, w8, w8 +; CHECK-NEXT: ret %tmp3 = extractelement <16 x i8> %tmp1, i32 8 %tmp4 = sext i8 %tmp3 to i32 %tmp5 = add i32 %tmp4, %tmp4 @@ -313,7 +446,10 @@ define i32 @smovw16b(<16 x i8> %tmp1) { define i32 @smovw8h(<8 x i16> %tmp1) { ; CHECK-LABEL: smovw8h: -; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: smov w8, v0.h[2] +; CHECK-NEXT: add w0, w8, w8 +; CHECK-NEXT: ret %tmp3 = extractelement <8 x i16> %tmp1, i32 2 %tmp4 = sext i16 %tmp3 to i32 %tmp5 = add i32 %tmp4, %tmp4 @@ -322,7 +458,9 @@ define i32 @smovw8h(<8 x i16> %tmp1) { define i64 @smovx16b(<16 x i8> %tmp1) { ; CHECK-LABEL: smovx16b: -; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[8] +; CHECK: // %bb.0: +; CHECK-NEXT: smov x0, v0.b[8] +; CHECK-NEXT: ret %tmp3 = extractelement <16 x i8> %tmp1, i32 8 %tmp4 = sext i8 %tmp3 to i64 ret i64 %tmp4 @@ -330,7 +468,9 @@ define i64 @smovx16b(<16 x i8> %tmp1) { define i64 @smovx8h(<8 x i16> %tmp1) { ; CHECK-LABEL: smovx8h: -; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: smov x0, v0.h[2] +; CHECK-NEXT: ret %tmp3 = extractelement <8 x i16> %tmp1, i32 2 %tmp4 = sext i16 %tmp3 to i64 ret i64 %tmp4 @@ -338,7 +478,9 @@ define i64 @smovx8h(<8 x i16> %tmp1) { define i64 @smovx4s(<4 x i32> %tmp1) { ; CHECK-LABEL: smovx4s: -; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[2] +; CHECK: // %bb.0: +; CHECK-NEXT: smov x0, v0.s[2] +; CHECK-NEXT: ret %tmp3 = extractelement <4 x i32> %tmp1, i32 2 %tmp4 = sext i32 %tmp3 to i64 ret i64 %tmp4 @@ -346,7 +488,11 @@ define i64 @smovx4s(<4 x i32> %tmp1) { define i32 @smovw8b(<8 x i8> %tmp1) { ; CHECK-LABEL: smovw8b: -; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[4] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.b[4] +; CHECK-NEXT: add w0, w8, w8 +; CHECK-NEXT: ret %tmp3 = extractelement <8 x i8> %tmp1, i32 4 %tmp4 = sext i8 %tmp3 to i32 %tmp5 = add i32 %tmp4, %tmp4 @@ -355,7 +501,11 @@ define i32 @smovw8b(<8 x i8> %tmp1) { define i32 @smovw4h(<4 x i16> %tmp1) { ; CHECK-LABEL: smovw4h: -; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[2] +; CHECK-NEXT: add w0, w8, w8 +; CHECK-NEXT: ret %tmp3 = extractelement <4 x i16> %tmp1, i32 2 %tmp4 = sext i16 %tmp3 to i32 %tmp5 = add i32 %tmp4, %tmp4 @@ -364,7 +514,10 @@ define i32 @smovw4h(<4 x i16> %tmp1) { define i32 @smovx8b(<8 x i8> %tmp1) { ; CHECK-LABEL: smovx8b: -; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.b[6] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w0, v0.b[6] +; CHECK-NEXT: ret %tmp3 = extractelement <8 x i8> %tmp1, i32 6 %tmp4 = sext i8 %tmp3 to i32 ret i32 %tmp4 @@ -372,7 +525,10 @@ define i32 @smovx8b(<8 x i8> %tmp1) { define i32 @smovx4h(<4 x i16> %tmp1) { ; CHECK-LABEL: smovx4h: -; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w0, v0.h[2] +; CHECK-NEXT: ret %tmp3 = extractelement <4 x i16> %tmp1, i32 2 %tmp4 = sext i16 %tmp3 to i32 ret i32 %tmp4 @@ -380,7 +536,10 @@ define i32 @smovx4h(<4 x i16> %tmp1) { define i64 @smovx2s(<2 x i32> %tmp1) { ; CHECK-LABEL: smovx2s: -; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[1] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov x0, v0.s[1] +; CHECK-NEXT: ret %tmp3 = extractelement <2 x i32> %tmp1, i32 1 %tmp4 = sext i32 %tmp3 to i64 ret i64 %tmp4 @@ -388,35 +547,52 @@ define i64 @smovx2s(<2 x i32> %tmp1) { define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) { ; CHECK-LABEL: test_vcopy_lane_s8: -; CHECK: mov {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.b[5], v1.b[3] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> ret <8 x i8> %vset_lane } define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) { ; CHECK-LABEL: test_vcopyq_laneq_s8: -; CHECK: mov {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6] +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.b[14], v1.b[6] +; CHECK-NEXT: ret %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> ret <16 x i8> %vset_lane } define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) { ; CHECK-LABEL: test_vcopy_lane_swap_s8: -; CHECK: mov {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v1.b[7], v0.b[0] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> ret <8 x i8> %vset_lane } define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) { ; CHECK-LABEL: test_vcopyq_laneq_swap_s8: -; CHECK: mov {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15] +; CHECK: // %bb.0: +; CHECK-NEXT: mov v1.b[0], v0.b[15] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> ret <16 x i8> %vset_lane } define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 { ; CHECK-LABEL: test_vdup_n_u8: -; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.8b, w0 +; CHECK-NEXT: ret %vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0 %vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1 %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2 @@ -430,7 +606,9 @@ define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 { define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 { ; CHECK-LABEL: test_vdup_n_u16: -; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4h, w0 +; CHECK-NEXT: ret %vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2 @@ -440,7 +618,9 @@ define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 { define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 { ; CHECK-LABEL: test_vdup_n_u32: -; CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2s, w0 +; CHECK-NEXT: ret %vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1 ret <2 x i32> %vecinit1.i @@ -448,14 +628,18 @@ define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 { define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 { ; CHECK-LABEL: test_vdup_n_u64: -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ret %vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0 ret <1 x i64> %vecinit.i } define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 { ; CHECK-LABEL: test_vdupq_n_u8: -; CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.16b, w0 +; CHECK-NEXT: ret %vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0 %vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1 %vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2 @@ -477,7 +661,9 @@ define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 { define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 { ; CHECK-LABEL: test_vdupq_n_u16: -; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.8h, w0 +; CHECK-NEXT: ret %vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0 %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1 %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2 @@ -491,7 +677,9 @@ define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 { define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 { ; CHECK-LABEL: test_vdupq_n_u32: -; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4s, w0 +; CHECK-NEXT: ret %vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0 %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1 %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2 @@ -501,7 +689,9 @@ define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 { define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 { ; CHECK-LABEL: test_vdupq_n_u64: -; CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2d, x0 +; CHECK-NEXT: ret %vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0 %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1 ret <2 x i64> %vecinit1.i @@ -509,190 +699,252 @@ define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 { define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 { ; CHECK-LABEL: test_vdup_lane_s8: -; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.8b, v0.b[5] +; CHECK-NEXT: ret %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> ret <8 x i8> %shuffle } define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 { ; CHECK-LABEL: test_vdup_lane_s16: -; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[2] +; CHECK-NEXT: ret %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> ret <4 x i16> %shuffle } define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 { ; CHECK-LABEL: test_vdup_lane_s32: -; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2s, v0.s[1] +; CHECK-NEXT: ret %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> ret <2 x i32> %shuffle } define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 { ; CHECK-LABEL: test_vdupq_lane_s8: -; CHECK: {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.16b, v0.b[5] +; CHECK-NEXT: ret %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> ret <16 x i8> %shuffle } define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 { ; CHECK-LABEL: test_vdupq_lane_s16: -; CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.8h, v0.h[2] +; CHECK-NEXT: ret %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> ret <8 x i16> %shuffle } define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 { ; CHECK-LABEL: test_vdupq_lane_s32: -; CHECK: {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4s, v0.s[1] +; CHECK-NEXT: ret %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> ret <4 x i32> %shuffle } define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 { ; CHECK-LABEL: test_vdupq_lane_s64: -; CHECK: {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret %shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %shuffle } define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 { ; CHECK-LABEL: test_vdup_laneq_s8: -; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5] +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.8b, v0.b[5] +; CHECK-NEXT: ret %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> ret <8 x i8> %shuffle } define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 { ; CHECK-LABEL: test_vdup_laneq_s16: -; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4h, v0.h[2] +; CHECK-NEXT: ret %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> ret <4 x i16> %shuffle } define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 { ; CHECK-LABEL: test_vdup_laneq_s32: -; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2s, v0.s[1] +; CHECK-NEXT: ret %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> ret <2 x i32> %shuffle } define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 { ; CHECK-LABEL: test_vdupq_laneq_s8: -; CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5] +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.16b, v0.b[5] +; CHECK-NEXT: ret %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> ret <16 x i8> %shuffle } define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 { ; CHECK-LABEL: test_vdupq_laneq_s16: -; CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2] +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.8h, v0.h[2] +; CHECK-NEXT: ret %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> ret <8 x i16> %shuffle } define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 { ; CHECK-LABEL: test_vdupq_laneq_s32: -; CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4s, v0.s[1] +; CHECK-NEXT: ret %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> ret <4 x i32> %shuffle } define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 { ; CHECK-LABEL: test_vdupq_laneq_s64: -; CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret %shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %shuffle } define i64 @test_bitcastv8i8toi64(<8 x i8> %in) { ; CHECK-LABEL: test_bitcastv8i8toi64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret %res = bitcast <8 x i8> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} ret i64 %res } define i64 @test_bitcastv4i16toi64(<4 x i16> %in) { ; CHECK-LABEL: test_bitcastv4i16toi64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret %res = bitcast <4 x i16> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} ret i64 %res } define i64 @test_bitcastv2i32toi64(<2 x i32> %in) { ; CHECK-LABEL: test_bitcastv2i32toi64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret %res = bitcast <2 x i32> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} ret i64 %res } define i64 @test_bitcastv2f32toi64(<2 x float> %in) { ; CHECK-LABEL: test_bitcastv2f32toi64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret %res = bitcast <2 x float> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} ret i64 %res } define i64 @test_bitcastv1i64toi64(<1 x i64> %in) { ; CHECK-LABEL: test_bitcastv1i64toi64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret %res = bitcast <1 x i64> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} ret i64 %res } define i64 @test_bitcastv1f64toi64(<1 x double> %in) { ; CHECK-LABEL: test_bitcastv1f64toi64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret %res = bitcast <1 x double> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} ret i64 %res } define <8 x i8> @test_bitcasti64tov8i8(i64 %in) { ; CHECK-LABEL: test_bitcasti64tov8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ret %res = bitcast i64 %in to <8 x i8> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} ret <8 x i8> %res } define <4 x i16> @test_bitcasti64tov4i16(i64 %in) { ; CHECK-LABEL: test_bitcasti64tov4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ret %res = bitcast i64 %in to <4 x i16> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} ret <4 x i16> %res } define <2 x i32> @test_bitcasti64tov2i32(i64 %in) { ; CHECK-LABEL: test_bitcasti64tov2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ret %res = bitcast i64 %in to <2 x i32> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} ret <2 x i32> %res } define <2 x float> @test_bitcasti64tov2f32(i64 %in) { ; CHECK-LABEL: test_bitcasti64tov2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ret %res = bitcast i64 %in to <2 x float> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} ret <2 x float> %res } define <1 x i64> @test_bitcasti64tov1i64(i64 %in) { ; CHECK-LABEL: test_bitcasti64tov1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ret %res = bitcast i64 %in to <1 x i64> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} ret <1 x i64> %res } define <1 x double> @test_bitcasti64tov1f64(i64 %in) { ; CHECK-LABEL: test_bitcasti64tov1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ret %res = bitcast i64 %in to <1 x double> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} ret <1 x double> %res } define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 { ; CHECK-LABEL: test_bitcastv8i8tov1f64: -; CHECK: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: neg v0.8b, v0.8b +; CHECK-NEXT: fcvtzs x8, d0 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret %sub.i = sub <8 x i8> zeroinitializer, %a %1 = bitcast <8 x i8> %sub.i to <1 x double> %vcvt.i = fptosi <1 x double> %1 to <1 x i64> @@ -701,8 +953,11 @@ define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 { define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 { ; CHECK-LABEL: test_bitcastv4i16tov1f64: -; CHECK: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -; CHECK-NEXT: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: neg v0.4h, v0.4h +; CHECK-NEXT: fcvtzs x8, d0 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret %sub.i = sub <4 x i16> zeroinitializer, %a %1 = bitcast <4 x i16> %sub.i to <1 x double> %vcvt.i = fptosi <1 x double> %1 to <1 x i64> @@ -711,8 +966,11 @@ define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 { define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 { ; CHECK-LABEL: test_bitcastv2i32tov1f64: -; CHECK: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: neg v0.2s, v0.2s +; CHECK-NEXT: fcvtzs x8, d0 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret %sub.i = sub <2 x i32> zeroinitializer, %a %1 = bitcast <2 x i32> %sub.i to <1 x double> %vcvt.i = fptosi <1 x double> %1 to <1 x i64> @@ -721,8 +979,11 @@ define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 { define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 { ; CHECK-LABEL: test_bitcastv1i64tov1f64: -; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}} -; CHECK-NEXT: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: neg d0, d0 +; CHECK-NEXT: fcvtzs x8, d0 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret %sub.i = sub <1 x i64> zeroinitializer, %a %1 = bitcast <1 x i64> %sub.i to <1 x double> %vcvt.i = fptosi <1 x double> %1 to <1 x i64> @@ -731,8 +992,11 @@ define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 { define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 { ; CHECK-LABEL: test_bitcastv2f32tov1f64: -; CHECK: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: fneg v0.2s, v0.2s +; CHECK-NEXT: fcvtzs x8, d0 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret %sub.i = fsub <2 x float> , %a %1 = bitcast <2 x float> %sub.i to <1 x double> %vcvt.i = fptosi <1 x double> %1 to <1 x i64> @@ -741,8 +1005,12 @@ define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 { define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 { ; CHECK-LABEL: test_bitcastv1f64tov8i8: -; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}} -; CHECK-NEXT: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: scvtf d0, x8 +; CHECK-NEXT: neg v0.8b, v0.8b +; CHECK-NEXT: ret %vcvt.i = sitofp <1 x i64> %a to <1 x double> %1 = bitcast <1 x double> %vcvt.i to <8 x i8> %sub.i = sub <8 x i8> zeroinitializer, %1 @@ -751,8 +1019,12 @@ define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 { define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 { ; CHECK-LABEL: test_bitcastv1f64tov4i16: -; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}} -; CHECK-NEXT: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: scvtf d0, x8 +; CHECK-NEXT: neg v0.4h, v0.4h +; CHECK-NEXT: ret %vcvt.i = sitofp <1 x i64> %a to <1 x double> %1 = bitcast <1 x double> %vcvt.i to <4 x i16> %sub.i = sub <4 x i16> zeroinitializer, %1 @@ -761,8 +1033,12 @@ define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 { define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 { ; CHECK-LABEL: test_bitcastv1f64tov2i32: -; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}} -; CHECK-NEXT: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: scvtf d0, x8 +; CHECK-NEXT: neg v0.2s, v0.2s +; CHECK-NEXT: ret %vcvt.i = sitofp <1 x i64> %a to <1 x double> %1 = bitcast <1 x double> %vcvt.i to <2 x i32> %sub.i = sub <2 x i32> zeroinitializer, %1 @@ -771,8 +1047,12 @@ define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 { define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 { ; CHECK-LABEL: test_bitcastv1f64tov1i64: -; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}} -; CHECK-NEXT: neg {{d[0-9]+}}, {{d[0-9]+}} +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: scvtf d0, x8 +; CHECK-NEXT: neg d0, d0 +; CHECK-NEXT: ret %vcvt.i = sitofp <1 x i64> %a to <1 x double> %1 = bitcast <1 x double> %vcvt.i to <1 x i64> %sub.i = sub <1 x i64> zeroinitializer, %1 @@ -781,8 +1061,12 @@ define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 { define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 { ; CHECK-LABEL: test_bitcastv1f64tov2f32: -; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}} -; CHECK-NEXT: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: scvtf d0, x8 +; CHECK-NEXT: fneg v0.2s, v0.2s +; CHECK-NEXT: ret %vcvt.i = sitofp <1 x i64> %a to <1 x double> %1 = bitcast <1 x double> %vcvt.i to <2 x float> %sub.i = fsub <2 x float> , %1 @@ -882,7 +1166,9 @@ define <4 x i32> @testDUP.v1i32(<1 x i32> %a) { define <8 x i8> @getl(<16 x i8> %x) #0 { ; CHECK-LABEL: getl: -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret %vecext = extractelement <16 x i8> %x, i32 0 %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0 %vecext1 = extractelement <16 x i8> %x, i32 1 @@ -902,15 +1188,22 @@ define <8 x i8> @getl(<16 x i8> %x) #0 { ret <8 x i8> %vecinit14 } -; CHECK-LABEL: test_extracts_inserts_varidx_extract: -; CHECK: str q0 -; CHECK-DAG: and [[MASKED_IDX:x[0-9]+]], x0, #0x7 -; CHECK: bfi [[PTR:x[0-9]+]], [[MASKED_IDX]], #1, #3 -; CHECK-DAG: ldr h[[R:[0-9]+]], {{\[}}[[PTR]]{{\]}} -; CHECK-DAG: mov v[[R]].h[1], v0.h[1] -; CHECK-DAG: mov v[[R]].h[2], v0.h[2] -; CHECK-DAG: mov v[[R]].h[3], v0.h[3] define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) { +; CHECK-LABEL: test_extracts_inserts_varidx_extract: +; CHECK: // %bb.0: +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str q0, [sp, #-16]! +; CHECK-NEXT: and x8, x0, #0x7 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: bfi x9, x8, #1, #3 +; CHECK-NEXT: ldr h1, [x9] +; CHECK-NEXT: mov v1.h[1], v0.h[1] +; CHECK-NEXT: mov v1.h[2], v0.h[2] +; CHECK-NEXT: mov v1.h[3], v0.h[3] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: add sp, sp, #16 // =16 +; CHECK-NEXT: ret %tmp = extractelement <8 x i16> %x, i32 %idx %tmp2 = insertelement <4 x i16> undef, i16 %tmp, i32 0 %tmp3 = extractelement <8 x i16> %x, i32 1 @@ -922,15 +1215,23 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) { ret <4 x i16> %tmp8 } -; CHECK-LABEL: test_extracts_inserts_varidx_insert: -; CHECK: and [[MASKED_IDX:x[0-9]+]], x0, #0x3 -; CHECK: bfi x9, [[MASKED_IDX]], #1, #2 -; CHECK: str h0, [x9] -; CHECK-DAG: ldr d[[R:[0-9]+]] -; CHECK-DAG: mov v[[R]].h[1], v0.h[1] -; CHECK-DAG: mov v[[R]].h[2], v0.h[2] -; CHECK-DAG: mov v[[R]].h[3], v0.h[3] define <4 x i16> @test_extracts_inserts_varidx_insert(<8 x i16> %x, i32 %idx) { +; CHECK-LABEL: test_extracts_inserts_varidx_insert: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 // =16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: and x8, x0, #0x3 +; CHECK-NEXT: add x9, sp, #8 // =8 +; CHECK-NEXT: bfi x9, x8, #1, #2 +; CHECK-NEXT: str h0, [x9] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: mov v1.h[1], v0.h[1] +; CHECK-NEXT: mov v1.h[2], v0.h[2] +; CHECK-NEXT: mov v1.h[3], v0.h[3] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: add sp, sp, #16 // =16 +; CHECK-NEXT: ret %tmp = extractelement <8 x i16> %x, i32 0 %tmp2 = insertelement <4 x i16> undef, i16 %tmp, i32 %idx %tmp3 = extractelement <8 x i16> %x, i32 1 @@ -944,7 +1245,10 @@ define <4 x i16> @test_extracts_inserts_varidx_insert(<8 x i16> %x, i32 %idx) { define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) { ; CHECK-LABEL: test_dup_v2i32_v4i16: -; CHECK: dup v0.4h, v0.h[2] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[2] +; CHECK-NEXT: ret entry: %x = extractelement <2 x i32> %a, i32 1 %vget_lane = trunc i32 %x to i16 @@ -957,7 +1261,9 @@ entry: define <8 x i16> @test_dup_v4i32_v8i16(<4 x i32> %a) { ; CHECK-LABEL: test_dup_v4i32_v8i16: -; CHECK: dup v0.8h, v0.h[6] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.8h, v0.h[6] +; CHECK-NEXT: ret entry: %x = extractelement <4 x i32> %a, i32 3 %vget_lane = trunc i32 %x to i16 @@ -974,7 +1280,10 @@ entry: define <4 x i16> @test_dup_v1i64_v4i16(<1 x i64> %a) { ; CHECK-LABEL: test_dup_v1i64_v4i16: -; CHECK: dup v0.4h, v0.h[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret entry: %x = extractelement <1 x i64> %a, i32 0 %vget_lane = trunc i64 %x to i16 @@ -987,7 +1296,10 @@ entry: define <2 x i32> @test_dup_v1i64_v2i32(<1 x i64> %a) { ; CHECK-LABEL: test_dup_v1i64_v2i32: -; CHECK: dup v0.2s, v0.s[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret entry: %x = extractelement <1 x i64> %a, i32 0 %vget_lane = trunc i64 %x to i32 @@ -998,7 +1310,9 @@ entry: define <8 x i16> @test_dup_v2i64_v8i16(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v8i16: -; CHECK: dup v0.8h, v0.h[4] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.8h, v0.h[4] +; CHECK-NEXT: ret entry: %x = extractelement <2 x i64> %a, i32 1 %vget_lane = trunc i64 %x to i16 @@ -1015,7 +1329,9 @@ entry: define <4 x i32> @test_dup_v2i64_v4i32(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v4i32: -; CHECK: dup v0.4s, v0.s[2] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4s, v0.s[2] +; CHECK-NEXT: ret entry: %x = extractelement <2 x i64> %a, i32 1 %vget_lane = trunc i64 %x to i32 @@ -1028,7 +1344,9 @@ entry: define <4 x i16> @test_dup_v4i32_v4i16(<4 x i32> %a) { ; CHECK-LABEL: test_dup_v4i32_v4i16: -; CHECK: dup v0.4h, v0.h[2] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4h, v0.h[2] +; CHECK-NEXT: ret entry: %x = extractelement <4 x i32> %a, i32 1 %vget_lane = trunc i32 %x to i16 @@ -1041,7 +1359,9 @@ entry: define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v4i16: -; CHECK: dup v0.4h, v0.h[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret entry: %x = extractelement <2 x i64> %a, i32 0 %vget_lane = trunc i64 %x to i16 @@ -1054,7 +1374,9 @@ entry: define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v2i32: -; CHECK: dup v0.2s, v0.s[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret entry: %x = extractelement <2 x i64> %a, i32 0 %vget_lane = trunc i64 %x to i32 @@ -1066,8 +1388,9 @@ entry: define <2 x float> @test_scalar_to_vector_f32_to_v2f32(<2 x float> %a) { ; CHECK-LABEL: test_scalar_to_vector_f32_to_v2f32: -; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmaxp s0, v0.2s +; CHECK-NEXT: ret entry: %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a) %1 = insertelement <1 x float> undef, float %0, i32 0 @@ -1078,8 +1401,9 @@ entry: define <4 x float> @test_scalar_to_vector_f32_to_v4f32(<2 x float> %a) { ; CHECK-LABEL: test_scalar_to_vector_f32_to_v4f32: -; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmaxp s0, v0.2s +; CHECK-NEXT: ret entry: %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a) %1 = insertelement <1 x float> undef, float %0, i32 0 @@ -1092,7 +1416,10 @@ declare float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float>) define <2 x i32> @test_concat_undef_v1i32(<2 x i32> %a) { ; CHECK-LABEL: test_concat_undef_v1i32: -; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret entry: %0 = extractelement <2 x i32> %a, i32 0 %vecinit1.i = insertelement <2 x i32> undef, i32 %0, i32 1 @@ -1103,8 +1430,10 @@ declare i32 @llvm.aarch64.neon.sqabs.i32(i32) #4 define <2 x i32> @test_concat_v1i32_undef(i32 %a) { ; CHECK-LABEL: test_concat_v1i32_undef: -; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: sqabs s0, s0 +; CHECK-NEXT: ret entry: %b = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) %vecinit.i432 = insertelement <2 x i32> undef, i32 %b, i32 0 @@ -1113,7 +1442,10 @@ entry: define <2 x i32> @test_concat_same_v1i32_v1i32(<2 x i32> %a) { ; CHECK-LABEL: test_concat_same_v1i32_v1i32: -; CHECK: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret entry: %0 = extractelement <2 x i32> %a, i32 0 %vecinit.i = insertelement <2 x i32> undef, i32 %0, i32 0 @@ -1123,9 +1455,15 @@ entry: define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) { ; CHECK-LABEL: test_concat_diff_v1i32_v1i32: -; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} -; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} -; CHECK: mov {{v[0-9]+}}.s[1], w{{[0-9]+}} +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: sqabs s1, s1 +; CHECK-NEXT: sqabs s0, s0 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret entry: %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) %d = insertelement <2 x i32> undef, i32 %c, i32 0 @@ -1137,7 +1475,9 @@ entry: define <16 x i8> @test_concat_v16i8_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) #0 { ; CHECK-LABEL: test_concat_v16i8_v16i8_v16i8: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit30 = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> ret <16 x i8> %vecinit30 @@ -1145,7 +1485,10 @@ entry: define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 @@ -1169,7 +1512,10 @@ entry: define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 { ; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecext = extractelement <16 x i8> %x, i32 0 %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 @@ -1208,7 +1554,11 @@ entry: define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 { ; CHECK-LABEL: test_concat_v16i8_v8i8_v8i8: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 @@ -1247,7 +1597,9 @@ entry: define <8 x i16> @test_concat_v8i16_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) #0 { ; CHECK-LABEL: test_concat_v8i16_v8i16_v8i16: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit14 = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> ret <8 x i16> %vecinit14 @@ -1255,7 +1607,10 @@ entry: define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 { ; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0 @@ -1271,7 +1626,10 @@ entry: define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 { ; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecext = extractelement <8 x i16> %x, i32 0 %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0 @@ -1294,7 +1652,11 @@ entry: define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 { ; CHECK-LABEL: test_concat_v8i16_v4i16_v4i16: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0 @@ -1317,7 +1679,9 @@ entry: define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 { ; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> ret <4 x i32> %vecinit6 @@ -1325,7 +1689,10 @@ entry: define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 { ; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecext = extractelement <2 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 @@ -1337,7 +1704,10 @@ entry: define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 { ; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 @@ -1352,7 +1722,11 @@ entry: define <4 x i32> @test_concat_v4i32_v2i32_v2i32(<2 x i32> %x, <2 x i32> %y) #0 { ; CHECK-LABEL: test_concat_v4i32_v2i32_v2i32: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit6 = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> ret <4 x i32> %vecinit6 @@ -1360,7 +1734,9 @@ entry: define <2 x i64> @test_concat_v2i64_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) #0 { ; CHECK-LABEL: test_concat_v2i64_v2i64_v2i64: -; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret entry: %vecinit2 = shufflevector <2 x i64> %x, <2 x i64> %y, <2 x i32> ret <2 x i64> %vecinit2 @@ -1368,7 +1744,10 @@ entry: define <2 x i64> @test_concat_v2i64_v1i64_v2i64(<1 x i64> %x, <2 x i64> %y) #0 { ; CHECK-LABEL: test_concat_v2i64_v1i64_v2i64: -; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret entry: %vecext = extractelement <1 x i64> %x, i32 0 %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 @@ -1378,7 +1757,10 @@ entry: define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 { ; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64: -; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret entry: %vecext = extractelement <2 x i64> %x, i32 0 %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 @@ -1389,7 +1771,11 @@ entry: define <2 x i64> @test_concat_v2i64_v1i64_v1i64(<1 x i64> %x, <1 x i64> %y) #0 { ; CHECK-LABEL: test_concat_v2i64_v1i64_v1i64: -; CHECK: mov {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecext = extractelement <1 x i64> %x, i32 0 %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 @@ -1401,84 +1787,113 @@ entry: define <4 x i16> @concat_vector_v4i16_const() { ; CHECK-LABEL: concat_vector_v4i16_const: -; CHECK: movi {{v[0-9]+}}.2d, #0 +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer ret <4 x i16> %r } define <4 x i16> @concat_vector_v4i16_const_one() { ; CHECK-LABEL: concat_vector_v4i16_const_one: -; CHECK: movi {{v[0-9]+}}.4h, #1 +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.4h, #1 +; CHECK-NEXT: ret %r = shufflevector <1 x i16> , <1 x i16> undef, <4 x i32> zeroinitializer ret <4 x i16> %r } define <4 x i32> @concat_vector_v4i32_const() { ; CHECK-LABEL: concat_vector_v4i32_const: -; CHECK: movi {{v[0-9]+}}.2d, #0 +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret %r = shufflevector <1 x i32> zeroinitializer, <1 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %r } define <8 x i8> @concat_vector_v8i8_const() { ; CHECK-LABEL: concat_vector_v8i8_const: -; CHECK: movi {{v[0-9]+}}.2d, #0 +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer ret <8 x i8> %r } define <8 x i16> @concat_vector_v8i16_const() { ; CHECK-LABEL: concat_vector_v8i16_const: -; CHECK: movi {{v[0-9]+}}.2d, #0 +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <8 x i32> zeroinitializer ret <8 x i16> %r } define <8 x i16> @concat_vector_v8i16_const_one() { ; CHECK-LABEL: concat_vector_v8i16_const_one: -; CHECK: movi {{v[0-9]+}}.8h, #1 +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.8h, #1 +; CHECK-NEXT: ret %r = shufflevector <1 x i16> , <1 x i16> undef, <8 x i32> zeroinitializer ret <8 x i16> %r } define <16 x i8> @concat_vector_v16i8_const() { ; CHECK-LABEL: concat_vector_v16i8_const: -; CHECK: movi {{v[0-9]+}}.2d, #0 +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <16 x i32> zeroinitializer ret <16 x i8> %r } define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) { ; CHECK-LABEL: concat_vector_v4i16: -; CHECK: dup v0.4h, v0.h[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret %r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer ret <4 x i16> %r } define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) { ; CHECK-LABEL: concat_vector_v4i32: -; CHECK: dup v0.4s, v0.s[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret %r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %r } define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) { ; CHECK-LABEL: concat_vector_v8i8: -; CHECK: dup v0.8b, v0.b[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.8b, v0.b[0] +; CHECK-NEXT: ret %r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer ret <8 x i8> %r } define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) { ; CHECK-LABEL: concat_vector_v8i16: -; CHECK: dup v0.8h, v0.h[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: ret %r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer ret <8 x i16> %r } define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) { ; CHECK-LABEL: concat_vector_v16i8: -; CHECK: dup v0.16b, v0.b[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.16b, v0.b[0] +; CHECK-NEXT: ret %r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> zeroinitializer ret <16 x i8> %r } diff --git a/test/CodeGen/AArch64/cluster-frame-index.mir b/test/CodeGen/AArch64/cluster-frame-index.mir new file mode 100644 index 0000000000000000000000000000000000000000..75ccdc4559ba51263bf7c68eb329bd380a4d2428 --- /dev/null +++ b/test/CodeGen/AArch64/cluster-frame-index.mir @@ -0,0 +1,27 @@ +#RUN: llc -mtriple=aarch64-- -mcpu=cyclone -run-pass machine-scheduler -o - %s | FileCheck %s +... +--- +name: merge_stack +# CHECK-LABEL: name: merge_stack +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 8 } +body: | + bb.0: + liveins: $w0, $w1 + + %0:gpr32 = COPY $w0 + %1:gpr32 = COPY $w1 + undef %3.sub_32:gpr64 = ORRWrs $wzr, %0, 0 + STRXui %3, %stack.0, 0 :: (store 8) + undef %5.sub_32:gpr64 = ORRWrs $wzr, %1, 0 + STRXui %5, %stack.0, 1 :: (store 8) + RET_ReallyLR + + ; CHECK: COPY + ; CHECK-NEXT: COPY + ; CHECK-NEXT: ORRWrs + ; CHECK-NEXT: ORRWrs + ; CHECK-NEXT: STRXui + ; CHECK-NEXT: STRXui + ; CHECK-NEXT: RET diff --git a/test/CodeGen/AArch64/cmp-to-cmn.ll b/test/CodeGen/AArch64/cmp-to-cmn.ll new file mode 100644 index 0000000000000000000000000000000000000000..6275da67fbfc45843809edd53097dde19e71290b --- /dev/null +++ b/test/CodeGen/AArch64/cmp-to-cmn.ll @@ -0,0 +1,399 @@ +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "arm64" + +define i1 @test_EQ_IllEbT(i64 %a, i64 %b) { +; CHECK-LABEL: test_EQ_IllEbT +; CHECK: cmn x1, x0 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %add = sub i64 0, %b + %cmp = icmp eq i64 %add, %a + ret i1 %cmp +} + +define i1 @test_EQ_IliEbT(i64 %a, i32 %b) { +; CHECK-LABEL: test_EQ_IliEbT +; CHECK: cmn x0, w1, sxtw +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = sext i32 %b to i64 + %add = sub i64 0, %a + %cmp = icmp eq i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IlsEbT(i64 %a, i16 %b) { +; CHECK-LABEL: test_EQ_IlsEbT +; CHECK: cmn x0, w1, sxth +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = sext i16 %b to i64 + %add = sub i64 0, %a + %cmp = icmp eq i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IlcEbT(i64 %a, i8 %b) { +; CHECK-LABEL: test_EQ_IlcEbT +; CHECK: cmn x0, w1, uxtb +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = zext i8 %b to i64 + %add = sub i64 0, %a + %cmp = icmp eq i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IilEbT(i32 %a, i64 %b) { +; CHECK-LABEL: test_EQ_IilEbT +; CHECK: cmn x1, w0, sxtw +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = sext i32 %a to i64 + %add = sub i64 0, %b + %cmp = icmp eq i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IiiEbT(i32 %a, i32 %b) { +; CHECK-LABEL: test_EQ_IiiEbT +; CHECK: cmn w1, w0 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %add = sub i32 0, %b + %cmp = icmp eq i32 %add, %a + ret i1 %cmp +} + +define i1 @test_EQ_IisEbT(i32 %a, i16 %b) { +; CHECK-LABEL: test_EQ_IisEbT +; CHECK: cmn w0, w1, sxth +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = sext i16 %b to i32 + %add = sub i32 0, %a + %cmp = icmp eq i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IicEbT(i32 %a, i8 %b) { +; CHECK-LABEL: test_EQ_IicEbT +; CHECK: cmn w0, w1, uxtb +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = zext i8 %b to i32 + %add = sub i32 0, %a + %cmp = icmp eq i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IslEbT(i16 %a, i64 %b) { +; CHECK-LABEL: test_EQ_IslEbT +; CHECK: cmn x1, w0, sxth +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = sext i16 %a to i64 + %add = sub i64 0, %b + %cmp = icmp eq i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IsiEbT(i16 %a, i32 %b) { +; CHECK-LABEL: test_EQ_IsiEbT +; CHECK: cmn w1, w0, sxth +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = sext i16 %a to i32 + %add = sub i32 0, %b + %cmp = icmp eq i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IssEbT(i16 %a, i16 %b) { +; CHECK-LABEL: test_EQ_IssEbT +; CHECK: sxth w8, w1 +; CHECK-NEXT: cmn w8, w0, sxth +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT; ret +entry: + %conv = sext i16 %a to i32 + %conv1 = sext i16 %b to i32 + %add = sub nsw i32 0, %conv1 + %cmp = icmp eq i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IscEbT(i16 %a, i8 %b) { +; CHECK-LABEL: test_EQ_IscEbT +; CHECK: and w8, w1, #0xff +; CHECK-NEXT: cmn w8, w0, sxth +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT; ret +entry: + %conv = sext i16 %a to i32 + %conv1 = zext i8 %b to i32 + %add = sub nsw i32 0, %conv1 + %cmp = icmp eq i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IclEbT(i8 %a, i64 %b) { +; CHECK-LABEL: test_EQ_IclEbT +; CHECK: cmn x1, w0, uxtb +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = zext i8 %a to i64 + %add = sub i64 0, %b + %cmp = icmp eq i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IciEbT(i8 %a, i32 %b) { +; CHECK-LABEL: test_EQ_IciEbT +; CHECK: cmn w1, w0, uxtb +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = zext i8 %a to i32 + %add = sub i32 0, %b + %cmp = icmp eq i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IcsEbT(i8 %a, i16 %b) { +; CHECK-LABEL: test_EQ_IcsEbT +; CHECK: sxth w8, w1 +; CHECK-NEXT: cmn w8, w0, uxtb +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = zext i8 %a to i32 + %conv1 = sext i16 %b to i32 + %add = sub nsw i32 0, %conv1 + %cmp = icmp eq i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_EQ_IccEbT(i8 %a, i8 %b) { +; CHECK-LABEL: test_EQ_IccEbT +; CHECK: and w8, w1, #0xff +; CHECK-NEXT: cmn w8, w0, uxtb +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %conv = zext i8 %a to i32 + %conv1 = zext i8 %b to i32 + %add = sub nsw i32 0, %conv1 + %cmp = icmp eq i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IllEbT(i64 %a, i64 %b) { +; CHECK-LABEL: test_NE_IllEbT +; CHECK: cmn x1, x0 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %add = sub i64 0, %b + %cmp = icmp ne i64 %add, %a + ret i1 %cmp +} + +define i1 @test_NE_IliEbT(i64 %a, i32 %b) { +; CHECK-LABEL: test_NE_IliEbT +; CHECK: cmn x0, w1, sxtw +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = sext i32 %b to i64 + %add = sub i64 0, %a + %cmp = icmp ne i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IlsEbT(i64 %a, i16 %b) { +; CHECK-LABEL: test_NE_IlsEbT +; CHECK: cmn x0, w1, sxth +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = sext i16 %b to i64 + %add = sub i64 0, %a + %cmp = icmp ne i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IlcEbT(i64 %a, i8 %b) { +; CHECK-LABEL: test_NE_IlcEbT +; CHECK: cmn x0, w1, uxtb +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = zext i8 %b to i64 + %add = sub i64 0, %a + %cmp = icmp ne i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IilEbT(i32 %a, i64 %b) { +; CHECK-LABEL: test_NE_IilEbT +; CHECK: cmn x1, w0, sxtw +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = sext i32 %a to i64 + %add = sub i64 0, %b + %cmp = icmp ne i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IiiEbT(i32 %a, i32 %b) { +; CHECK-LABEL: test_NE_IiiEbT +; CHECK: cmn w1, w0 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %add = sub i32 0, %b + %cmp = icmp ne i32 %add, %a + ret i1 %cmp +} + +define i1 @test_NE_IisEbT(i32 %a, i16 %b) { +; CHECK-LABEL: test_NE_IisEbT +; CHECK: cmn w0, w1, sxth +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = sext i16 %b to i32 + %add = sub i32 0, %a + %cmp = icmp ne i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IicEbT(i32 %a, i8 %b) { +; CHECK-LABEL: test_NE_IicEbT +; CHECK: cmn w0, w1, uxtb +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = zext i8 %b to i32 + %add = sub i32 0, %a + %cmp = icmp ne i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IslEbT(i16 %a, i64 %b) { +; CHECK-LABEL: test_NE_IslEbT +; CHECK: cmn x1, w0, sxth +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = sext i16 %a to i64 + %add = sub i64 0, %b + %cmp = icmp ne i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IsiEbT(i16 %a, i32 %b) { +; CHECK-LABEL: test_NE_IsiEbT +; CHECK: cmn w1, w0, sxth +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = sext i16 %a to i32 + %add = sub i32 0, %b + %cmp = icmp ne i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IssEbT(i16 %a, i16 %b) { +; CHECK-LABEL:test_NE_IssEbT +; CHECK: sxth w8, w1 +; CHECK-NEXT: cmn w8, w0, sxth +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = sext i16 %a to i32 + %conv1 = sext i16 %b to i32 + %add = sub nsw i32 0, %conv1 + %cmp = icmp ne i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IscEbT(i16 %a, i8 %b) { +; CHECK-LABEL:test_NE_IscEbT +; CHECK: and w8, w1, #0xff +; CHECK-NEXT: cmn w8, w0, sxth +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = sext i16 %a to i32 + %conv1 = zext i8 %b to i32 + %add = sub nsw i32 0, %conv1 + %cmp = icmp ne i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IclEbT(i8 %a, i64 %b) { +; CHECK-LABEL:test_NE_IclEbT +; CHECK: cmn x1, w0, uxtb +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = zext i8 %a to i64 + %add = sub i64 0, %b + %cmp = icmp ne i64 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IciEbT(i8 %a, i32 %b) { +; CHECK-LABEL:test_NE_IciEbT +; CHECK: cmn w1, w0, uxtb +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = zext i8 %a to i32 + %add = sub i32 0, %b + %cmp = icmp ne i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IcsEbT(i8 %a, i16 %b) { +; CHECK-LABEL:test_NE_IcsEbT +; CHECK: sxth w8, w1 +; CHECK-NEXT: cmn w8, w0, uxtb +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = zext i8 %a to i32 + %conv1 = sext i16 %b to i32 + %add = sub nsw i32 0, %conv1 + %cmp = icmp ne i32 %conv, %add + ret i1 %cmp +} + +define i1 @test_NE_IccEbT(i8 %a, i8 %b) { +; CHECK-LABEL:test_NE_IccEbT +; CHECK: and w8, w1, #0xff +; CHECK-NEXT: cmn w8, w0, uxtb +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %conv = zext i8 %a to i32 + %conv1 = zext i8 %b to i32 + %add = sub nsw i32 0, %conv1 + %cmp = icmp ne i32 %conv, %add + ret i1 %cmp +} diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll index 0e21903c2760f223ab26c79d549578fabc1da71e..d4a32bd278caa513c0c74bc91beb41523a7ffb14 100644 --- a/test/CodeGen/AArch64/cpus.ll +++ b/test/CodeGen/AArch64/cpus.ll @@ -17,6 +17,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=saphira 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=kryo 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=thunderx2t99 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=tsv110 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID ; CHECK-NOT: {{.*}} is not a recognized processor for this target diff --git a/test/CodeGen/AArch64/fast-isel-erase.ll b/test/CodeGen/AArch64/fast-isel-erase.ll new file mode 100644 index 0000000000000000000000000000000000000000..e8265bce9fec3b2d6769302d53b0be4c872b7b68 --- /dev/null +++ b/test/CodeGen/AArch64/fast-isel-erase.ll @@ -0,0 +1,25 @@ +; RUN: llc -mtriple=arm64-apple-ios -o - %s -fast-isel=1 -O0 | FileCheck %s + +; The zext can be folded into the load and removed, but doing so can invalidate +; pointers internal to FastISel and cause a crash so it must be done carefully. +define i32 @test() { +; CHECK-LABEL: test: +; CHECK: ldrh +; CHECK: bl _callee +; CHECK-NOT: uxth + +entry: + store i32 undef, i32* undef, align 4 + %t81 = load i16, i16* undef, align 2 + call void @callee() + %t82 = zext i16 %t81 to i32 + %t83 = shl i32 %t82, 16 + %t84 = or i32 undef, %t83 + br label %end + +end: + %val = phi i32 [%t84, %entry] + ret i32 %val +} + +declare void @callee() diff --git a/test/CodeGen/AArch64/funclet-local-stack-size.ll b/test/CodeGen/AArch64/funclet-local-stack-size.ll new file mode 100644 index 0000000000000000000000000000000000000000..3b6ca6a94c19e8e381daea0280db497af96d8799 --- /dev/null +++ b/test/CodeGen/AArch64/funclet-local-stack-size.ll @@ -0,0 +1,53 @@ +; RUN: llc -o - %s -mtriple=aarch64-windows | FileCheck %s +; Check that the local stack size is computed correctly for a funclet contained +; within a varargs function. The varargs component shouldn't be included in the +; local stack size computation. +target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-windows-msvc19.11.0" + +%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] } + +$"??_R0H@8" = comdat any + +@"??_7type_info@@6B@" = external constant i8* +@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat + +; CHECK-LABEL: ?catch$2@?0??func@@YAHHHZZ@4HA +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK: ldp x29, x30, [sp], #16 +; Function Attrs: uwtable +define dso_local i32 @"?func@@YAHHHZZ"(i32 %a, i32, ...) local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +entry: + %arr = alloca [10 x i32], align 4 + %a2 = alloca i32, align 4 + %1 = bitcast [10 x i32]* %arr to i8* + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %arr, i64 0, i64 0 + %call = call i32 @"?init@@YAHPEAH@Z"(i32* nonnull %arraydecay) + %call1 = invoke i32 @"?func2@@YAHXZ"() + to label %cleanup unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %2 = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %3 = catchpad within %2 [%rtti.TypeDescriptor2* @"??_R0H@8", i32 0, i32* %a2] + %4 = load i32, i32* %a2, align 4 + %add = add nsw i32 %4, 1 + catchret from %3 to label %cleanup + +cleanup: ; preds = %entry, %catch + %retval.0 = phi i32 [ %add, %catch ], [ %call1, %entry ] + ret i32 %retval.0 +} + +declare dso_local i32 @"?init@@YAHPEAH@Z"(i32*) + +declare dso_local i32 @"?func2@@YAHXZ"() + +declare dso_local i32 @__CxxFrameHandler3(...) + +attributes #0 = { uwtable } + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"wchar_size", i32 2} diff --git a/test/CodeGen/AArch64/half.ll b/test/CodeGen/AArch64/half.ll index 154d85c9bb610757e9524e598d6effb50716b337..171019080744cf7b33def2640780ad48aab974ce 100644 --- a/test/CodeGen/AArch64/half.ll +++ b/test/CodeGen/AArch64/half.ll @@ -87,9 +87,9 @@ define i16 @test_fccmp(i1 %a) { ;CHECK: fcmp %cmp0 = fcmp ogt half 0xH3333, undef %cmp1 = fcmp ogt half 0xH2222, undef - %x = select i1 %cmp0, i16 0, i16 undef + %x = select i1 %cmp0, i16 0, i16 1 %or = or i1 %cmp1, %cmp0 - %y = select i1 %or, i16 4, i16 undef + %y = select i1 %or, i16 4, i16 1 %r = add i16 %x, %y ret i16 %r } diff --git a/test/CodeGen/AArch64/ldst-opt-after-block-placement.ll b/test/CodeGen/AArch64/ldst-opt-after-block-placement.ll new file mode 100644 index 0000000000000000000000000000000000000000..468f77309d367abdc8c58d427c61c7646c8365cd --- /dev/null +++ b/test/CodeGen/AArch64/ldst-opt-after-block-placement.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -mtriple=aarch64-arm < %s | FileCheck %s + +; Run at O3 to make sure we can optimize load/store instructions after Machine +; Block Placement takes place using Tail Duplication Threshold = 4. + +define void @foo(i1 %cond, i64* %ptr) { +; CHECK-LABEL: foo: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: tbz w0, #0, .LBB0_2 +; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: ldp x9, x8, [x1, #8] +; CHECK-NEXT: str xzr, [x1, #16] +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: b.lt .LBB0_3 +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .LBB0_2: // %if.else +; CHECK-NEXT: ldp x8, x9, [x1] +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: b.ge .LBB0_4 +; CHECK-NEXT: .LBB0_3: // %exit1 +; CHECK-NEXT: str xzr, [x1, #8] +; CHECK-NEXT: .LBB0_4: // %exit2 +; CHECK-NEXT: ret +entry: + br i1 %cond, label %if.then, label %if.else + +if.then: + %0 = getelementptr inbounds i64, i64* %ptr, i64 2 + %1 = load i64, i64* %0, align 8 + store i64 0, i64* %0, align 8 + br label %if.end + +if.else: + %2 = load i64, i64* %ptr, align 8 + br label %if.end + +if.end: + %3 = phi i64 [ %1, %if.then ], [ %2, %if.else ] + %4 = getelementptr inbounds i64, i64* %ptr, i64 1 + %5 = load i64, i64* %4, align 8 + %6 = icmp slt i64 %3, %5 + br i1 %6, label %exit1, label %exit2 + +exit1: + store i64 0, i64* %4, align 8 + ret void + +exit2: + ret void +} diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll index ae3f59ee8f5dc0f2cf7105bc6c16ba135052d031..7f6cba2133f05a69f2c6885e7f211f3f1979d539 100644 --- a/test/CodeGen/AArch64/ldst-opt.ll +++ b/test/CodeGen/AArch64/ldst-opt.ll @@ -1465,10 +1465,10 @@ entry: define void @merge_zr32_3vec(<3 x i32>* %p) { ; CHECK-LABEL: merge_zr32_3vec: ; CHECK: // %entry -; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] ; NOSTRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8] -; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] -; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8] +; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #4] +; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <3 x i32> zeroinitializer, <3 x i32>* %p @@ -1480,8 +1480,8 @@ define void @merge_zr32_4vec(<4 x i32>* %p) { ; CHECK-LABEL: merge_zr32_4vec: ; CHECK: // %entry ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] -; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <4 x i32> zeroinitializer, <4 x i32>* %p @@ -1505,8 +1505,8 @@ define void @merge_zr32_4vecf(<4 x float>* %p) { ; CHECK-LABEL: merge_zr32_4vecf: ; CHECK: // %entry ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] -; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <4 x float> zeroinitializer, <4 x float>* %p @@ -1589,8 +1589,8 @@ entry: define void @merge_zr64_3vec(<3 x i64>* %p) { ; CHECK-LABEL: merge_zr64_3vec: ; CHECK: // %entry -; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}] -; CHECK-NEXT: str xzr, [x{{[0-9]+}}, #16] +; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #8] +; CHECK-NEXT: str xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <3 x i64> zeroinitializer, <3 x i64>* %p diff --git a/test/CodeGen/AArch64/machine-outliner-all-stack.mir b/test/CodeGen/AArch64/machine-outliner-all-stack.mir new file mode 100644 index 0000000000000000000000000000000000000000..83b170a3c9894d030f74919e2a76c71df1b9dac0 --- /dev/null +++ b/test/CodeGen/AArch64/machine-outliner-all-stack.mir @@ -0,0 +1,112 @@ +# RUN: llc -mtriple=aarch64--- -run-pass=machine-outliner \ +# RUN: -verify-machineinstrs %s -o - | FileCheck %s + +# Show that, when instructions that use the stack are present, it's possible +# for us to outline everything as the default outlining type. +# It's possible for reg-save-possible to outline by storing LR to a register, +# but most candidates in this case require us to modify the stack. The outliner +# should see that it's more beneficial to fix up instructions and save LR to +# the stack in this case. + +--- | + define void @reg-save-possible() #0 { ret void } + define void @stack-save1() #0 { ret void } + define void @stack-save2() #0 { ret void } + define void @stack-save3() #0 { ret void } + attributes #0 = { minsize noinline noredzone "no-frame-pointer-elim"="true" } +... +--- + +name: reg-save-possible +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + $lr = ORRXri $xzr, 1 + $x19 = ORRXri $xzr, 1 + $x20 = ORRXri $xzr, 1 + bb.1: + liveins: $lr + ; CHECK-LABEL: name: reg-save-possible + ; CHECK: $sp = STRXpre $lr, $sp, -16 + ; CHECK-NEXT: BL [[FN:@OUTLINED_FUNCTION_[0-9]+]] + ; CHECK-NEXT: $sp, $lr = LDRXpost $sp, 16 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + bb.2: + RET undef $lr + +... +--- + +name: stack-save1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + ; CHECK-LABEL: name: stack-save1 + ; CHECK: $sp = STRXpre $lr, $sp, -16 + ; CHECK-NEXT: BL [[FN]] + ; CHECK-NEXT: $sp, $lr = LDRXpost $sp, 16 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + bb.2: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + RET undef $lr + +... +--- + +name: stack-save2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + ; CHECK-LABEL: name: stack-save2 + ; CHECK: $sp = STRXpre $lr, $sp, -16 + ; CHECK-NEXT: BL [[FN]] + ; CHECK-NEXT: $sp, $lr = LDRXpost $sp, 16 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + bb.2: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + RET undef $lr + +... +--- + +name: stack-save3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + ; CHECK-LABEL: name: stack-save3 + ; CHECK: $sp = STRXpre $lr, $sp, -16 + ; CHECK-NEXT: BL [[FN]] + ; CHECK-NEXT: $sp, $lr = LDRXpost $sp, 16 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + $x20, $x19 = LDPXi $sp, 10 + bb.2: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + RET undef $lr diff --git a/test/CodeGen/AArch64/machine-outliner-compatible-candidates.mir b/test/CodeGen/AArch64/machine-outliner-compatible-candidates.mir new file mode 100644 index 0000000000000000000000000000000000000000..b153b4c5de2771ffd5789f40fe88812225a8ee5b --- /dev/null +++ b/test/CodeGen/AArch64/machine-outliner-compatible-candidates.mir @@ -0,0 +1,103 @@ +# RUN: llc -mtriple=aarch64--- -run-pass=machine-outliner \ +# RUN: -verify-machineinstrs %s -o - | FileCheck %s + +# Ensure that we can outline candidates with compatible call/frame classes. +# +# - Save/restores that don't impact the stack can be outlined together. +# - Save/restores that impact the stack if the outlined sequence doesn't use +# the stack. + +--- | + define void @no-save1() #0 { ret void } + define void @no-save2() #0 { ret void } + define void @reg-save() #0 { ret void } + define void @stack-save() #0 { ret void } + attributes #0 = { minsize noinline noredzone "no-frame-pointer-elim"="true" } +... +--- + +name: no-save1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + $lr = ORRXri $xzr, 1 + bb.1: + ; CHECK-LABEL: name: no-save1 + ; CHECK: BL [[FN:@OUTLINED_FUNCTION_[0-9]+]] + ; CHECK-NOT: STRXpre + ; CHECK-NOT: $lr = + ; CHECK-NOT: ORRXrs + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + bb.2: + RET undef $lr + +... +--- + +name: no-save2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + $lr = ORRXri $xzr, 1 + bb.1: + ; CHECK-LABEL: name: no-save2 + ; CHECK: BL [[FN]] + ; CHECK-NOT: STRXpre + ; CHECK-NOT: $lr = + ; CHECK-NOT: ORRXrs + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + bb.2: + RET undef $lr +... +--- + +name: reg-save +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr + ; CHECK-LABEL: name: reg-save + ; CHECK: $[[REG:x[0-9]+]] = ORRXrs $xzr, $lr, 0 + ; CHECK-NEXT: BL [[FN]] + ; CHECK-NEXT: $lr = ORRXrs $xzr, $[[REG]], 0 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + bb.2: + liveins: $lr + RET undef $lr + +... +--- + +name: stack-save +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + ; CHECK-LABEL: name: stack-save + ; CHECK: $sp = STRXpre $lr, $sp, -16 + ; CHECK-NEXT: BL [[FN]] + ; CHECK-NEXT: $sp, $lr = LDRXpost $sp, 16 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + bb.2: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + RET undef $lr diff --git a/test/CodeGen/AArch64/machine-outliner-drop-stack.mir b/test/CodeGen/AArch64/machine-outliner-drop-stack.mir new file mode 100644 index 0000000000000000000000000000000000000000..eb17dab5c10df1f0a378f5d194bd75d383b29342 --- /dev/null +++ b/test/CodeGen/AArch64/machine-outliner-drop-stack.mir @@ -0,0 +1,99 @@ +# RUN: llc -mtriple=aarch64--- -run-pass=machine-outliner \ +# RUN: -verify-machineinstrs %s -o - | FileCheck %s + +--- | + define void @no-save1() #0 { ret void } + define void @no-save2() #0 { ret void } + define void @reg-save() #0 { ret void } + define void @stack-save() #0 { ret void } + attributes #0 = { minsize noinline noredzone "no-frame-pointer-elim"="true" } +... +--- + +name: no-save1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + $lr = ORRXri $xzr, 1 + bb.1: + ; CHECK-LABEL: name: no-save1 + ; CHECK: BL [[FN:@OUTLINED_FUNCTION_[0-9]+]] + ; CHECK-NOT: STRXpre + ; CHECK-NOT: $lr = + ; CHECK-NOT: ORRXrs + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + bb.2: + RET undef $lr + +... +--- + +name: no-save2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + $lr = ORRXri $xzr, 1 + bb.1: + ; CHECK-LABEL: name: no-save2 + ; CHECK: BL [[FN]] + ; CHECK-NOT: STRXpre + ; CHECK-NOT: $lr = + ; CHECK-NOT: ORRXrs + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + bb.2: + RET undef $lr +... +--- + +name: reg-save +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr + ; CHECK-LABEL: name: reg-save + ; CHECK: $[[REG:x[0-9]+]] = ORRXrs $xzr, $lr, 0 + ; CHECK-NEXT: BL [[FN]] + ; CHECK-NEXT: $lr = ORRXrs $xzr, $[[REG]], 0 + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + bb.2: + liveins: $lr + RET undef $lr + +... +--- + +name: stack-save +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + ; CHECK-LABEL: name: stack-save + ; CHECK-NOT: BL + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + bb.2: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x18, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + RET undef $lr diff --git a/test/CodeGen/AArch64/machine-outliner-ordering.mir b/test/CodeGen/AArch64/machine-outliner-ordering.mir new file mode 100644 index 0000000000000000000000000000000000000000..7395df641f5542dc84e01d8ac3c2135d39800cf4 --- /dev/null +++ b/test/CodeGen/AArch64/machine-outliner-ordering.mir @@ -0,0 +1,106 @@ +# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=machine-outliner \ +# RUN: -verify-machineinstrs %s -o - | FileCheck %s + +# Ensure that outlined function names appear as expected. Currently, they are +# output in order of benefit. + +--- | + define void @should_have_fn2() #0 { ret void } + define void @should_have_fn0() #0 { ret void } + define void @should_have_fn1() #0 { ret void } + attributes #0 = { noredzone optsize minsize } +... +--- + +name: should_have_fn2 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: should_have_fn2 + ; CHECK-NOT: OUTLINED_FUNCTION_1 + ; CHECK-NOT: OUTLINED_FUNCTION_0 + ; CHECK: OUTLINED_FUNCTION_2 + $w0 = ORRWri $wzr, 1 + $w0 = ORRWri $wzr, 1 + $w0 = ORRWri $wzr, 1 + bb.1: + ; CHECK-DAG: OUTLINED_FUNCTION_2 + $w0 = ORRWri $wzr, 1 + $w0 = ORRWri $wzr, 1 + $w0 = ORRWri $wzr, 1 + bb.2: + ; CHECK-DAG: OUTLINED_FUNCTION_2 + $w0 = ORRWri $wzr, 1 + $w0 = ORRWri $wzr, 1 + $w0 = ORRWri $wzr, 1 + bb.3: + ; CHECK-DAG: OUTLINED_FUNCTION_2 + $w0 = ORRWri $wzr, 1 + $w0 = ORRWri $wzr, 1 + $w0 = ORRWri $wzr, 1 + bb.4: + RET undef $lr + +... +--- + +name: should_have_fn0 + +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: should_have_fn0 + ; CHECK-NOT: OUTLINED_FUNCTION_1 + ; CHECK-NOT: OUTLINED_FUNCTION_2 + ; CHECK: OUTLINED_FUNCTION_0 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + bb.1: + ; CHECK-DAG: OUTLINED_FUNCTION_0 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + bb.3: + ; CHECK-DAG: OUTLINED_FUNCTION_0 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + $w1 = ORRWri $wzr, 1 + bb.4: + RET undef $lr + +... +--- + +name: should_have_fn1 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: should_have_fn1 + ; CHECK-NOT: OUTLINED_FUNCTION_0 + ; CHECK-NOT: OUTLINED_FUNCTION_2 + ; CHECK: OUTLINED_FUNCTION_1 + $w2 = ORRWri $wzr, 1 + $w2 = ORRWri $wzr, 1 + $w2 = ORRWri $wzr, 1 + $w2 = ORRWri $wzr, 1 + bb.1: + ; CHECK-DAG: OUTLINED_FUNCTION_1 + $w2 = ORRWri $wzr, 1 + $w2 = ORRWri $wzr, 1 + $w2 = ORRWri $wzr, 1 + $w2 = ORRWri $wzr, 1 + bb.3: + ; CHECK-DAG: OUTLINED_FUNCTION_1 + $w2 = ORRWri $wzr, 1 + $w2 = ORRWri $wzr, 1 + $w2 = ORRWri $wzr, 1 + $w2 = ORRWri $wzr, 1 + bb.4: + RET undef $lr diff --git a/test/CodeGen/AArch64/machine-outliner-regsave.mir b/test/CodeGen/AArch64/machine-outliner-regsave.mir index 6d00bd39cde7063d257ef59cff30493be78d6571..d05cbddbb327164f3a1ab8bbba3ab29b0d179c4b 100644 --- a/test/CodeGen/AArch64/machine-outliner-regsave.mir +++ b/test/CodeGen/AArch64/machine-outliner-regsave.mir @@ -2,8 +2,8 @@ # RUN: -run-pass=machine-outliner -verify-machineinstrs %s -o - | FileCheck %s # Check that we save LR to a callee-saved register when possible. # foo() should use a callee-saved register. However, bar() should not. ---- | - +--- | + define void @foo() #0 { ret void } @@ -17,33 +17,40 @@ --- # Make sure that when we outline and a register is available, we # use it to save + restore LR instead of SP. +# Also make sure that we can call functions that require no save as the same +# outlined function. # CHECK: name: foo -# CHECK-DAG: bb.0 +# CHECK-DAG: bb.1 # CHECK-DAG: $x[[REG:[0-9]+]] = ORRXrs $xzr, $lr, 0 # CHECK-NEXT: BL # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0 -# CHECK-DAG: bb.1 +# CHECK-DAG: bb.2 # CHECK-DAG: $x[[REG]] = ORRXrs $xzr, $lr, 0 # CHECK-NEXT: BL # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0 -# CHECK-DAG: bb.2 +# CHECK-DAG: bb.3 # CHECK-DAG: $x[[REG]] = ORRXrs $xzr, $lr, 0 # CHECK-NEXT: BL # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0 +# CHECK-DAG: bb.4 +# CHECK-NOT: $x[[REG]] = ORRXrs $xzr, $lr, 0 +# CHECK-DAG: BL name: foo tracksRegLiveness: true -fixedStack: +fixedStack: body: | bb.0: - liveins: $lr, $w9 $x25 = ORRXri $xzr, 1 + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr, $w9 $w9 = ORRWri $wzr, 1 $w9 = ORRWri $wzr, 1 $w9 = ORRWri $wzr, 1 $w9 = ORRWri $wzr, 1 $w9 = ORRWri $wzr, 1 $w9 = ORRWri $wzr, 2 - bb.1: + bb.2: liveins: $lr, $w9 $w9 = ORRWri $wzr, 1 $w9 = ORRWri $wzr, 1 @@ -51,7 +58,15 @@ body: | $w9 = ORRWri $wzr, 1 $w9 = ORRWri $wzr, 1 $w9 = ORRWri $wzr, 2 - bb.2: + bb.3: + liveins: $lr, $w9 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 2 + bb.4: liveins: $lr, $w9 $w9 = ORRWri $wzr, 1 $w9 = ORRWri $wzr, 1 @@ -59,6 +74,8 @@ body: | $w9 = ORRWri $wzr, 1 $w9 = ORRWri $wzr, 1 $w9 = ORRWri $wzr, 2 + bb.5: + liveins: $w9 RET undef $lr ... @@ -109,4 +126,3 @@ body: | bb.3: liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x19, $x20, $x21, $x22, $x23, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28 RET undef $lr - diff --git a/test/CodeGen/AArch64/machine-outliner-remarks.ll b/test/CodeGen/AArch64/machine-outliner-remarks.ll index 29872d9518afc3b5257e32e0208d82380d7a3451..19351262b82b16be43bd1c37a12477825b35dc87 100644 --- a/test/CodeGen/AArch64/machine-outliner-remarks.ll +++ b/test/CodeGen/AArch64/machine-outliner-remarks.ll @@ -1,10 +1,10 @@ ; RUN: llc %s -enable-machine-outliner -mtriple=aarch64-unknown-unknown -pass-remarks=machine-outliner -pass-remarks-missed=machine-outliner -o /dev/null 2>&1 | FileCheck %s ; CHECK: :0:0: ; CHECK-SAME: Did not outline 2 instructions from 2 locations. -; CHECK-SAME: Bytes from outlining all occurrences (36) >= +; CHECK-SAME: Bytes from outlining all occurrences (16) >= ; CHECK-SAME: Unoutlined instruction bytes (16) ; CHECK-SAME: (Also found at: ) -; CHECK: remark: :0:0: Saved 20 bytes by outlining 12 instructions +; CHECK: remark: :0:0: Saved 48 bytes by outlining 14 instructions ; CHECK-SAME: from 2 locations. (Found at: , ; CHECK-SAME: ) ; RUN: llc %s -enable-machine-outliner -mtriple=aarch64-unknown-unknown -o /dev/null -pass-remarks-missed=machine-outliner -pass-remarks-output=%t.yaml @@ -16,7 +16,7 @@ ; YAML-NEXT: Pass: machine-outliner ; YAML-NEXT: Name: NotOutliningCheaper ; YAML-NEXT: Function: -; YAML-NEXT: Args: +; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Did not outline ' ; YAML-NEXT: - Length: '2' ; YAML-NEXT: - String: ' instructions' @@ -24,7 +24,7 @@ ; YAML-NEXT: - NumOccurrences: '2' ; YAML-NEXT: - String: ' locations.' ; YAML-NEXT: - String: ' Bytes from outlining all occurrences (' -; YAML-NEXT: - OutliningCost: '36' +; YAML-NEXT: - OutliningCost: '16' ; YAML-NEXT: - String: ')' ; YAML-NEXT: - String: ' >= Unoutlined instruction bytes (' ; YAML-NEXT: - NotOutliningCost: '16' @@ -36,12 +36,12 @@ ; YAML-NEXT: Pass: machine-outliner ; YAML-NEXT: Name: OutlinedFunction ; YAML-NEXT: Function: OUTLINED_FUNCTION_0 -; YAML-NEXT: Args: +; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Saved ' -; YAML-NEXT: - OutliningBenefit: '20' +; YAML-NEXT: - OutliningBenefit: '48' ; YAML-NEXT: - String: ' bytes by ' ; YAML-NEXT: - String: 'outlining ' -; YAML-NEXT: - Length: '12' +; YAML-NEXT: - Length: '14' ; YAML-NEXT: - String: ' instructions ' ; YAML-NEXT: - String: 'from ' ; YAML-NEXT: - NumOccurrences: '2' diff --git a/test/CodeGen/AArch64/machine-outliner-unsafe-stack-call.mir b/test/CodeGen/AArch64/machine-outliner-unsafe-stack-call.mir new file mode 100644 index 0000000000000000000000000000000000000000..330d61eb03d4474e1358aac6e62a518628f0e9d6 --- /dev/null +++ b/test/CodeGen/AArch64/machine-outliner-unsafe-stack-call.mir @@ -0,0 +1,72 @@ +# RUN: llc -mtriple=aarch64--- -run-pass=machine-outliner \ +# RUN: -verify-machineinstrs %s -o - | FileCheck %s + +# Ensure that we never outline calls into sequences where unsafe stack +# instructions are present. + +--- | + define void @foo() #0 { ret void } + define void @bar() #0 { ret void } + define void @baz() #0 { ret void } + define void @f1() #0 { ret void } + define void @f2() #0 { ret void } + attributes #0 = { minsize noinline noredzone "no-frame-pointer-elim"="true" } +... +--- + +name: f1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: f1 + ; CHECK: foo + ; CHECK-DAG: bar + ; CHECK-DAG: baz + $lr = ORRXri $xzr, 1 + BL @foo, implicit-def dead $lr, implicit $sp + $x20, $x19 = LDPXi $sp, 255 + $x20, $x19 = LDPXi $sp, 255 + $x20, $x19 = LDPXi $sp, 255 + $x20, $x19 = LDPXi $sp, 255 + bb.1: + BL @bar, implicit-def dead $lr, implicit $sp + $x11 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x13 = ADDXri $sp, 48, 0; + $x14 = ADDXri $sp, 48, 0; + bb.2: + BL @baz, implicit-def dead $lr, implicit $sp + $x0 = ADDXri $sp, 48, 0; + $x1 = ADDXri $sp, 48, 0; + RET undef $lr + +... +--- + +name: f2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: f2 + ; CHECK: foo + ; CHECK-DAG: bar + ; CHECK-DAG: baz + $lr = ORRXri $xzr, 1 + BL @foo, implicit-def dead $lr, implicit $sp + $x20, $x19 = LDPXi $sp, 255 + $x20, $x19 = LDPXi $sp, 255 + $x20, $x19 = LDPXi $sp, 255 + $x20, $x19 = LDPXi $sp, 255 + bb.1: + BL @bar, implicit-def dead $lr, implicit $sp + $x11 = ADDXri $sp, 48, 0; + $x12 = ADDXri $sp, 48, 0; + $x13 = ADDXri $sp, 48, 0; + $x14 = ADDXri $sp, 48, 0; + bb.2: + BL @baz, implicit-def dead $lr, implicit $sp + $x0 = ADDXri $sp, 48, 0; + $x1 = ADDXri $sp, 48, 0; + RET undef $lr diff --git a/test/CodeGen/AArch64/machine-outliner.ll b/test/CodeGen/AArch64/machine-outliner.ll index 19be14d8d39136628953dd96cb650a19b649b40a..42d0a09f0282ba36fb275252540074bdfb8b5d38 100644 --- a/test/CodeGen/AArch64/machine-outliner.ll +++ b/test/CodeGen/AArch64/machine-outliner.ll @@ -103,6 +103,7 @@ define void @dog() #0 { ; CHECK-NEXT: str w8, [sp, #12] ; CHECK-NEXT: orr w8, wzr, #0x6 ; CHECK-NEXT: str w8, [sp, #8] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret attributes #0 = { noredzone "target-cpu"="cyclone" "target-features"="+sse" } diff --git a/test/CodeGen/AArch64/machine-outliner.mir b/test/CodeGen/AArch64/machine-outliner.mir index bd1abdccd44c21302ca27fa82a8bffb900baa021..625d6a826848cafc28274aa94df25717b145cda1 100644 --- a/test/CodeGen/AArch64/machine-outliner.mir +++ b/test/CodeGen/AArch64/machine-outliner.mir @@ -10,11 +10,11 @@ define i32 @main() #0 { ret i32 0 } - + define void @bar(i32 %a) #0 { ret void } - + attributes #0 = { noinline noredzone "no-frame-pointer-elim"="true" } ... --- @@ -22,79 +22,79 @@ # - Create outlined functions # - Don't outline anything to do with LR or W30 # - Save LR when it's not available -# - Don't outline stack instructions when we might need to save + restore # - Functions whose addresses are taken can still be outlined # -# CHECK-LABEL: name: main - -# CHECK: BL @OUTLINED_FUNCTION_[[F0:[0-9]+]] +# CHECK-LABEL: main +# CHECK-LABEL: bb.1: +# CHECK-DAG: BL @OUTLINED_FUNCTION_[[F0:[0-9]+]] # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG:[0-9]+]], 0 -# CHECK-NEXT: $x16 = ADDXri $sp, 48, 0 -# CHECK-NEXT: STRHHroW $w16, $x9, $w30, 1, 1 +# CHECK-NEXT: STRHHroW $w12, $x9, $w30, 1, 1 # CHECK-NEXT: $lr = ORRXri $xzr, 1 +# CHECK-DAG: bb.2 # CHECK: BL @OUTLINED_FUNCTION_[[F0]] # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0 -# CHECK-NEXT: $x16 = ADDXri $sp, 48, 0 -# CHECK-NEXT: STRHHroW $w16, $x9, $w30, 1, 1 +# CHECK-NEXT: STRHHroW $w12, $x9, $w30, 1, 1 # CHECK-NEXT: $lr = ORRXri $xzr, 1 +# CHECK-DAG: bb.3 # CHECK: BL @OUTLINED_FUNCTION_[[F0]] # CHECK-NEXT: $lr = ORRXrs $xzr, $x[[REG]], 0 -# CHECK-NEXT: $x16 = ADDXri $sp, 48, 0 -# CHECK-NEXT: STRHHroW $w16, $x9, $w30, 1, 1 +# CHECK-NEXT: STRHHroW $w12, $x9, $w30, 1, 1 # CHECK-NEXT: $lr = ORRXri $xzr, 1 name: main tracksRegLiveness: true body: | bb.0: + liveins: $lr $sp = frame-setup SUBXri $sp, 16, 0 renamable $x9 = ADRP target-flags(aarch64-page) @bar $x9 = ORRXri $xzr, 1 - $w16 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 $w30 = ORRWri $wzr, 1 $lr = ORRXri $xzr, 1 - + bb.1: + liveins: $lr $x20, $x19 = LDPXi $sp, 10 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 renamable $x9 = ADRP target-flags(aarch64-page) @x - $x16 = ADDXri $sp, 48, 0; - STRHHroW $w16, $x9, $w30, 1, 1 + $x12 = ADDXri $sp, 48, 0; + STRHHroW $w12, $x9, $w30, 1, 1 $lr = ORRXri $xzr, 1 - $w3 = ORRWri $wzr, 1993 - + bb.2: + liveins: $lr $x20, $x19 = LDPXi $sp, 10 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 renamable $x9 = ADRP target-flags(aarch64-page) @x - $x16 = ADDXri $sp, 48, 0; - STRHHroW $w16, $x9, $w30, 1, 1 - $lr = ORRXri $xzr, 1 - - $w4 = ORRWri $wzr, 1994 - + $x12 = ADDXri $sp, 48, 0; + STRHHroW $w12, $x9, $w30, 1, 1 + $lr = ORRXri $xzr, 1 + bb.3: + liveins: $lr $x20, $x19 = LDPXi $sp, 10 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 - $w16 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 renamable $x9 = ADRP target-flags(aarch64-page) @x - $x16 = ADDXri $sp, 48, 0; - STRHHroW $w16, $x9, $w30, 1, 1 + $x12 = ADDXri $sp, 48, 0; + STRHHroW $w12, $x9, $w30, 1, 1 $lr = ORRXri $xzr, 1 - $sp = ADDXri $sp, 16, 0 + bb.4: + liveins: $lr RET undef $lr ... @@ -104,10 +104,10 @@ body: | # CHECK-LABEL: bb.1: # CHECK-NOT: BL @baz, implicit-def dead $lr, implicit $sp # CHECK: BL @OUTLINED_FUNCTION_[[F1:[0-9]+]], implicit-def $lr, implicit $sp -# CHECK-NEXT: $w17 = ORRWri $wzr, 2 +# CHECK-NEXT: $w11 = ORRWri $wzr, 2 # CHECK-NEXT: BL @OUTLINED_FUNCTION_[[F1]], implicit-def $lr, implicit $sp # CHECK-NEXT: $w8 = ORRWri $wzr, 0 -# CHECK-NOT: $w17 = KILL renamable $w17, implicit killed $w17 +# CHECK-NOT: $w11 = KILL renamable $w11, implicit killed $w11 name: bar tracksRegLiveness: true body: | @@ -118,25 +118,25 @@ body: | bb.1: BL @baz, implicit-def dead $lr, implicit $sp - $w17 = ORRWri $wzr, 1 - $w17 = ORRWri $wzr, 1 - $w17 = KILL renamable $w17, implicit killed $w17 - $w17 = ORRWri $wzr, 1 - $w17 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = KILL renamable $w11, implicit killed $w11 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 BL @baz, implicit-def dead $lr, implicit $sp - $w17 = ORRWri $wzr, 1 - $w17 = ORRWri $wzr, 1 - $w17 = ORRWri $wzr, 2 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 2 BL @baz, implicit-def dead $lr, implicit $sp - $w17 = ORRWri $wzr, 1 - $w17 = ORRWri $wzr, 1 - $w17 = ORRWri $wzr, 1 - $w17 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 BL @baz, implicit-def dead $lr, implicit $sp - $w17 = ORRWri $wzr, 1 - $w17 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 + $w11 = ORRWri $wzr, 1 $w8 = ORRWri $wzr, 0 - + bb.2: $w15 = ORRWri $wzr, 1 $w15 = ORRWri $wzr, 1 @@ -150,7 +150,7 @@ body: | $w15 = ORRWri $wzr, 1 $x15 = ADDXri $sp, 48, 0; $w8 = ORRWri $wzr, 0 - + bb.3: $fp, $lr = LDPXi $sp, 2 $sp = ADDXri $sp, 32, 0 diff --git a/test/CodeGen/AArch64/pr40091.ll b/test/CodeGen/AArch64/pr40091.ll new file mode 100644 index 0000000000000000000000000000000000000000..8cf51f4beb5f44a87cfecda9360103abe108a20c --- /dev/null +++ b/test/CodeGen/AArch64/pr40091.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-arm-none-eabi | FileCheck %s + +define i64 @test(i64 %aa) { +; CHECK-LABEL: test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0xffffffffffffffff +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %a = bitcast i64 %aa to <1 x i64> + %k = icmp sgt <1 x i64> %a, zeroinitializer + %l = zext <1 x i1> %k to <1 x i64> + %o = and <1 x i64> %l, %a + %p = xor <1 x i64> %l, + %q = and <1 x i64> %p, + %r = or <1 x i64> %q, %o + %s = bitcast <1 x i64> %r to <8 x i8> + %t = shufflevector <8 x i8> %s, <8 x i8> %s, <8 x i32> + %u = bitcast <8 x i8> %t to i64 + ret i64 %u +} diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll index ed1d415067e82e2fb012b4b0bcb2af8eccfe04a5..52ebe876014491566da199e907f4750603adca85 100644 --- a/test/CodeGen/AArch64/remat.ll +++ b/test/CodeGen/AArch64/remat.ll @@ -13,6 +13,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=saphira -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=kryo -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=thunderx2t99 -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=tsv110 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mattr=+custom-cheap-as-move -o - %s | FileCheck %s %X = type { i64, i64, i64 } diff --git a/test/CodeGen/AArch64/select_cc.ll b/test/CodeGen/AArch64/select_cc.ll new file mode 100644 index 0000000000000000000000000000000000000000..785e6b800ed74acc160efd04b554a40f674a6131 --- /dev/null +++ b/test/CodeGen/AArch64/select_cc.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64 | FileCheck %s + +define i64 @select_ogt_float(float %a, float %b) { +; CHECK-LABEL: select_ogt_float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: lsl x0, x8, #2 +; CHECK-NEXT: ret +entry: + %cc = fcmp ogt float %a, %b + %sel = select i1 %cc, i64 4, i64 0 + ret i64 %sel +} + +define i64 @select_ule_float_inverse(float %a, float %b) { +; CHECK-LABEL: select_ule_float_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: lsl x0, x8, #2 +; CHECK-NEXT: ret +entry: + %cc = fcmp ule float %a, %b + %sel = select i1 %cc, i64 0, i64 4 + ret i64 %sel +} + +define i64 @select_eq_i32(i32 %a, i32 %b) { +; CHECK-LABEL: select_eq_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: lsl x0, x8, #2 +; CHECK-NEXT: ret +entry: + %cc = icmp eq i32 %a, %b + %sel = select i1 %cc, i64 4, i64 0 + ret i64 %sel +} + +define i64 @select_ne_i32_inverse(i32 %a, i32 %b) { +; CHECK-LABEL: select_ne_i32_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: lsl x0, x8, #2 +; CHECK-NEXT: ret +entry: + %cc = icmp ne i32 %a, %b + %sel = select i1 %cc, i64 0, i64 4 + ret i64 %sel +} diff --git a/test/CodeGen/AArch64/shadow-call-stack.ll b/test/CodeGen/AArch64/shadow-call-stack.ll index dbd44fd3cd17dfbbf50e7780edbe9b65a6b33aa1..060d81b7fc218d5b00525fabcf9dce55b17d33cf 100644 --- a/test/CodeGen/AArch64/shadow-call-stack.ll +++ b/test/CodeGen/AArch64/shadow-call-stack.ll @@ -22,6 +22,7 @@ declare i32 @bar() define i32 @f3() shadowcallstack { ; CHECK: f3: ; CHECK: str x30, [x18], #8 + ; CHECK: .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 ; CHECK: str x30, [sp, #-16]! %res = call i32 @bar() %res1 = add i32 %res, 1 @@ -45,3 +46,11 @@ define i32 @f4() shadowcallstack { ; CHECK: ret ret i32 %res1234 } + +define i32 @f5() shadowcallstack nounwind { + ; CHECK: f5: + ; CHECK-NOT: .cfi_escape + %res = call i32 @bar() + %res1 = add i32 %res, 1 + ret i32 %res +} diff --git a/test/CodeGen/AArch64/sign-return-address.ll b/test/CodeGen/AArch64/sign-return-address.ll index c057c815acfd4b6ae5c7ef0d0e755a1a993d9c61..dfd52f87fdeb18f20aa16201ecd7939cfac6dc27 100644 --- a/test/CodeGen/AArch64/sign-return-address.ll +++ b/test/CodeGen/AArch64/sign-return-address.ll @@ -24,17 +24,17 @@ define i32 @leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { ; CHECK-LABEL: @leaf_sign_all ; CHECK: paciasp ; CHECK: autiasp -; CHECK-NEXT: ret +; CHECK: ret define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" { ret i32 %x } ; CHECK: @leaf_clobbers_lr ; CHECK: paciasp -; CHECK-NEXT: str x30, [sp, #-16]! +; CHECK: str x30, [sp, #-16]! ; CHECK: ldr x30, [sp], #16 ; CHECK-NEXT: autiasp -; CHECK-NEXT: ret +; CHECK: ret define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf" { call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1 ret i64 %x @@ -45,7 +45,7 @@ declare i32 @foo(i32) ; CHECK: @non_leaf_sign_all ; CHECK: paciasp ; CHECK: autiasp -; CHECK-NEXT: ret +; CHECK: ret define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" { %call = call i32 @foo(i32 %x) ret i32 %call @@ -53,10 +53,10 @@ define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" { ; CHECK: @non_leaf_sign_non_leaf ; CHECK: paciasp -; CHECK-NEXT: str x30, [sp, #-16]! +; CHECK: str x30, [sp, #-16]! ; CHECK: ldr x30, [sp], #16 -; CHECK-NEXT: autiasp -; CHECK-NEXT: ret +; CHECK: autiasp +; CHECK: ret define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { %call = call i32 @foo(i32 %x) ret i32 %call @@ -65,7 +65,7 @@ define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { ; CHECK-LABEL: @leaf_sign_all_v83 ; CHECK: paciasp ; CHECK-NOT: ret -; CHECK-NEXT: retaa +; CHECK: retaa ; CHECK-NOT: ret define i32 @leaf_sign_all_v83(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" { ret i32 %x @@ -75,10 +75,10 @@ declare fastcc i64 @bar(i64) ; CHECK-LABEL: @spill_lr_and_tail_call ; CHECK: paciasp -; CHECK-NEXT: str x30, [sp, #-16]! +; CHECK: str x30, [sp, #-16]! ; CHECK: ldr x30, [sp], #16 -; CHECK-NEXT: autiasp -; CHECK-NEXT: b bar +; CHECK: autiasp +; CHECK: b bar define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" { call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1 tail call fastcc i64 @bar(i64 %x) diff --git a/test/CodeGen/AArch64/speculation-hardening-dagisel.ll b/test/CodeGen/AArch64/speculation-hardening-dagisel.ll new file mode 100644 index 0000000000000000000000000000000000000000..4d13d98441e0d1a76d543e274a644ba21d7021a5 --- /dev/null +++ b/test/CodeGen/AArch64/speculation-hardening-dagisel.ll @@ -0,0 +1,71 @@ +; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure +; RUN: sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure + +declare i64 @g(i64, i64) local_unnamed_addr +define i64 @f_using_reserved_reg_x16(i64 %a, i64 %b) local_unnamed_addr SLHATTR { +; CHECK-LABEL: f_using_reserved_reg_x16 +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +entry: + %cmp = icmp ugt i64 %a, %b + br i1 %cmp, label %if.then, label %cleanup + +; CHECK: b.ls +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +if.then: + %0 = tail call i64 asm "autia1716", "={x17},{x16},0"(i64 %b, i64 %a) +; CHECK: bl g +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +; CHECK: ret + %call = tail call i64 @g(i64 %a, i64 %b) #3 + %add = add i64 %call, %0 + br label %cleanup + +cleanup: +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +; SLH: ret + %retval.0 = phi i64 [ %add, %if.then ], [ %b, %entry ] + ret i64 %retval.0 +} + +define i32 @f_clobbered_reg_w16(i32 %a, i32 %b) local_unnamed_addr SLHATTR { +; CHECK-LABEL: f_clobbered_reg_w16 +entry: +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb + %cmp = icmp sgt i32 %a, %b + br i1 %cmp, label %if.then, label %if.end +; CHECK: b.le + +if.then: +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +; CHECK: mov w16, w0 + tail call void asm sideeffect "mov w16, ${0:w}", "r,~{w16}"(i32 %a) + br label %if.end +; SLH: ret + +if.end: + %add = add nsw i32 %b, %a + ret i32 %add +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +; SLH: ret +} diff --git a/test/CodeGen/AArch64/speculation-hardening.ll b/test/CodeGen/AArch64/speculation-hardening.ll new file mode 100644 index 0000000000000000000000000000000000000000..3535b63c32cc8f11bd11845c7b68840e7ab3fcc0 --- /dev/null +++ b/test/CodeGen/AArch64/speculation-hardening.ll @@ -0,0 +1,156 @@ +; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure +; RUN: sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure +; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure +; RUN sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure +; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -fast-isel | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure +; RUN: sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -fast-isel | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure + +define i32 @f(i8* nocapture readonly %p, i32 %i, i32 %N) local_unnamed_addr SLHATTR { +; CHECK-LABEL: f +entry: +; SLH: cmp sp, #0 +; SLH: csetm x16, ne +; NOSLH-NOT: cmp sp, #0 +; NOSLH-NOT: csetm x16, ne + +; SLH: mov x17, sp +; SLH: and x17, x17, x16 +; SLH: mov sp, x17 +; NOSLH-NOT: mov x17, sp +; NOSLH-NOT: and x17, x17, x16 +; NOSLH-NOT: mov sp, x17 + %call = tail call i32 @tail_callee(i32 %i) +; SLH: cmp sp, #0 +; SLH: csetm x16, ne +; NOSLH-NOT: cmp sp, #0 +; NOSLH-NOT: csetm x16, ne + %cmp = icmp slt i32 %call, %N + br i1 %cmp, label %if.then, label %return +; GlobalISel lowers the branch to a b.ne sometimes instead of b.ge as expected.. +; CHECK: b.[[COND:(ge)|(lt)|(ne)]] + +if.then: ; preds = %entry +; NOSLH-NOT: csel x16, x16, xzr, {{(lt)|(ge)|(eq)}} +; SLH-DAG: csel x16, x16, xzr, {{(lt)|(ge)|(eq)}} + %idxprom = sext i32 %i to i64 + %arrayidx = getelementptr inbounds i8, i8* %p, i64 %idxprom + %0 = load i8, i8* %arrayidx, align 1 +; CHECK-DAG: ldrb [[LOADED:w[0-9]+]], + %conv = zext i8 %0 to i32 + br label %return + +; SLH-DAG: csel x16, x16, xzr, [[COND]] +; NOSLH-NOT: csel x16, x16, xzr, [[COND]] +return: ; preds = %entry, %if.then + %retval.0 = phi i32 [ %conv, %if.then ], [ 0, %entry ] +; SLH: mov x17, sp +; SLH: and x17, x17, x16 +; SLH: mov sp, x17 +; NOSLH-NOT: mov x17, sp +; NOSLH-NOT: and x17, x17, x16 +; NOSLH-NOT: mov sp, x17 + ret i32 %retval.0 +} + +; Make sure that for a tail call, taint doesn't get put into SP twice. +define i32 @tail_caller(i32 %a) local_unnamed_addr SLHATTR { +; CHECK-LABEL: tail_caller: +; SLH: mov x17, sp +; SLH: and x17, x17, x16 +; SLH: mov sp, x17 +; NOSLH-NOT: mov x17, sp +; NOSLH-NOT: and x17, x17, x16 +; NOSLH-NOT: mov sp, x17 +; GlobalISel doesn't optimize tail calls (yet?), so only check that +; cross-call taint register setup code is missing if a tail call was +; actually produced. +; SLH: {{(bl tail_callee[[:space:]] cmp sp, #0)|(b tail_callee)}} +; SLH-NOT: cmp sp, #0 + %call = tail call i32 @tail_callee(i32 %a) + ret i32 %call +} + +declare i32 @tail_callee(i32) local_unnamed_addr + +; Verify that no cb(n)z/tb(n)z instructions are produced when implementing +; SLH +define i32 @compare_branch_zero(i32, i32) SLHATTR { +; CHECK-LABEL: compare_branch_zero + %3 = icmp eq i32 %0, 0 + br i1 %3, label %then, label %else +;SLH-NOT: cb{{n?}}z +;NOSLH: cb{{n?}}z +then: + %4 = sdiv i32 5, %1 + ret i32 %4 +else: + %5 = sdiv i32 %1, %0 + ret i32 %5 +} + +define i32 @test_branch_zero(i32, i32) SLHATTR { +; CHECK-LABEL: test_branch_zero + %3 = and i32 %0, 16 + %4 = icmp eq i32 %3, 0 + br i1 %4, label %then, label %else +;SLH-NOT: tb{{n?}}z +;NOSLH: tb{{n?}}z +then: + %5 = sdiv i32 5, %1 + ret i32 %5 +else: + %6 = sdiv i32 %1, %0 + ret i32 %6 +} + +define i32 @landingpad(i32 %l0, i32 %l1) SLHATTR personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: landingpad +entry: +; SLH: cmp sp, #0 +; SLH: csetm x16, ne +; NOSLH-NOT: cmp sp, #0 +; NOSLH-NOT: csetm x16, ne +; CHECK: bl _Z10throwing_fv + invoke void @_Z10throwing_fv() + to label %exit unwind label %lpad +; SLH: cmp sp, #0 +; SLH: csetm x16, ne + +lpad: + %l4 = landingpad { i8*, i32 } + catch i8* null +; SLH: cmp sp, #0 +; SLH: csetm x16, ne +; NOSLH-NOT: cmp sp, #0 +; NOSLH-NOT: csetm x16, ne + %l5 = extractvalue { i8*, i32 } %l4, 0 + %l6 = tail call i8* @__cxa_begin_catch(i8* %l5) + %l7 = icmp sgt i32 %l0, %l1 + br i1 %l7, label %then, label %else +; GlobalISel lowers the branch to a b.ne sometimes instead of b.ge as expected.. +; CHECK: b.[[COND:(le)|(gt)|(ne)]] + +then: +; SLH-DAG: csel x16, x16, xzr, [[COND]] + %l9 = sdiv i32 %l0, %l1 + br label %postif + +else: +; SLH-DAG: csel x16, x16, xzr, {{(gt)|(le)|(eq)}} + %l11 = sdiv i32 %l1, %l0 + br label %postif + +postif: + %l13 = phi i32 [ %l9, %then ], [ %l11, %else ] + tail call void @__cxa_end_catch() + br label %exit + +exit: + %l15 = phi i32 [ %l13, %postif ], [ 0, %entry ] + ret i32 %l15 +} + +declare i32 @__gxx_personality_v0(...) +declare void @_Z10throwing_fv() local_unnamed_addr +declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr +declare void @__cxa_end_catch() local_unnamed_addr diff --git a/test/CodeGen/AArch64/speculation-hardening.mir b/test/CodeGen/AArch64/speculation-hardening.mir new file mode 100644 index 0000000000000000000000000000000000000000..cf8357d9558b0064b54c9f618e97411ff830bde9 --- /dev/null +++ b/test/CodeGen/AArch64/speculation-hardening.mir @@ -0,0 +1,117 @@ +# RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu \ +# RUN: -start-before aarch64-speculation-hardening -o - %s \ +# RUN: | FileCheck %s --dump-input-on-failure + +# Check that the speculation hardening pass generates code as expected for +# basic blocks ending with a variety of branch patterns: +# - (1) no branches (fallthrough) +# - (2) one unconditional branch +# - (3) one conditional branch + fall-through +# - (4) one conditional branch + one unconditional branch +# - other direct branches don't seem to be generated by the AArch64 codegen +--- | + define void @nobranch_fallthrough(i32 %a, i32 %b) speculative_load_hardening { + ret void + } + define void @uncondbranch(i32 %a, i32 %b) speculative_load_hardening { + ret void + } + define void @condbranch_fallthrough(i32 %a, i32 %b) speculative_load_hardening { + ret void + } + define void @condbranch_uncondbranch(i32 %a, i32 %b) speculative_load_hardening { + ret void + } + define void @indirectbranch(i32 %a, i32 %b) speculative_load_hardening { + ret void + } +... +--- +name: nobranch_fallthrough +tracksRegLiveness: true +body: | + ; CHECK-LABEL: nobranch_fallthrough + bb.0: + successors: %bb.1 + liveins: $w0, $w1 + ; CHECK-NOT: csel + bb.1: + liveins: $w0 + RET undef $lr, implicit $w0 +... +--- +name: uncondbranch +tracksRegLiveness: true +body: | + ; CHECK-LABEL: uncondbranch + bb.0: + successors: %bb.1 + liveins: $w0, $w1 + B %bb.1 + ; CHECK-NOT: csel + bb.1: + liveins: $w0 + RET undef $lr, implicit $w0 +... +--- +name: condbranch_fallthrough +tracksRegLiveness: true +body: | + ; CHECK-LABEL: condbranch_fallthrough + bb.0: + successors: %bb.1, %bb.2 + liveins: $w0, $w1 + $wzr = SUBSWrs renamable $w0, renamable $w1, 0, implicit-def $nzcv, implicit-def $nzcv + Bcc 11, %bb.2, implicit $nzcv + ; CHECK: b.lt [[BB_LT_T:\.LBB[0-9_]+]] + + bb.1: + liveins: $nzcv, $w0 + ; CHECK: csel x16, x16, xzr, ge + RET undef $lr, implicit $w0 + bb.2: + liveins: $nzcv, $w0 + ; CHECK: csel x16, x16, xzr, lt + RET undef $lr, implicit $w0 +... +--- +name: condbranch_uncondbranch +tracksRegLiveness: true +body: | + ; CHECK-LABEL: condbranch_uncondbranch + bb.0: + successors: %bb.1, %bb.2 + liveins: $w0, $w1 + $wzr = SUBSWrs renamable $w0, renamable $w1, 0, implicit-def $nzcv, implicit-def $nzcv + Bcc 11, %bb.2, implicit $nzcv + B %bb.1, implicit $nzcv + ; CHECK: b.lt [[BB_LT_T:\.LBB[0-9_]+]] + + bb.1: + liveins: $nzcv, $w0 + ; CHECK: csel x16, x16, xzr, ge + RET undef $lr, implicit $w0 + bb.2: + liveins: $nzcv, $w0 + ; CHECK: csel x16, x16, xzr, lt + RET undef $lr, implicit $w0 +... +--- +name: indirectbranch +tracksRegLiveness: true +body: | + ; Check that no instrumentation is done on indirect branches (for now). + ; CHECK-LABEL: indirectbranch + bb.0: + successors: %bb.1, %bb.2 + liveins: $x0 + BR $x0 + bb.1: + liveins: $x0 + ; CHECK-NOT: csel + RET undef $lr, implicit $x0 + bb.2: + liveins: $x0 + ; CHECK-NOT: csel + RET undef $lr, implicit $x0 +... diff --git a/test/CodeGen/AArch64/stack-protector-target.ll b/test/CodeGen/AArch64/stack-protector-target.ll index 787e4a76ec01b7192a5b4fcab6dbc7d3fd0f5777..e54fb8c4b0a13e434308ade52827f38db93ee747 100644 --- a/test/CodeGen/AArch64/stack-protector-target.ll +++ b/test/CodeGen/AArch64/stack-protector-target.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefix=ANDROID-AARCH64 %s ; RUN: llc -mtriple=aarch64-fuchsia < %s -o - | FileCheck --check-prefixes=FUCHSIA-AARCH64-COMMON,FUCHSIA-AARCH64-USER %s ; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel < %s -o - | FileCheck --check-prefixes=FUCHSIA-AARCH64-COMMON,FUCHSIA-AARCH64-KERNEL %s +; RUN: llc -mtriple=aarch64-windows < %s -o - | FileCheck --check-prefix=WINDOWS-AARCH64 %s define void @_Z1fv() sspreq { entry: @@ -27,3 +28,10 @@ declare void @_Z7CapturePi(i32*) ; FUCHSIA-AARCH64-COMMON: ldur [[C:.*]], {{\[}}[[A]], #-16] ; FUCHSIA-AARCH64-COMMON: ldr [[D:.*]], [sp, ; FUCHSIA-AARCH64-COMMON: cmp [[C]], [[D]] + +; WINDOWS-AARCH64: adrp x8, __security_cookie +; WINDOWS-AARCH64: ldr x8, [x8, __security_cookie] +; WINDOWS-AARCH64: str x8, [sp, #8] +; WINDOWS-AARCH64: bl _Z7CapturePi +; WINDOWS-AARCH64: ldr x0, [sp, #8] +; WINDOWS-AARCH64: bl __security_check_cookie diff --git a/test/CodeGen/AArch64/swifterror.ll b/test/CodeGen/AArch64/swifterror.ll index 637ff3e2e29bb3325fe19577a074b07c1db3f107..8ea89464ab0e9107bba35f260f754ed5e22e9e8f 100644 --- a/test/CodeGen/AArch64/swifterror.ll +++ b/test/CodeGen/AArch64/swifterror.ll @@ -314,13 +314,12 @@ define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) { ; CHECK-APPLE-DAG: strb [[ID]], [x0, #8] ; First vararg -; CHECK-APPLE-DAG: orr {{x[0-9]+}}, [[ARGS]], #0x8 ; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #16] ; Second vararg -; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8 +; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #24] ; CHECK-APPLE-DAG: add {{x[0-9]+}}, {{x[0-9]+}}, #16 ; Third vararg -; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8 +; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #32] ; CHECK-APPLE: mov x21, x0 ; CHECK-APPLE-NOT: x21 diff --git a/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll b/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll index 41de7e8af195970f9a4262914b64387f6cc374c9..f3dd80f71fe0e410d5c7957e9d346ad2bf37885e 100644 --- a/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll +++ b/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll @@ -65,9 +65,9 @@ define i64 @out64(i64 %x, i64 %y, i64 %mask) { define i8 @in8(i8 %x, i8 %y, i8 %mask) { ; CHECK-LABEL: in8: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, w2 -; CHECK-NEXT: bic w9, w1, w2 -; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: bic w8, w1, w2 +; CHECK-NEXT: and w9, w0, w2 +; CHECK-NEXT: orr w0, w9, w8 ; CHECK-NEXT: ret %n0 = xor i8 %x, %y %n1 = and i8 %n0, %mask @@ -78,9 +78,9 @@ define i8 @in8(i8 %x, i8 %y, i8 %mask) { define i16 @in16(i16 %x, i16 %y, i16 %mask) { ; CHECK-LABEL: in16: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, w2 -; CHECK-NEXT: bic w9, w1, w2 -; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: bic w8, w1, w2 +; CHECK-NEXT: and w9, w0, w2 +; CHECK-NEXT: orr w0, w9, w8 ; CHECK-NEXT: ret %n0 = xor i16 %x, %y %n1 = and i16 %n0, %mask diff --git a/test/CodeGen/AArch64/vcvt-oversize.ll b/test/CodeGen/AArch64/vcvt-oversize.ll index b6e25cfadaa918a1571fce84a03adc068ebaa89d..823fe44744c52f0ff4858146233206dab6d5f839 100644 --- a/test/CodeGen/AArch64/vcvt-oversize.ll +++ b/test/CodeGen/AArch64/vcvt-oversize.ll @@ -2,14 +2,14 @@ define <8 x i8> @float_to_i8(<8 x float>* %in) { ; CHECK-LABEL: float_to_i8: -; CHECK: ldp q1, q0, [x0] -; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v1.4s, v1.4s -; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v0.4s, v0.4s -; CHECK-DAG: fcvtzu v[[LSB2:[0-9]+]].4s, v[[LSB]].4s -; CHECK-DAG: fcvtzu v[[MSB2:[0-9]+]].4s, v[[MSB]].4s +; CHECK: ldp q0, q1, [x0] +; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v0.4s, v0.4s +; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v1.4s, v1.4s +; CHECK-DAG: fcvtzs v[[LSB2:[0-9]+]].4s, v[[LSB]].4s +; CHECK-DAG: fcvtzs v[[MSB2:[0-9]+]].4s, v[[MSB]].4s ; CHECK-DAG: xtn v[[TMP:[0-9]+]].4h, v[[LSB]].4s -; CHECK-DAG: xtn2 v[[TMP]].8h, v[[MSB]].4s -; CHECK-DAG: xtn v0.8b, v[[TMP]].8h +; CHECK-DAG: xtn v[[TMP2:[0-9]+]].4h, v[[MSB]].4s +; CHECK-DAG: uzp1 v0.8b, v[[TMP]].8b, v[[TMP2]].8b %l = load <8 x float>, <8 x float>* %in %scale = fmul <8 x float> %l, %conv = fptoui <8 x float> %scale to <8 x i8> diff --git a/test/CodeGen/AArch64/windows-SEH-support.ll b/test/CodeGen/AArch64/windows-SEH-support.ll new file mode 100644 index 0000000000000000000000000000000000000000..499ae782beb1c5dcf615418fa73423aea855ab62 --- /dev/null +++ b/test/CodeGen/AArch64/windows-SEH-support.ll @@ -0,0 +1,36 @@ +; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype asm -o /dev/null %s + +declare dllimport void @f() local_unnamed_addr + +declare dso_local i32 @__C_specific_handler(...) + +define hidden swiftcc void @g() unnamed_addr personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) { +entry: + invoke void @f() to label %__try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %__except] unwind to caller + +__except: + %1 = catchpad within %0 [i8* null] ; preds = %catch.dispatch + catchret from %1 to label %__try.cont + +__try.cont: ; preds = %__except, %entry + ret void +} + +define hidden fastcc void @h() unnamed_addr personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) { +entry: + invoke void @f() to label %__try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %__except] unwind to caller + +__except: ; preds = %catch.dispatch + %1 = catchpad within %0 [i8* null] + catchret from %1 to label %__try.cont + +__try.cont: ; preds = %__except, %entry + ret void +} + diff --git a/test/CodeGen/AArch64/wineh-mingw.ll b/test/CodeGen/AArch64/wineh-mingw.ll new file mode 100644 index 0000000000000000000000000000000000000000..ae26b06cfa3857d7c513ddac3a28a12fd5aa21df --- /dev/null +++ b/test/CodeGen/AArch64/wineh-mingw.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -exception-model=wineh -mtriple=aarch64-pc-mingw32 | FileCheck %s -check-prefix=WINEH +; RUN: llc < %s -exception-model=wineh -mtriple=aarch64-pc-mingw32 -filetype=obj | llvm-readobj -s | FileCheck %s -check-prefix=WINEH-SECTIONS + +; Check emission of eh handler and handler data +declare i32 @_d_eh_personality(i32, i32, i64, i8*, i8*) +declare void @_d_eh_resume_unwind(i8*) + +declare i32 @bar() + +define i32 @foo4() #0 personality i32 (i32, i32, i64, i8*, i8*)* @_d_eh_personality { +entry: + %step = alloca i32, align 4 + store i32 0, i32* %step + %tmp = load i32, i32* %step + + %tmp1 = invoke i32 @bar() + to label %finally unwind label %landingpad + +finally: + store i32 1, i32* %step + br label %endtryfinally + +landingpad: + %landing_pad = landingpad { i8*, i32 } + cleanup + %tmp3 = extractvalue { i8*, i32 } %landing_pad, 0 + store i32 2, i32* %step + call void @_d_eh_resume_unwind(i8* %tmp3) + unreachable + +endtryfinally: + %tmp10 = load i32, i32* %step + ret i32 %tmp10 +} +; WINEH-LABEL: foo4: +; WINEH: .seh_proc foo4 +; WINEH: .seh_handler _d_eh_personality, @unwind, @except +; WINEH: ret +; WINEH: .section .xdata,"dr" +; WINEH-NEXT: .seh_handlerdata +; WINEH-NEXT: .text +; WINEH-NEXT: .seh_endproc +; WINEH: .section .xdata,"dr" +; WINEH-NEXT: .p2align 2 +; WINEH-NEXT: GCC_except_table0: + +; WINEH-SECTIONS: Name: .xdata +; WINEH-SECTIONS-NOT: Name: .gcc_except_table diff --git a/test/CodeGen/AArch64/wineh-try-catch-nobase.ll b/test/CodeGen/AArch64/wineh-try-catch-nobase.ll new file mode 100644 index 0000000000000000000000000000000000000000..9d3af8043d7ad6b837d11a6b385efacefdb0628f --- /dev/null +++ b/test/CodeGen/AArch64/wineh-try-catch-nobase.ll @@ -0,0 +1,49 @@ +; RUN: llc -o - %s -mtriple=aarch64-windows -verify-machineinstrs | FileCheck %s + +; Make sure we don't have a base pointer. +; CHECK-LABEL: "?a@@YAXXZ": +; CHECK-NOT: x19 + +; Check that we compute the address relative to fp. +; CHECK-LABEL: "?catch$2@?0??a@@YAXXZ@4HA": +; CHECK: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-NEXT: sub x0, x29, #16 ; =16 +; CHECK-NEXT: mov x1, xzr +; CHECK-NEXT: bl "?bb@@YAXPEAHH@Z" +; CHECK-NEXT: adrp x0, .LBB0_1 +; CHECK-NEXT: add x0, x0, .LBB0_1 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret + +target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-windows-msvc19.11.0" + +define dso_local void @"?a@@YAXXZ"(i64 %p1) personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +entry: + %a = alloca i32, align 16 + %0 = bitcast i32* %a to i8* + store i32 305419896, i32* %a, align 16 + invoke void @"?bb@@YAXPEAHH@Z"(i32* nonnull %a, i32* null) + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %1 = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %2 = catchpad within %1 [i8* null, i32 64, i8* null] + call void @"?bb@@YAXPEAHH@Z"(i32* nonnull %a, i32* null) [ "funclet"(token %2) ] + catchret from %2 to label %try.cont + +try.cont: ; preds = %entry, %catch + call void @"?cc@@YAXXZ"() + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) + +declare dso_local void @"?bb@@YAXPEAHH@Z"(i32*, i32*) + +declare dso_local i32 @__CxxFrameHandler3(...) + +declare dso_local void @"?cc@@YAXXZ"() + diff --git a/test/CodeGen/AArch64/wineh-try-catch-realign.ll b/test/CodeGen/AArch64/wineh-try-catch-realign.ll new file mode 100644 index 0000000000000000000000000000000000000000..78255fbb16691de1388bd88d48b353e99d729eed --- /dev/null +++ b/test/CodeGen/AArch64/wineh-try-catch-realign.ll @@ -0,0 +1,59 @@ +; RUN: llc -o - %s -mtriple=aarch64-windows -verify-machineinstrs | FileCheck %s + +; Make sure we have a base pointer. +; CHECK-LABEL: "?a@@YAXXZ": +; CHECK: and sp, x9, #0xffffffffffffffc0 +; CHECK: mov x19, sp + +; Make sure the funclet prologue/epilogue are correct: specifically, +; it shouldn't access the parent's frame via sp, and the prologue and +; epilogue should be symmetrical. +; CHECK-LABEL: "?catch$2@?0??a@@YAXXZ@4HA": +; CHECK: str x28, [sp, #-32]! +; CHECK-NEXT: str x19, [sp, #8] +; CHECK-NEXT: stp x29, x30, [sp, #16] +; CHECK-NEXT: add x0, x19, #64 +; CHECK-NEXT: mov w1, wzr +; CHECK-NEXT: bl "?bb@@YAXPEAHH@Z" +; CHECK-NEXT: adrp x0, .LBB0_1 +; CHECK-NEXT: add x0, x0, .LBB0_1 +; CHECK-NEXT: ldp x29, x30, [sp, #16] +; CHECK-NEXT: ldr x19, [sp, #8] +; CHECK-NEXT: ldr x28, [sp], #32 +; CHECK-NEXT: ret + + +target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-windows-msvc19.11.0" + +define dso_local void @"?a@@YAXXZ"() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +entry: + %a = alloca [100 x i32], align 64 + %0 = bitcast [100 x i32]* %a to i8* + call void @llvm.memset.p0i8.i64(i8* nonnull align 64 %0, i8 0, i64 400, i1 false) + %1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i64 0, i64 0 + store i32 305419896, i32* %1, align 64 + invoke void @"?bb@@YAXPEAHH@Z"(i32* nonnull %1, i32 1) + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %2 = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %3 = catchpad within %2 [i8* null, i32 64, i8* null] + call void @"?bb@@YAXPEAHH@Z"(i32* nonnull %1, i32 0) [ "funclet"(token %3) ] + catchret from %3 to label %try.cont + +try.cont: ; preds = %entry, %catch + call void @"?cc@@YAXXZ"() + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) + +declare dso_local void @"?bb@@YAXPEAHH@Z"(i32*, i32) + +declare dso_local i32 @__CxxFrameHandler3(...) + +declare dso_local void @"?cc@@YAXXZ"() + diff --git a/test/CodeGen/AArch64/wineh-try-catch-vla.ll b/test/CodeGen/AArch64/wineh-try-catch-vla.ll new file mode 100644 index 0000000000000000000000000000000000000000..7ae5ca54ff913d9aabe8b6c9e407cd9b238a7c5f --- /dev/null +++ b/test/CodeGen/AArch64/wineh-try-catch-vla.ll @@ -0,0 +1,52 @@ +; RUN: llc -o - %s -mtriple=aarch64-windows -verify-machineinstrs | FileCheck %s + +; Check we don't have a base pointer. +; CHECK-LABEL: "?a@@YAXXZ": +; CHECK-NOT: x19 + +; Make sure the prologue and epilogue are sane. Make sure the +; frame index is relative to the FP, since there is no base pointer. +; (Funclets aren't allowed to contain dynamic allocas.) +; CHECK-LABEL: "?catch$2@?0??a@@YAXXZ@4HA": +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: ldur x0, [x29, #-8] +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: bl "?bb@@YAXPEAHH@Z" +; CHECK-NEXT: adrp x0, .LBB0_1 +; CHECK-NEXT: add x0, x0, .LBB0_1 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret + +target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-windows-msvc19.11.0" + +define dso_local void @"?a@@YAXXZ"(i64 %p1) personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +entry: + %a = alloca i32, i64 %p1, align 16 + %0 = bitcast i32* %a to i8* + call void @llvm.memset.p0i8.i64(i8* nonnull align 16 %0, i8 0, i64 400, i1 false) + store i32 305419896, i32* %a, align 16 + invoke void @"?bb@@YAXPEAHH@Z"(i32* nonnull %a, i32* null) + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %1 = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %2 = catchpad within %1 [i8* null, i32 64, i8* null] + call void @"?bb@@YAXPEAHH@Z"(i32* nonnull %a, i32* %a) [ "funclet"(token %2) ] + catchret from %2 to label %try.cont + +try.cont: ; preds = %entry, %catch + call void @"?cc@@YAXXZ"() + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) + +declare dso_local void @"?bb@@YAXPEAHH@Z"(i32*, i32*) + +declare dso_local i32 @__CxxFrameHandler3(...) + +declare dso_local void @"?cc@@YAXXZ"() + diff --git a/test/CodeGen/AArch64/wineh-try-catch.ll b/test/CodeGen/AArch64/wineh-try-catch.ll new file mode 100644 index 0000000000000000000000000000000000000000..940a86282d39a1a7b0642c9dabb5f81657642083 --- /dev/null +++ b/test/CodeGen/AArch64/wineh-try-catch.ll @@ -0,0 +1,197 @@ +; RUN: llc -o - %s -mtriple=aarch64-windows -verify-machineinstrs | FileCheck %s +; RUN: llc -o %t -filetype=obj %s -mtriple=aarch64-windows +; RUN: llvm-readobj -unwind %t | FileCheck %s -check-prefix=UNWIND + +; We test the following +; 1) That the unwind help object is created and that its offset from the stack +; pointer on entry is patched into the table fed to __CxxFrameHandler3 +; 2) That the stack update for the catch funclet only includes the callee saved +; registers +; 3) That the locals are accessed using the frame pointer in both the funclet +; and the parent function. + +; The following checks that the unwind help object has -2 stored into it at +; fp - 400 - 256 = fp - 656, which is on-entry sp - 48 + 32 - 656 = +; on-entry sp - 672. We check this offset in the table later on. + +; CHECK-LABEL: "?func@@YAHXZ": +; CHECK: str x28, [sp, #-48]! +; CHECK: str x21, [sp, #8] +; CHECK: stp x19, x20, [sp, #16] +; CHECK: stp x29, x30, [sp, #32] +; CHECK: add x29, sp, #32 +; CHECK: sub sp, sp, #624 +; CHECK: mov x19, sp +; CHECK: orr x1, xzr, #0xfffffffffffffffe +; CHECK: stur x1, [x19] + +; Now check that x is stored at fp - 20. We check that this is the same +; location accessed from the funclet to retrieve x. +; CHECK: orr w8, wzr, #0x1 +; CHECK: stur w8, [x29, [[X_OFFSET:#-[1-9][0-9]+]] + +; Check the offset off the frame pointer at which B is located. +; Check the same offset is used to pass the address of B to init2 in the +; funclet. +; CHECK: sub x0, x29, [[B_OFFSET:#[1-9][0-9]+]] +; CHECK: bl "?init@@YAXPEAH@Z" + +; This is the label for the throw that is encoded in the ip2state. +; We are inside the try block, where we make a call to func2 +; CHECK-LABEL: .Ltmp0: +; CHECK: bl "?func2@@YAHXZ + +; CHECK: [[CATCHRETDEST:.LBB0_[0-9]+]]: ; %catchret.dest + +; Check the catch funclet. +; CHECK-LABEL: "?catch$2@?0??func@@YAHXZ@4HA": + +; Check that the stack space is allocated only for the callee saved registers. +; CHECK: str x28, [sp, #-48]! +; CHECK: str x21, [sp, #8] +; CHECK: stp x19, x20, [sp, #16] +; CHECK: stp x29, x30, [sp, #32] +; CHECK: add x20, x19, #12 + +; Check that there are no further stack updates. +; CHECK-NOT: sub sp, sp + +; Check that the stack address passed to init2 is off the frame pointer, and +; that it matches the address of B in the parent function. +; CHECK: sub x0, x29, [[B_OFFSET]] +; CHECK: bl "?init2@@YAXPEAH@Z" + +; Check that are storing x back to the same location off the frame pointer as in +; the parent function. +; CHECK: stur w8, [x29, [[X_OFFSET]]] + +; Check that the funclet branches back to the catchret destination +; CHECK: adrp x0, .LBB0_3 +; CHECK-NEXT: add x0, x0, [[CATCHRETDEST]] + + +; Now check that the offset of the unwind help object from the stack pointer on +; entry to func is encoded in cppxdata that is passed to __CxxFrameHandler3. As +; computed above, this comes to -672. +; CHECK-LABEL: "$cppxdata$?func@@YAHXZ": +; CHECK-NEXT: .word 429065506 ; MagicNumber +; CHECK-NEXT: .word 2 ; MaxState +; CHECK-NEXT: .word ("$stateUnwindMap$?func@@YAHXZ")@IMGREL ; UnwindMap +; CHECK-NEXT: .word 1 ; NumTryBlocks +; CHECK-NEXT: .word ("$tryMap$?func@@YAHXZ")@IMGREL ; TryBlockMap +; CHECK-NEXT: .word 4 ; IPMapEntries +; CHECK-NEXT: .word ("$ip2state$?func@@YAHXZ")@IMGREL ; IPToStateXData +; CHECK-NEXT: .word -672 ; UnwindHelp + +; UNWIND: Function: ?func@@YAHXZ (0x0) +; UNWIND: Prologue [ +; UNWIND-NEXT: ; nop +; UNWIND-NEXT: ; sub sp, #624 +; UNWIND-NEXT: ; add fp, sp, #32 +; UNWIND-NEXT: ; stp x29, x30, [sp, #32] +; UNWIND-NEXT: ; stp x19, x20, [sp, #16] +; UNWIND-NEXT: ; str x21, [sp, #8] +; UNWIND-NEXT: ; str x28, [sp, #48]! +; UNWIND-NEXT: ; end +; UNWIND: Function: ?catch$2@?0??func@@YAHXZ@4HA +; UNWIND: Prologue [ +; UNWIND-NEXT: ; stp x29, x30, [sp, #32] +; UNWIND-NEXT: ; stp x19, x20, [sp, #16] +; UNWIND-NEXT: ; str x21, [sp, #8] +; UNWIND-NEXT: ; str x28, [sp, #48]! +; UNWIND-NEXT: ; end + +target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-windows-msvc19.11.0" + +%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] } +%eh.CatchableType = type { i32, i32, i32, i32, i32, i32, i32 } +%eh.CatchableTypeArray.1 = type { i32, [1 x i32] } +%eh.ThrowInfo = type { i32, i32, i32, i32 } + +$"??_R0H@8" = comdat any + +$"_CT??_R0H@84" = comdat any + +$_CTA1H = comdat any + +$_TI1H = comdat any + +@"??_7type_info@@6B@" = external constant i8* +@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat +@__ImageBase = external dso_local constant i8 +@"_CT??_R0H@84" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 1, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%rtti.TypeDescriptor2* @"??_R0H@8" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32), i32 0, i32 -1, i32 0, i32 4, i32 0 }, section ".xdata", comdat +@_CTA1H = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.1 { i32 1, [1 x i32] [i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableType* @"_CT??_R0H@84" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32)] }, section ".xdata", comdat +@_TI1H = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 0, i32 0, i32 0, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableTypeArray.1* @_CTA1H to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32) }, section ".xdata", comdat + +; Function Attrs: noinline optnone +define dso_local i32 @"?func@@YAHXZ"() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +entry: + %B = alloca [50 x i32], align 4 + %x = alloca i32, align 4 + %tmp = alloca i32, align 4 + %i = alloca i32, align 4 + %C = alloca [100 x i32], align 4 + store i32 1, i32* %x, align 4 + %arraydecay = getelementptr inbounds [50 x i32], [50 x i32]* %B, i32 0, i32 0 + call void @"?init@@YAXPEAH@Z"(i32* %arraydecay) + %call = invoke i32 @"?func2@@YAHXZ"() + to label %invoke.cont unwind label %catch.dispatch + +invoke.cont: ; preds = %entry + store i32 %call, i32* %tmp, align 4 + %0 = bitcast i32* %tmp to i8* + invoke void @_CxxThrowException(i8* %0, %eh.ThrowInfo* @_TI1H) #2 + to label %unreachable unwind label %catch.dispatch + +catch.dispatch: ; preds = %invoke.cont, %entry + %1 = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %2 = catchpad within %1 [%rtti.TypeDescriptor2* @"??_R0H@8", i32 0, i32* %i] + %arraydecay1 = getelementptr inbounds [100 x i32], [100 x i32]* %C, i32 0, i32 0 + call void @"?init@@YAXPEAH@Z"(i32* %arraydecay1) [ "funclet"(token %2) ] + %arraydecay2 = getelementptr inbounds [50 x i32], [50 x i32]* %B, i32 0, i32 0 + call void @"?init2@@YAXPEAH@Z"(i32* %arraydecay2) [ "funclet"(token %2) ] + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds [50 x i32], [50 x i32]* %B, i64 0, i64 %idxprom + %4 = load i32, i32* %arrayidx, align 4 + %5 = load i32, i32* %i, align 4 + %idxprom3 = sext i32 %5 to i64 + %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* %C, i64 0, i64 %idxprom3 + %6 = load i32, i32* %arrayidx4, align 4 + %add = add nsw i32 %4, %6 + %7 = load i32, i32* %i, align 4 + %8 = load i32, i32* %i, align 4 + %mul = mul nsw i32 %7, %8 + %add5 = add nsw i32 %add, %mul + store i32 %add5, i32* %x, align 4 + catchret from %2 to label %catchret.dest + +catchret.dest: ; preds = %catch + br label %try.cont + +try.cont: ; preds = %catchret.dest + %arrayidx6 = getelementptr inbounds [50 x i32], [50 x i32]* %B, i64 0, i64 2 + %9 = load i32, i32* %arrayidx6, align 4 + %10 = load i32, i32* %x, align 4 + %add7 = add nsw i32 %9, %10 + ret i32 %add7 + +unreachable: ; preds = %invoke.cont + unreachable +} + +declare dso_local void @"?init@@YAXPEAH@Z"(i32*) + +declare dso_local i32 @"?func2@@YAHXZ"() + +declare dso_local i32 @__CxxFrameHandler3(...) + +declare dllimport void @_CxxThrowException(i8*, %eh.ThrowInfo*) + +declare dso_local void @"?init2@@YAXPEAH@Z"(i32*) + +attributes #0 = { noinline optnone } +attributes #2 = { noreturn } diff --git a/test/CodeGen/AArch64/xor.ll b/test/CodeGen/AArch64/xor.ll new file mode 100644 index 0000000000000000000000000000000000000000..f7cd8bf0c4c83c2fe4fdae826406dd8a88f8a218 --- /dev/null +++ b/test/CodeGen/AArch64/xor.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s + +define i32 @PR39657(i8* %p, i64 %x) { +; CHECK-LABEL: PR39657: +; CHECK: // %bb.0: +; CHECK-NEXT: mvn x8, x1 +; CHECK-NEXT: ldr w0, [x0, x8, lsl #2] +; CHECK-NEXT: ret + %sh = shl i64 %x, 2 + %mul = xor i64 %sh, -4 + %add.ptr = getelementptr inbounds i8, i8* %p, i64 %mul + %bc = bitcast i8* %add.ptr to i32* + %load = load i32, i32* %bc, align 4 + ret i32 %load +} + diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir index 3f8d58522caa4bbf0c26de828bde7809184e3888..57e1148bc59eefbce645dcac8a2a43023a156fb8 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir @@ -1,16 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s ---- | - define void @test_add() { ret void } -... - --- name: test_add -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.0: liveins: $vgpr0, $vgpr1 @@ -19,8 +11,8 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] - %0(s32) = COPY $vgpr0 - %1(s32) = COPY $vgpr1 - %2(s32) = G_ADD %0, %1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_ADD %0, %1 $vgpr0 = COPY %2 ... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir index 0282ff40837d6b5984d574859ba7040c8cb44a8a..f561cc7d092d6fae8ad64c306f850b86284a27f5 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir @@ -1,16 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s ---- | - define void @test_and() { ret void } -... - --- name: test_and -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.0: liveins: $vgpr0, $vgpr1 @@ -19,8 +11,8 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[COPY1]] - %0(s32) = COPY $vgpr0 - %1(s32) = COPY $vgpr1 - %2(s32) = G_AND %0, %1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_AND %0, %1 $vgpr0 = COPY %2 ... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir index 71a9de8e6bfbc009d70630dc6e723e7804585c2a..59b5ac993ce47fc2c2708c0bc170eec6bd79c90b 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir @@ -3,10 +3,6 @@ --- name: test_ashr -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.0.entry: liveins: $vgpr0, $vgpr1 @@ -15,8 +11,8 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[COPY1]] - %0(s32) = COPY $vgpr0 - %1(s32) = COPY $vgpr1 - %2(s32) = G_ASHR %0, %1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_ASHR %0, %1 $vgpr0 = COPY %2 ... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir index 5bf39e38c2cdaf78759d9677f6dee83cc33904d6..492da48a93f0913aeb20395be257f8b245eaa0ca 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir @@ -1,16 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s ---- | - define void @test_bitcast() { ret void } -... - --- name: test_bitcast -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.0: liveins: $vgpr0 @@ -19,8 +11,8 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY]](s32) ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) - %0(s32) = COPY $vgpr0 - %1(<2 x s16>) = G_BITCAST %0 - %2(s32) = G_BITCAST %1 + %0:_(s32) = COPY $vgpr0 + %1:_(<2 x s16>) = G_BITCAST %0 + %2:_(s32) = G_BITCAST %1 $vgpr0 = COPY %2 ... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-block-addr.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-block-addr.mir new file mode 100644 index 0000000000000000000000000000000000000000..c532d117e07b3e05d52a98d0b46d84eb42cd5f02 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-block-addr.mir @@ -0,0 +1,28 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s +--- | + + @addr = global i8* null + + define void @test_blockaddress() { + store i8* blockaddress(@test_blockaddress, %block), i8** @addr + indirectbr i8* blockaddress(@test_blockaddress, %block), [label %block] + + block: + ret void + } + +... +--- +name: test_blockaddress +alignment: 4 +tracksRegLiveness: true +body: | + bb.1 (%ir-block.0): + ; CHECK-LABEL: name: test_blockaddress + ; CHECK: [[BLOCK_ADDR:%[0-9]+]]:_(p0) = G_BLOCK_ADDR blockaddress(@test_blockaddress, %ir-block.block) + ; CHECK: S_ENDPGM implicit [[BLOCK_ADDR]](p0) + %0:_(p0) = G_BLOCK_ADDR blockaddress(@test_blockaddress, %ir-block.block) + S_ENDPGM implicit %0 + +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir index 194d14afe24a6b5bf8900da471dc206ffc7b21cd..ec609f296391f119fa0eb49a2be7235a95e43279 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir @@ -58,12 +58,12 @@ body: | liveins: $vgpr0 ; CHECK-LABEL: name: extract_vector_elt_0_v5i32 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[MV:%[0-9]+]]:_(<5 x s32>) = G_MERGE_VALUES [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32) + ; CHECK: [[MV:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[MV]](<5 x s32>), [[C]](s32) ; CHECK: $vgpr0 = COPY [[EVEC]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(<5 x s32>) = G_MERGE_VALUES %0, %0, %0, %0, %0 + %1:_(<5 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0, %0 %2:_(s32) = G_CONSTANT i32 0 %3:_(s32) = G_EXTRACT_VECTOR_ELT %1, %2 $vgpr0 = COPY %3 @@ -77,12 +77,12 @@ body: | liveins: $vgpr0 ; CHECK-LABEL: name: extract_vector_elt_0_v6i32 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[MV:%[0-9]+]]:_(<6 x s32>) = G_MERGE_VALUES [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32) + ; CHECK: [[MV:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[MV]](<6 x s32>), [[C]](s32) ; CHECK: $vgpr0 = COPY [[EVEC]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(<6 x s32>) = G_MERGE_VALUES %0, %0, %0, %0, %0, %0 + %1:_(<6 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0, %0, %0 %2:_(s32) = G_CONSTANT i32 0 %3:_(s32) = G_EXTRACT_VECTOR_ELT %1, %2 $vgpr0 = COPY %3 @@ -96,12 +96,12 @@ body: | liveins: $vgpr0 ; CHECK-LABEL: name: extract_vector_elt_0_v7i32 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[MV:%[0-9]+]]:_(<7 x s32>) = G_MERGE_VALUES [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32) + ; CHECK: [[MV:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[MV]](<7 x s32>), [[C]](s32) ; CHECK: $vgpr0 = COPY [[EVEC]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(<7 x s32>) = G_MERGE_VALUES %0, %0, %0, %0, %0, %0, %0 + %1:_(<7 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0, %0, %0, %0 %2:_(s32) = G_CONSTANT i32 0 %3:_(s32) = G_EXTRACT_VECTOR_ELT %1, %2 $vgpr0 = COPY %3 @@ -115,12 +115,12 @@ body: | liveins: $vgpr0 ; CHECK-LABEL: name: extract_vector_elt_0_v8i32 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[MV:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32) + ; CHECK: [[MV:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[MV]](<8 x s32>), [[C]](s32) ; CHECK: $vgpr0 = COPY [[EVEC]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(<8 x s32>) = G_MERGE_VALUES %0, %0, %0, %0, %0, %0, %0, %0 + %1:_(<8 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0, %0, %0, %0, %0 %2:_(s32) = G_CONSTANT i32 0 %3:_(s32) = G_EXTRACT_VECTOR_ELT %1, %2 $vgpr0 = COPY %3 @@ -134,12 +134,12 @@ body: | liveins: $vgpr0 ; CHECK-LABEL: name: extract_vector_elt_0_v16i32 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[MV:%[0-9]+]]:_(<16 x s32>) = G_MERGE_VALUES [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32) + ; CHECK: [[MV:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[MV]](<16 x s32>), [[C]](s32) ; CHECK: $vgpr0 = COPY [[EVEC]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(<16 x s32>) = G_MERGE_VALUES %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0 + %1:_(<16 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0, %0 %2:_(s32) = G_CONSTANT i32 0 %3:_(s32) = G_EXTRACT_VECTOR_ELT %1, %2 $vgpr0 = COPY %3 diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir new file mode 100644 index 0000000000000000000000000000000000000000..31f1e41775fe73bc48c788c2b378d53d692cd958 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir @@ -0,0 +1,25 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- +name: test_fabs_f32 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_fabs_f32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_FABS %0 +... +--- +name: test_fabs_f64 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_fabs_f64 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = G_FABS %0 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir index 1a66bbabe8cf8c82fa70a0eabed2b6c5f4badb41..85ed234bf47fb51790d14ad8fcbcad6293ff2e29 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir @@ -1,27 +1,26 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -run-pass=legalizer %s -o - | FileCheck %s ---- | - define void @test_fadd() { - entry: - ret void - } - -... - --- -name: test_fadd -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } +name: test_fadd_f32 body: | bb.0.entry: liveins: $vgpr0, $vgpr1 ; CHECK-LABEL: name: test_fadd ; CHECK: %2:_(s32) = G_FADD %0, %1 - %0(s32) = COPY $vgpr0 - %1(s32) = COPY $vgpr1 - %2(s32) = G_FADD %0, %1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_FADD %0, %1 $vgpr0 = COPY %2 ... +--- +name: test_fadd_f64 +body: | + bb.0.entry: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_FADD %0, %1 + $vgpr0_vgpr1 = COPY %2 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir index bcd372e05873f47439adfb1d437e7185a8c250b4..c09903f25730767a8b278415c455e441168bc204 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir @@ -3,33 +3,25 @@ --- name: test_fcmp_f32 -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.0: liveins: $vgpr0 ; CHECK-LABEL: name: test_fcmp_f32 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - %0(s32) = G_CONSTANT i32 0 - %1(s32) = COPY $vgpr0 + %0:_(s32) = G_CONSTANT i32 0 + %1:_(s32) = COPY $vgpr0 - %2(s1) = G_FCMP floatpred(uge), %0, %1 + %2:_(s1) = G_FCMP floatpred(uge), %0, %1 ... --- name: test_fcmp_f64 -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.0: liveins: $vgpr0_vgpr1 ; CHECK-LABEL: name: test_fcmp_f64 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - %0(s64) = G_CONSTANT i64 0 - %1(s64) = COPY $vgpr0_vgpr1 + %0:_(s64) = G_CONSTANT i64 0 + %1:_(s64) = COPY $vgpr0_vgpr1 - %2(s1) = G_FCMP floatpred(uge), %0, %1 + %2:_(s1) = G_FCMP floatpred(uge), %0, %1 ... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir new file mode 100644 index 0000000000000000000000000000000000000000..954859e8cdaf309916a93b0182a8910b9911bdf6 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir @@ -0,0 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s + +--- +name: test_fma_f32 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_fma + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = G_FMA %0, %1, %2 + $vgpr0 = COPY %3 +... +--- +name: test_fma_f64 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 + + ; CHECK-LABEL: name: test_fma + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; CHECK: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[COPY]], [[COPY1]] + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = COPY $vgpr4_vgpr5 + %3:_(s64) = G_FMA %0, %1, %2 + $vgpr0_vgpr1 = COPY %3 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir index e68b1bed5fb49efff64876b4abdf8cb5f6b996ee..8a0ce1c6e91a5c69b3010b17ddde4e00695977e7 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir @@ -1,16 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s ---- | - define void @test_fmul() { ret void } -... - --- -name: test_fmul -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } +name: test_fmul_f32 body: | bb.0: liveins: $vgpr0, $vgpr1 @@ -19,8 +11,23 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] - %0(s32) = COPY $vgpr0 - %1(s32) = COPY $vgpr1 - %2(s32) = G_FMUL %0, %1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_FMUL %0, %1 $vgpr0 = COPY %2 ... +--- +name: test_fmul_f64 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_fmul + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; CHECK: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[COPY1]] + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_FMUL %0, %1 + $vgpr0_vgpr1 = COPY %2 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir new file mode 100644 index 0000000000000000000000000000000000000000..7d280f7f9c11781490a514e78cb0efbe0717a51c --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir @@ -0,0 +1,25 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- +name: test_fneg_f32 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_fneg_f32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_FNEG %0 +... +--- +name: test_fneg_f64 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_fneg_f64 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = G_FNEG %0 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir index d83eb1ae2a8da40578d78e2d6a7b1edb924782d5..aede6af2603590517f71fe17103590369c327e49 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir @@ -1,22 +1,14 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -run-pass=legalizer %s -o - | FileCheck %s ---- | - define void @test_fptoui() { ret void } -... - --- name: test_fptoui -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.0: liveins: $vgpr0 ; CHECK-LABEL: name: test_fptoui ; CHECK: %1:_(s32) = G_FPTOUI %0 - %0(s32) = COPY $vgpr0 - %1(s32) = G_FPTOUI %0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_FPTOUI %0 $vgpr0 = COPY %1 ... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir new file mode 100644 index 0000000000000000000000000000000000000000..13012f459ed3bfb245fab07f8d2cf1cae3548e8a --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir @@ -0,0 +1,36 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s + +--- +name: test_fsub_f32 +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_fsub_f32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[COPY]], [[COPY1]] + ; CHECK: $vgpr0 = COPY [[FSUB]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_FSUB %0, %1 + $vgpr0 = COPY %2 +... +--- +name: test_fsub_f64 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_fsub_f64 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; CHECK: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY1]] + ; CHECK: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[COPY]], [[FNEG]] + ; CHECK: $vgpr0_vgpr1 = COPY [[FADD]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_FSUB %0, %1 + $vgpr0_vgpr1 = COPY %2 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir index 0549c685ce46442abc154b98fc75e98aaf4ec759..00c0c96335074a36be17ebf207cfba35f1303e49 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir @@ -1,19 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -O0 -march=amdgcn -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s ---- | - define void @test_icmp() { - entry: - ret void - } -... - --- name: test_icmp -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.0.entry: liveins: $vgpr0 @@ -23,9 +12,9 @@ body: | ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[C]](s32), [[COPY]] ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C]], [[COPY]] ; CHECK: $vgpr0 = COPY [[SELECT]](s32) - %0(s32) = G_CONSTANT i32 0 - %1(s32) = COPY $vgpr0 - %2(s1) = G_ICMP intpred(ne), %0, %1 + %0:_(s32) = G_CONSTANT i32 0 + %1:_(s32) = COPY $vgpr0 + %2:_(s1) = G_ICMP intpred(ne), %0, %1 %3:_(s32) = G_SELECT %2(s1), %0(s32), %1(s32) $vgpr0 = COPY %3 ... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir similarity index 86% rename from test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir rename to test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir index c9d8a071a90dc80a1a4c6f864e1b15ea6864a35b..df966affcf264f51cc43c335516dff4827113ec0 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir @@ -23,11 +23,11 @@ body: | ; CHECK-LABEL: name: test_merge_s32_s32_v2s32 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK: [[MV:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C]](s32), [[C1]](s32) + ; CHECK: [[MV:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](<2 x s32>) %0:_(s32) = G_CONSTANT i32 0 %1:_(s32) = G_CONSTANT i32 1 - %2:_(<2 x s32>) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %2:_(<2 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32) $vgpr0_vgpr1 = COPY %2(<2 x s32>) ... @@ -39,12 +39,12 @@ body: | ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; CHECK: [[MV:%[0-9]+]]:_(<3 x s32>) = G_MERGE_VALUES [[C]](s32), [[C1]](s32), [[C2]](s32) + ; CHECK: [[MV:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32) ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](<3 x s32>) %0:_(s32) = G_CONSTANT i32 0 %1:_(s32) = G_CONSTANT i32 1 %2:_(s32) = G_CONSTANT i32 2 - %3:_(<3 x s32>) = G_MERGE_VALUES %0:_(s32), %1:_(s32), %2:_(s32) + %3:_(<3 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32), %2:_(s32) $vgpr0_vgpr1_vgpr2 = COPY %3(<3 x s32>) ... @@ -55,11 +55,11 @@ body: | ; CHECK-LABEL: name: test_merge_s64_s64_s128 ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK: [[MV:%[0-9]+]]:_(<2 x s64>) = G_MERGE_VALUES [[C]](s64), [[C1]](s64) + ; CHECK: [[MV:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64) ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](<2 x s64>) %0:_(s64) = G_CONSTANT i64 0 %1:_(s64) = G_CONSTANT i64 1 - %2:_(<2 x s64>) = G_MERGE_VALUES %0(s64), %1(s64) + %2:_(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2(<2 x s64>) ... @@ -72,13 +72,13 @@ body: | ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 - ; CHECK: [[MV:%[0-9]+]]:_(<4 x s64>) = G_MERGE_VALUES [[C]](s64), [[C1]](s64), [[C2]](s64), [[C3]](s64) + ; CHECK: [[MV:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64), [[C2]](s64), [[C3]](s64) ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[MV]](<4 x s64>) %0:_(s64) = G_CONSTANT i64 0 %1:_(s64) = G_CONSTANT i64 1 %2:_(s64) = G_CONSTANT i64 2 %3:_(s64) = G_CONSTANT i64 3 - %4:_(<4 x s64>) = G_MERGE_VALUES %0(s64), %1(s64), %2(s64), %3(s64) + %4:_(<4 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64), %2(s64), %3(s64) $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %4(<4 x s64>) ... @@ -109,6 +109,6 @@ body: | # %16:_(s32) = G_CONSTANT i32 16 -# %17:_(<17 x s32>) = G_MERGE_VALUES %0:_(s32), %1:_(s32), %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32), %6:_(s32), %7:_(s32), %8:_(s32), %9:_(s32), %10:_(s32), %11:_(s32), %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32), %16:_(s32) +# %17:_(<17 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32), %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32), %6:_(s32), %7:_(s32), %8:_(s32), %9:_(s32), %10:_(s32), %11:_(s32), %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32), %16:_(s32) # S_ENDPGM implicit %17(<17 x s32>) # ... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir index 5b176e408d27d78aa5ac8bed78501fcfe5f6ac2b..6e988dabe172f278923f2c64c9bd694a2df71e8a 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir @@ -1,15 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s ---- | - define void @test_or() { ret void } -... --- name: test_or -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.0: liveins: $vgpr0, $vgpr1 @@ -18,8 +11,8 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[COPY1]] - %0(s32) = COPY $vgpr0 - %1(s32) = COPY $vgpr1 - %2(s32) = G_OR %0, %1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_OR %0, %1 $vgpr0 = COPY %2 ... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir index ec223539cfc5170299886854a7971200178023fd..4c04612240e81435c5866b296683611fc1a0edeb 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir @@ -1,19 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -O0 -march=amdgcn -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s ---- | - define void @test_select() { ret void } -... - --- name: test_select -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } body: | bb.0: liveins: $vgpr0 @@ -24,13 +13,13 @@ body: | ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[C2]] - %0(s32) = G_CONSTANT i32 0 - %1(s32) = COPY $vgpr0 + %0:_(s32) = G_CONSTANT i32 0 + %1:_(s32) = COPY $vgpr0 - %2(s1) = G_ICMP intpred(ne), %0, %1 - %3(s32) = G_CONSTANT i32 1 - %4(s32) = G_CONSTANT i32 2 - %5(s32) = G_SELECT %2, %3, %4 + %2:_(s1) = G_ICMP intpred(ne), %0, %1 + %3:_(s32) = G_CONSTANT i32 1 + %4:_(s32) = G_CONSTANT i32 2 + %5:_(s32) = G_SELECT %2, %3, %4 $vgpr0 = COPY %5 ... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir index 5d2ec5f7fb6243115bfc4ae3a5ed81c657db8d85..cd24d48a0dbe4f51b668a3d81f51ecd3020fd8f6 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir @@ -3,10 +3,6 @@ --- name: test_shl -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.0.entry: liveins: $vgpr0, $vgpr1 @@ -15,8 +11,8 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[COPY1]] - %0(s32) = COPY $vgpr0 - %1(s32) = COPY $vgpr1 - %2(s32) = G_SHL %0, %1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_SHL %0, %1 $vgpr0 = COPY %2 ... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir index 1f4a21b73561c327dfe47388310c3408e9506367..1e004ac478b3e21e0f4bfb58098eb006eebaa044 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir @@ -16,18 +16,3 @@ body: | $vgpr2 = COPY %2(s32) ... ---- -name: test_unmerge_v2s32_s32 -body: | - bb.0: - liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: test_unmerge_v2s32_s32 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; CHECK: $vgpr0 = COPY [[UV]](s32) - ; CHECK: $vgpr2 = COPY [[UV1]](s32) - %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 - %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0:_(<2 x s32>) - $vgpr0 = COPY %1(s32) - $vgpr2 = COPY %2(s32) -... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir new file mode 100644 index 0000000000000000000000000000000000000000..828e5e41e2ebccb15e0812431a8876367f429143 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-anyext.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: anyext_i32_to_i64_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: anyext_i32_to_i64_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[SEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[COPY]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s64) = G_ANYEXT %0 +... + +--- +name: anyext_i32_to_i64_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: anyext_i32_to_i64_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[SEXT:%[0-9]+]]:vgpr(s64) = G_ANYEXT [[COPY]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s64) = G_ANYEXT %0 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir new file mode 100644 index 0000000000000000000000000000000000000000..10a9bea6a8743b1c3d27503dd3184ecaf3cb233e --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir @@ -0,0 +1,29 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass=regbankselect %s -o - | FileCheck %s + +--- | + + @addr = global i8* null + + define void @test_blockaddress() { + store i8* blockaddress(@test_blockaddress, %block), i8** @addr + indirectbr i8* blockaddress(@test_blockaddress, %block), [label %block] + + block: ; preds = %0 + ret void + } + +... +--- +name: test_blockaddress +alignment: 4 +legalized: true +body: | + bb.1 (%ir-block.0): + ; CHECK-LABEL: name: test_blockaddress + ; CHECK: [[BLOCK_ADDR:%[0-9]+]]:sgpr(p0) = G_BLOCK_ADDR blockaddress(@test_blockaddress, %ir-block.block) + ; CHECK: S_ENDPGM implicit [[BLOCK_ADDR]](p0) + %0:_(p0) = G_BLOCK_ADDR blockaddress(@test_blockaddress, %ir-block.block) + S_ENDPGM implicit %0 + +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir new file mode 100644 index 0000000000000000000000000000000000000000..9850b87959af380d36c8da2b9bb91e7fd5c90bec --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bswap.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: bswap_i32_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: bswap_i32_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[BSWAP:%[0-9]+]]:sgpr(s32) = G_BSWAP [[COPY]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_BSWAP %0 +... + +--- +name: bswap_i32_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: bswap_i32_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BSWAP:%[0-9]+]]:vgpr(s32) = G_BSWAP [[COPY]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_BSWAP %0 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir new file mode 100644 index 0000000000000000000000000000000000000000..19b64b93499beb8eb0a1e73acecdb247f54b3197 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: ctlz_zero_undef_i32_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: ctlz_zero_undef_i32_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:sgpr(s32) = G_CTLZ_ZERO_UNDEF [[COPY]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_CTLZ_ZERO_UNDEF %0 +... + +--- +name: ctlz_zero_undef_i32_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: ctlz_zero_undef_i32_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTLZ_ZERO_UNDEF [[COPY]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CTLZ_ZERO_UNDEF %0 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz.mir new file mode 100644 index 0000000000000000000000000000000000000000..441bc894f2bab43d71256959bc63ff9551e6b3d9 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: ctlz_i32_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: ctlz_i32_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[CTLZ:%[0-9]+]]:sgpr(s32) = G_CTLZ [[COPY]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_CTLZ %0 +... + +--- +name: ctlz_i32_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: ctlz_i32_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[CTLZ:%[0-9]+]]:vgpr(s32) = G_CTLZ [[COPY]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CTLZ %0 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctpop.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctpop.mir new file mode 100644 index 0000000000000000000000000000000000000000..e4694371805aa176863cd19677282fdf505d2ee1 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctpop.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: ctpop_i32_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: ctpop_i32_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[CTPOP:%[0-9]+]]:sgpr(s32) = G_CTPOP [[COPY]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_CTPOP %0 +... + +--- +name: ctpop_i32_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: ctpop_i32_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[CTPOP:%[0-9]+]]:vgpr(s32) = G_CTPOP [[COPY]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CTPOP %0 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir new file mode 100644 index 0000000000000000000000000000000000000000..127c7fb8a84ec48592e4a47d734e9d5645ff35ca --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: cttz_zero_undef_i32_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: cttz_zero_undef_i32_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:sgpr(s32) = G_CTTZ_ZERO_UNDEF [[COPY]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_CTTZ_ZERO_UNDEF %0 +... + +--- +name: cttz_zero_undef_i32_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: cttz_zero_undef_i32_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTTZ_ZERO_UNDEF [[COPY]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CTTZ_ZERO_UNDEF %0 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz.mir new file mode 100644 index 0000000000000000000000000000000000000000..c75d89bdaffbac34101a10e631b77faba5037d69 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: cttz_i32_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: cttz_i32_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[CTTZ:%[0-9]+]]:sgpr(s32) = G_CTTZ [[COPY]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_CTTZ %0 +... + +--- +name: cttz_i32_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: cttz_i32_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[CTTZ:%[0-9]+]]:vgpr(s32) = G_CTTZ [[COPY]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CTTZ %0 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fabs.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fabs.mir new file mode 100644 index 0000000000000000000000000000000000000000..c48d10191d57c40b60003f6584c1317c2a6c6248 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fabs.mir @@ -0,0 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: fabs_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: fabs_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[FABS:%[0-9]+]]:sgpr(s32) = G_FABS [[COPY]] + ; CHECK: $vgpr0 = COPY [[FABS]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_FABS %0 + $vgpr0 = COPY %1 +... + +--- +name: fabs_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: fabs_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[FABS:%[0-9]+]]:vgpr(s32) = G_FABS [[COPY]] + ; CHECK: $vgpr0 = COPY [[FABS]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_FABS %0 + $vgpr0 = COPY %1 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir new file mode 100644 index 0000000000000000000000000000000000000000..3f0dc2237dca154d3f53e2d89afb06b81ea18f12 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir @@ -0,0 +1,148 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: fma_sss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + ; CHECK-LABEL: name: fma_sss + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY3]], [[COPY4]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = G_FMA %0, %1, %2 +... +--- +name: fma_vss +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $sgpr0, $sgpr1 + ; CHECK-LABEL: name: fma_vss + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY3]], [[COPY4]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr0 + %2:_(s32) = COPY $sgpr1 + %3:_(s32) = G_FMA %0, %1, %2 +... +--- +name: fma_svs +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $sgpr1 + ; CHECK-LABEL: name: fma_svs + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY3]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $sgpr1 + %3:_(s32) = G_FMA %0, %1, %2 +... +--- +name: fma_ssv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0 + ; CHECK-LABEL: name: fma_ssv + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY3]], [[COPY2]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $vgpr0 + %3:_(s32) = G_FMA %0, %1, %2 +... +--- +name: fma_vvs +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr0 + ; CHECK-LABEL: name: fma_vvs + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY3]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $sgpr0 + %3:_(s32) = G_FMA %0, %1, %2 +... +--- +name: fma_vsv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $sgpr0, $vgpr1 + ; CHECK-LABEL: name: fma_vsv + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY3]], [[COPY2]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $vgpr1 + %3:_(s32) = G_FMA %0, %1, %2 +... +--- +name: fma_svv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1 + ; CHECK-LABEL: name: fma_svv + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %3:_(s32) = G_FMA %0, %1, %2 +... +--- +name: fma_vvv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fma_vvv + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = G_FMA %0, %1, %2 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fneg.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fneg.mir new file mode 100644 index 0000000000000000000000000000000000000000..3438275fbe38d371bb957c13c6efe6b17e8dd907 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fneg.mir @@ -0,0 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: fneg_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: fneg_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[FNEG:%[0-9]+]]:sgpr(s32) = G_FNEG [[COPY]] + ; CHECK: $vgpr0 = COPY [[FNEG]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_FNEG %0 + $vgpr0 = COPY %1 +... + +--- +name: fneg_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: fneg_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[FNEG:%[0-9]+]]:vgpr(s32) = G_FNEG [[COPY]] + ; CHECK: $vgpr0 = COPY [[FNEG]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_FNEG %0 + $vgpr0 = COPY %1 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir new file mode 100644 index 0000000000000000000000000000000000000000..77a444d6d14e9264b457c615852d97d6c578937c --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir @@ -0,0 +1,23 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- | + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + define void @test_frame_index_p5() { + %ptr0 = alloca i32, addrspace(5) + ret void + } +... +--- +name: test_frame_index_p5 +legalized: true +stack: + - { id: 0, name: ptr0, offset: 0, size: 4, alignment: 4 } +body: | + bb.0: + ; CHECK-LABEL: name: test_frame_index_p5 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:sgpr(p5) = G_FRAME_INDEX %stack.0.ptr0 + %0:_(p5) = G_FRAME_INDEX %stack.0.ptr0 + +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fsub.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fsub.mir new file mode 100644 index 0000000000000000000000000000000000000000..a924e5ee49769ea7a72e93eb8679dee4b854e556 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fsub.mir @@ -0,0 +1,69 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: fsub_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: fsub_ss + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[FSUB:%[0-9]+]]:vgpr(s32) = G_FSUB [[COPY]], [[COPY2]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = G_FSUB %0, %1 +... + +--- +name: fsub_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: fsub_sv + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[FSUB:%[0-9]+]]:vgpr(s32) = G_FSUB [[COPY]], [[COPY1]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = G_FSUB %0, %1 +... + +--- +name: fsub_vs +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: fsub_vs + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[FSUB:%[0-9]+]]:vgpr(s32) = G_FSUB [[COPY]], [[COPY2]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr0 + %2:_(s32) = G_FSUB %0, %1 +... + +--- +name: fsub_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: fsub_vv + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[FSUB:%[0-9]+]]:vgpr(s32) = G_FSUB [[COPY]], [[COPY1]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_FSUB %0, %1 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir new file mode 100644 index 0000000000000000000000000000000000000000..83d8994d0fe310eaf6cd8d135160d76427e39e36 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: ptrtoint_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: ptrtoint_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK: [[PTRTOINT:%[0-9]+]]:sgpr(p4) = G_PTRTOINT [[COPY]](s64) + %0:_(s64) = COPY $sgpr0_sgpr1 + %1:_(p4) = G_PTRTOINT %0 +... + +--- +name: ptrtoint_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: ptrtoint_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[PTRTOINT:%[0-9]+]]:vgpr(p0) = G_PTRTOINT [[COPY]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(p0) = G_PTRTOINT %0 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir new file mode 100644 index 0000000000000000000000000000000000000000..9c3e2f218d2c45a2ce120dddc3ce08264ea8d419 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: zext_i32_to_i64_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: zext_i32_to_i64_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[SEXT:%[0-9]+]]:sgpr(s64) = G_SEXT [[COPY]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s64) = G_SEXT %0 +... + +--- +name: zext_i32_to_i64_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: zext_i32_to_i64_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[SEXT:%[0-9]+]]:vgpr(s64) = G_SEXT [[COPY]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s64) = G_SEXT %0 +... diff --git a/test/CodeGen/AMDGPU/add3.ll b/test/CodeGen/AMDGPU/add3.ll new file mode 100644 index 0000000000000000000000000000000000000000..35055190b348e628033c1489105a425b31a20afb --- /dev/null +++ b/test/CodeGen/AMDGPU/add3.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s + +; =================================================================================== +; V_ADD3_U32 +; =================================================================================== + +define amdgpu_ps float @add3(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: add3: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add3: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add3_u32 v0, v0, v1, v2 +; GFX9-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = add i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +; ThreeOp instruction variant not used due to Constant Bus Limitations +; TODO: with reassociation it is possible to replace a v_add_u32_e32 with a s_add_i32 +define amdgpu_ps float @add3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) { +; VI-LABEL: add3_vgpr_b: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s3, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add3_vgpr_b: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s3, v0 +; GFX9-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = add i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add3_vgpr_all2(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: add3_vgpr_all2: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add3_vgpr_all2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add3_u32 v0, v1, v2, v0 +; GFX9-NEXT: ; return to shader part epilog + %x = add i32 %b, %c + %result = add i32 %a, %x + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) { +; VI-LABEL: add3_vgpr_bc: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add3_vgpr_bc: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add3_u32 v0, s2, v0, v1 +; GFX9-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = add i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add3_vgpr_const(i32 %a, i32 %b) { +; VI-LABEL: add3_vgpr_const: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add3_vgpr_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add3_u32 v0, v0, v1, 16 +; GFX9-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = add i32 %x, 16 + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps <2 x float> @add3_multiuse_outer(i32 %a, i32 %b, i32 %c, i32 %x) { +; VI-LABEL: add3_multiuse_outer: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mul_lo_i32 v1, v0, v3 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add3_multiuse_outer: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add3_u32 v0, v0, v1, v2 +; GFX9-NEXT: v_mul_lo_i32 v1, v0, v3 +; GFX9-NEXT: ; return to shader part epilog + %inner = add i32 %a, %b + %outer = add i32 %inner, %c + %x1 = mul i32 %outer, %x + %r1 = insertelement <2 x i32> undef, i32 %outer, i32 0 + %r0 = insertelement <2 x i32> %r1, i32 %x1, i32 1 + %bc = bitcast <2 x i32> %r0 to <2 x float> + ret <2 x float> %bc +} + +define amdgpu_ps <2 x float> @add3_multiuse_inner(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: add3_multiuse_inner: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add3_multiuse_inner: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v0, v2 +; GFX9-NEXT: ; return to shader part epilog + %inner = add i32 %a, %b + %outer = add i32 %inner, %c + %r1 = insertelement <2 x i32> undef, i32 %inner, i32 0 + %r0 = insertelement <2 x i32> %r1, i32 %outer, i32 1 + %bc = bitcast <2 x i32> %r0 to <2 x float> + ret <2 x float> %bc +} + +; A case where uniform values end up in VGPRs -- we could use v_add3_u32 here, +; but we don't. +define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float inreg %c) { +; VI-LABEL: add3_uniform_vgpr: +; VI: ; %bb.0: +; VI-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-NEXT: v_add_f32_e64 v0, s2, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s3, 2.0 +; VI-NEXT: v_add_f32_e32 v2, s4, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add3_uniform_vgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-NEXT: v_add_f32_e64 v0, s2, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s3, 2.0 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-NEXT: ; return to shader part epilog + %a1 = fadd float %a, 1.0 + %b2 = fadd float %b, 2.0 + %c3 = fadd float %c, 3.0 + %bc.a = bitcast float %a1 to i32 + %bc.b = bitcast float %b2 to i32 + %bc.c = bitcast float %c3 to i32 + %x = add i32 %bc.a, %bc.b + %result = add i32 %x, %bc.c + %bc = bitcast i32 %result to float + ret float %bc +} diff --git a/test/CodeGen/AMDGPU/add_shl.ll b/test/CodeGen/AMDGPU/add_shl.ll new file mode 100644 index 0000000000000000000000000000000000000000..88ae042e86726785482801232971c458f9bee14a --- /dev/null +++ b/test/CodeGen/AMDGPU/add_shl.ll @@ -0,0 +1,115 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s + +; =================================================================================== +; V_ADD_LSHL_U32 +; =================================================================================== + +define amdgpu_ps float @add_shl(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: add_shl: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_lshl_u32 v0, v0, v1, v2 +; GFX9-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = shl i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add_shl_vgpr_c(i32 inreg %a, i32 inreg %b, i32 %c) { +; VI-LABEL: add_shl_vgpr_c: +; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s2, s2, s3 +; VI-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl_vgpr_c: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX9-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = shl i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add_shl_vgpr_ac(i32 %a, i32 inreg %b, i32 %c) { +; VI-LABEL: add_shl_vgpr_ac: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl_vgpr_ac: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_lshl_u32 v0, v0, s2, v1 +; GFX9-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = shl i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add_shl_vgpr_const(i32 %a, i32 %b) { +; VI-LABEL: add_shl_vgpr_const: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl_vgpr_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_lshl_u32 v0, v0, v1, 9 +; GFX9-NEXT: ; return to shader part epilog + %x = add i32 %a, %b + %result = shl i32 %x, 9 + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) { +; VI-LABEL: add_shl_vgpr_const_inline_const: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7e800, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl_vgpr_const_inline_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e800 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 9, v1 +; GFX9-NEXT: ; return to shader part epilog + %x = add i32 %a, 1012 + %result = shl i32 %x, 9 + %bc = bitcast i32 %result to float + ret float %bc +} + +; TODO: Non-optimal code generation because SelectionDAG combines +; (shl (add x, CONST), y) ---> (add (shl x, y), CONST'). +; +define amdgpu_ps float @add_shl_vgpr_inline_const_x2(i32 %a) { +; VI-LABEL: add_shl_vgpr_inline_const_x2: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x600, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: add_shl_vgpr_inline_const_x2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0x600 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 9, v1 +; GFX9-NEXT: ; return to shader part epilog + %x = add i32 %a, 3 + %result = shl i32 %x, 9 + %bc = bitcast i32 %result to float + ret float %bc +} diff --git a/test/CodeGen/AMDGPU/addrspacecast-captured.ll b/test/CodeGen/AMDGPU/addrspacecast-captured.ll index 138bc36b9e1b8bcaee5f4fb6516ec732841e3889..64ff0e5fac82a412f50b07c345832bbfa625e9f2 100644 --- a/test/CodeGen/AMDGPU/addrspacecast-captured.ll +++ b/test/CodeGen/AMDGPU/addrspacecast-captured.ll @@ -1,45 +1,46 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s - ; Nothing should be done if the addrspacecast is captured. +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + declare void @consume_ptr2int(i32) #0 ; CHECK-LABEL: @addrspacecast_captured( -; CHECK: %data = alloca i32, align 4 -; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* -; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 +; CHECK: %data = alloca i32, align 4, addrspace(5) +; CHECK: %cast = addrspacecast i32 addrspace(5)* %data to i32* +; CHECK: %ptr2int = ptrtoint i32* %cast to i32 ; CHECK: store i32 %ptr2int, i32 addrspace(1)* %out define amdgpu_kernel void @addrspacecast_captured(i32 addrspace(1)* %out) #0 { entry: - %data = alloca i32, align 4 - %cast = addrspacecast i32* %data to i32 addrspace(4)* - %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 + %data = alloca i32, align 4, addrspace(5) + %cast = addrspacecast i32 addrspace(5)* %data to i32* + %ptr2int = ptrtoint i32* %cast to i32 store i32 %ptr2int, i32 addrspace(1)* %out ret void } ; CHECK-LABEL: @addrspacecast_captured_store( -; CHECK: %data = alloca i32, align 4 -; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* -; CHECK: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out -define amdgpu_kernel void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 { +; CHECK: %data = alloca i32, align 4, addrspace(5) +; CHECK: %cast = addrspacecast i32 addrspace(5)* %data to i32* +; CHECK: store i32* %cast, i32* addrspace(1)* %out +define amdgpu_kernel void @addrspacecast_captured_store(i32* addrspace(1)* %out) #0 { entry: - %data = alloca i32, align 4 - %cast = addrspacecast i32* %data to i32 addrspace(4)* - store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out + %data = alloca i32, align 4, addrspace(5) + %cast = addrspacecast i32 addrspace(5)* %data to i32* + store i32* %cast, i32* addrspace(1)* %out ret void } ; CHECK-LABEL: @addrspacecast_captured_call( -; CHECK: %data = alloca i32, align 4 -; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* -; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 +; CHECK: %data = alloca i32, align 4, addrspace(5) +; CHECK: %cast = addrspacecast i32 addrspace(5)* %data to i32* +; CHECK: %ptr2int = ptrtoint i32* %cast to i32 ; CHECK: call void @consume_ptr2int(i32 %ptr2int) define amdgpu_kernel void @addrspacecast_captured_call() #0 { entry: - %data = alloca i32, align 4 - %cast = addrspacecast i32* %data to i32 addrspace(4)* - %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 + %data = alloca i32, align 4, addrspace(5) + %cast = addrspacecast i32 addrspace(5)* %data to i32* + %ptr2int = ptrtoint i32* %cast to i32 call void @consume_ptr2int(i32 %ptr2int) ret void } diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll index 95bbe958e9321ee64bd59e2f4f8bf84e85aaf87e..ea40cda4fa6bed1736133c62d0936af86e40cfc8 100644 --- a/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/test/CodeGen/AMDGPU/addrspacecast.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast: ; HSA: enable_sgpr_private_segment_buffer = 1 diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 023c19915c7a175e441dab83e9301b34b0ad0187..199a96c64435ca7abbb1746abd0ad0005a39f48f 100644 --- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -1,10 +1,10 @@ -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s +; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s +; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding -mattr=-code-object-v3,+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding -mattr=-code-object-v3,-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-code-object-v3,-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s diff --git a/test/CodeGen/AMDGPU/and_or.ll b/test/CodeGen/AMDGPU/and_or.ll new file mode 100644 index 0000000000000000000000000000000000000000..c8ff582976c643b8a4d2ab97ccab0b2b5a27fc7f --- /dev/null +++ b/test/CodeGen/AMDGPU/and_or.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s +;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s + +; =================================================================================== +; V_AND_OR_B32 +; =================================================================================== + +define amdgpu_ps float @and_or(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: and_or: +; VI: ; %bb.0: +; VI-NEXT: v_and_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: and_or: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v2 +; GFX9-NEXT: ; return to shader part epilog + %x = and i32 %a, %b + %result = or i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +; ThreeOp instruction variant not used due to Constant Bus Limitations +define amdgpu_ps float @and_or_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) { +; VI-LABEL: and_or_vgpr_b: +; VI: ; %bb.0: +; VI-NEXT: v_and_b32_e32 v0, s2, v0 +; VI-NEXT: v_or_b32_e32 v0, s3, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: and_or_vgpr_b: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX9-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX9-NEXT: ; return to shader part epilog + %x = and i32 %a, %b + %result = or i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @and_or_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) { +; VI-LABEL: and_or_vgpr_ab: +; VI: ; %bb.0: +; VI-NEXT: v_and_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, s2, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: and_or_vgpr_ab: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_or_b32 v0, v0, v1, s2 +; GFX9-NEXT: ; return to shader part epilog + %x = and i32 %a, %b + %result = or i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @and_or_vgpr_const(i32 %a, i32 %b) { +; VI-LABEL: and_or_vgpr_const: +; VI: ; %bb.0: +; VI-NEXT: v_and_b32_e32 v0, 4, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: and_or_vgpr_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_or_b32 v0, v0, 4, v1 +; GFX9-NEXT: ; return to shader part epilog + %x = and i32 4, %a + %result = or i32 %x, %b + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @and_or_vgpr_const_inline_const(i32 %a) { +; VI-LABEL: and_or_vgpr_const_inline_const: +; VI: ; %bb.0: +; VI-NEXT: v_and_b32_e32 v0, 20, v0 +; VI-NEXT: v_or_b32_e32 v0, 0x808, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: and_or_vgpr_const_inline_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0x808 +; GFX9-NEXT: v_and_or_b32 v0, v0, 20, v1 +; GFX9-NEXT: ; return to shader part epilog + %x = and i32 20, %a + %result = or i32 %x, 2056 + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @and_or_vgpr_inline_const_x2(i32 %a) { +; VI-LABEL: and_or_vgpr_inline_const_x2: +; VI: ; %bb.0: +; VI-NEXT: v_and_b32_e32 v0, 4, v0 +; VI-NEXT: v_or_b32_e32 v0, 1, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: and_or_vgpr_inline_const_x2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_or_b32 v0, v0, 4, 1 +; GFX9-NEXT: ; return to shader part epilog + %x = and i32 4, %a + %result = or i32 %x, 1 + %bc = bitcast i32 %result to float + ret float %bc +} diff --git a/test/CodeGen/AMDGPU/andorbitset.ll b/test/CodeGen/AMDGPU/andorbitset.ll new file mode 100644 index 0000000000000000000000000000000000000000..95bba7d54382f067a50aa51754fe248cb890938a --- /dev/null +++ b/test/CodeGen/AMDGPU/andorbitset.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}s_clear_msb: +; SI: s_bitset0_b32 s{{[0-9]+}}, 31 +define amdgpu_kernel void @s_clear_msb(i32 addrspace(1)* %out, i32 %in) { + %x = and i32 %in, 2147483647 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_set_msb: +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +define amdgpu_kernel void @s_set_msb(i32 addrspace(1)* %out, i32 %in) { + %x = or i32 %in, 2147483648 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_clear_lsb: +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, -2 +define amdgpu_kernel void @s_clear_lsb(i32 addrspace(1)* %out, i32 %in) { + %x = and i32 %in, 4294967294 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_set_lsb: +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 +define amdgpu_kernel void @s_set_lsb(i32 addrspace(1)* %out, i32 %in) { + %x = or i32 %in, 1 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_clear_midbit: +; SI: s_bitset0_b32 s{{[0-9]+}}, 8 +define amdgpu_kernel void @s_clear_midbit(i32 addrspace(1)* %out, i32 %in) { + %x = and i32 %in, 4294967039 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_set_midbit: +; SI: s_bitset1_b32 s{{[0-9]+}}, 8 +define amdgpu_kernel void @s_set_midbit(i32 addrspace(1)* %out, i32 %in) { + %x = or i32 %in, 256 + store i32 %x, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/andorn2.ll b/test/CodeGen/AMDGPU/andorn2.ll new file mode 100644 index 0000000000000000000000000000000000000000..390c103f3676a37ab17a39b790e262f53f4bf103 --- /dev/null +++ b/test/CodeGen/AMDGPU/andorn2.ll @@ -0,0 +1,103 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX600 %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX700 %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX801 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX900 %s + +; GCN-LABEL: {{^}}scalar_andn2_i32_one_use +; GCN: s_andn2_b32 +define amdgpu_kernel void @scalar_andn2_i32_one_use( + i32 addrspace(1)* %r0, i32 %a, i32 %b) { +entry: + %nb = xor i32 %b, -1 + %r0.val = and i32 %a, %nb + store i32 %r0.val, i32 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}scalar_andn2_i64_one_use +; GCN: s_andn2_b64 +define amdgpu_kernel void @scalar_andn2_i64_one_use( + i64 addrspace(1)* %r0, i64 %a, i64 %b) { +entry: + %nb = xor i64 %b, -1 + %r0.val = and i64 %a, %nb + store i64 %r0.val, i64 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}scalar_orn2_i32_one_use +; GCN: s_orn2_b32 +define amdgpu_kernel void @scalar_orn2_i32_one_use( + i32 addrspace(1)* %r0, i32 %a, i32 %b) { +entry: + %nb = xor i32 %b, -1 + %r0.val = or i32 %a, %nb + store i32 %r0.val, i32 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}scalar_orn2_i64_one_use +; GCN: s_orn2_b64 +define amdgpu_kernel void @scalar_orn2_i64_one_use( + i64 addrspace(1)* %r0, i64 %a, i64 %b) { +entry: + %nb = xor i64 %b, -1 + %r0.val = or i64 %a, %nb + store i64 %r0.val, i64 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use +; GCN: v_not_b32 +; GCN: v_and_b32 +define amdgpu_kernel void @vector_andn2_i32_s_v_one_use( + i32 addrspace(1)* %r0, i32 %s) { +entry: + %v = call i32 @llvm.amdgcn.workitem.id.x() #1 + %not = xor i32 %v, -1 + %r0.val = and i32 %s, %not + store i32 %r0.val, i32 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}vector_andn2_i32_v_s_one_use +; GCN: s_not_b32 +; GCN: v_and_b32 +define amdgpu_kernel void @vector_andn2_i32_v_s_one_use( + i32 addrspace(1)* %r0, i32 %s) { +entry: + %v = call i32 @llvm.amdgcn.workitem.id.x() #1 + %not = xor i32 %s, -1 + %r0.val = and i32 %v, %not + store i32 %r0.val, i32 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}vector_orn2_i32_s_v_one_use +; GCN: v_not_b32 +; GCN: v_or_b32 +define amdgpu_kernel void @vector_orn2_i32_s_v_one_use( + i32 addrspace(1)* %r0, i32 %s) { +entry: + %v = call i32 @llvm.amdgcn.workitem.id.x() #1 + %not = xor i32 %v, -1 + %r0.val = or i32 %s, %not + store i32 %r0.val, i32 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}vector_orn2_i32_v_s_one_use +; GCN: s_not_b32 +; GCN: v_or_b32 +define amdgpu_kernel void @vector_orn2_i32_v_s_one_use( + i32 addrspace(1)* %r0, i32 %s) { +entry: + %v = call i32 @llvm.amdgcn.workitem.id.x() #1 + %not = xor i32 %s, -1 + %r0.val = or i32 %v, %not + store i32 %r0.val, i32 addrspace(1)* %r0 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/test/CodeGen/AMDGPU/andorxorinvimm.ll b/test/CodeGen/AMDGPU/andorxorinvimm.ll new file mode 100644 index 0000000000000000000000000000000000000000..a16fd534247ec2e15566736062389ea38e7dbf98 --- /dev/null +++ b/test/CodeGen/AMDGPU/andorxorinvimm.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}s_or_to_orn2: +; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 +define amdgpu_kernel void @s_or_to_orn2(i32 addrspace(1)* %out, i32 %in) { + %x = or i32 %in, -51 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_or_to_orn2_imm0: +; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 +define amdgpu_kernel void @s_or_to_orn2_imm0(i32 addrspace(1)* %out, i32 %in) { + %x = or i32 -51, %in + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_and_to_andn2: +; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 +define amdgpu_kernel void @s_and_to_andn2(i32 addrspace(1)* %out, i32 %in) { + %x = and i32 %in, -51 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_and_to_andn2_imm0: +; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 +define amdgpu_kernel void @s_and_to_andn2_imm0(i32 addrspace(1)* %out, i32 %in) { + %x = and i32 -51, %in + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_xor_to_xnor: +; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 +define amdgpu_kernel void @s_xor_to_xnor(i32 addrspace(1)* %out, i32 %in) { + %x = xor i32 %in, -51 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_xor_to_xnor_imm0: +; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 +define amdgpu_kernel void @s_xor_to_xnor_imm0(i32 addrspace(1)* %out, i32 %in) { + %x = xor i32 -51, %in + store i32 %x, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll new file mode 100644 index 0000000000000000000000000000000000000000..e57ce963e3c61a221b94bb111e1b94435b942708 --- /dev/null +++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll @@ -0,0 +1,145 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+code-object-v3 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -amdgpu-verify-hsa-metadata -filetype=obj -mattr=+code-object-v3 -o /dev/null < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +; CHECK-LABEL: {{^}}min_64_max_64: +; CHECK: SGPRBlocks: 0 +; CHECK: VGPRBlocks: 0 +; CHECK: NumSGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 1 +define amdgpu_kernel void @min_64_max_64() #0 { +entry: + ret void +} +attributes #0 = {"amdgpu-flat-work-group-size"="64,64"} + +; CHECK-LABEL: {{^}}min_64_max_128: +; CHECK: SGPRBlocks: 0 +; CHECK: VGPRBlocks: 0 +; CHECK: NumSGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 1 +define amdgpu_kernel void @min_64_max_128() #1 { +entry: + ret void +} +attributes #1 = {"amdgpu-flat-work-group-size"="64,128"} + +; CHECK-LABEL: {{^}}min_128_max_128: +; CHECK: SGPRBlocks: 0 +; CHECK: VGPRBlocks: 0 +; CHECK: NumSGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 1 +define amdgpu_kernel void @min_128_max_128() #2 { +entry: + ret void +} +attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} + +; CHECK-LABEL: {{^}}min_1024_max_2048 +; CHECK: SGPRBlocks: 1 +; CHECK: VGPRBlocks: 7 +; CHECK: NumSGPRsForWavesPerEU: 12 +; CHECK: NumVGPRsForWavesPerEU: 32 +@var = addrspace(1) global float 0.0 +define amdgpu_kernel void @min_1024_max_2048() #3 { + %val0 = load volatile float, float addrspace(1)* @var + %val1 = load volatile float, float addrspace(1)* @var + %val2 = load volatile float, float addrspace(1)* @var + %val3 = load volatile float, float addrspace(1)* @var + %val4 = load volatile float, float addrspace(1)* @var + %val5 = load volatile float, float addrspace(1)* @var + %val6 = load volatile float, float addrspace(1)* @var + %val7 = load volatile float, float addrspace(1)* @var + %val8 = load volatile float, float addrspace(1)* @var + %val9 = load volatile float, float addrspace(1)* @var + %val10 = load volatile float, float addrspace(1)* @var + %val11 = load volatile float, float addrspace(1)* @var + %val12 = load volatile float, float addrspace(1)* @var + %val13 = load volatile float, float addrspace(1)* @var + %val14 = load volatile float, float addrspace(1)* @var + %val15 = load volatile float, float addrspace(1)* @var + %val16 = load volatile float, float addrspace(1)* @var + %val17 = load volatile float, float addrspace(1)* @var + %val18 = load volatile float, float addrspace(1)* @var + %val19 = load volatile float, float addrspace(1)* @var + %val20 = load volatile float, float addrspace(1)* @var + %val21 = load volatile float, float addrspace(1)* @var + %val22 = load volatile float, float addrspace(1)* @var + %val23 = load volatile float, float addrspace(1)* @var + %val24 = load volatile float, float addrspace(1)* @var + %val25 = load volatile float, float addrspace(1)* @var + %val26 = load volatile float, float addrspace(1)* @var + %val27 = load volatile float, float addrspace(1)* @var + %val28 = load volatile float, float addrspace(1)* @var + %val29 = load volatile float, float addrspace(1)* @var + %val30 = load volatile float, float addrspace(1)* @var + %val31 = load volatile float, float addrspace(1)* @var + %val32 = load volatile float, float addrspace(1)* @var + %val33 = load volatile float, float addrspace(1)* @var + %val34 = load volatile float, float addrspace(1)* @var + %val35 = load volatile float, float addrspace(1)* @var + %val36 = load volatile float, float addrspace(1)* @var + %val37 = load volatile float, float addrspace(1)* @var + %val38 = load volatile float, float addrspace(1)* @var + %val39 = load volatile float, float addrspace(1)* @var + %val40 = load volatile float, float addrspace(1)* @var + + store volatile float %val0, float addrspace(1)* @var + store volatile float %val1, float addrspace(1)* @var + store volatile float %val2, float addrspace(1)* @var + store volatile float %val3, float addrspace(1)* @var + store volatile float %val4, float addrspace(1)* @var + store volatile float %val5, float addrspace(1)* @var + store volatile float %val6, float addrspace(1)* @var + store volatile float %val7, float addrspace(1)* @var + store volatile float %val8, float addrspace(1)* @var + store volatile float %val9, float addrspace(1)* @var + store volatile float %val10, float addrspace(1)* @var + store volatile float %val11, float addrspace(1)* @var + store volatile float %val12, float addrspace(1)* @var + store volatile float %val13, float addrspace(1)* @var + store volatile float %val14, float addrspace(1)* @var + store volatile float %val15, float addrspace(1)* @var + store volatile float %val16, float addrspace(1)* @var + store volatile float %val17, float addrspace(1)* @var + store volatile float %val18, float addrspace(1)* @var + store volatile float %val19, float addrspace(1)* @var + store volatile float %val20, float addrspace(1)* @var + store volatile float %val21, float addrspace(1)* @var + store volatile float %val22, float addrspace(1)* @var + store volatile float %val23, float addrspace(1)* @var + store volatile float %val24, float addrspace(1)* @var + store volatile float %val25, float addrspace(1)* @var + store volatile float %val26, float addrspace(1)* @var + store volatile float %val27, float addrspace(1)* @var + store volatile float %val28, float addrspace(1)* @var + store volatile float %val29, float addrspace(1)* @var + store volatile float %val30, float addrspace(1)* @var + store volatile float %val31, float addrspace(1)* @var + store volatile float %val32, float addrspace(1)* @var + store volatile float %val33, float addrspace(1)* @var + store volatile float %val34, float addrspace(1)* @var + store volatile float %val35, float addrspace(1)* @var + store volatile float %val36, float addrspace(1)* @var + store volatile float %val37, float addrspace(1)* @var + store volatile float %val38, float addrspace(1)* @var + store volatile float %val39, float addrspace(1)* @var + store volatile float %val40, float addrspace(1)* @var + + ret void +} +attributes #3 = {"amdgpu-flat-work-group-size"="1024,2048"} + +; CHECK: amdhsa.kernels: +; CHECK: .max_flat_workgroup_size: 64 +; CHECK: .name: min_64_max_64 +; CHECK: .max_flat_workgroup_size: 128 +; CHECK: .name: min_64_max_128 +; CHECK: .max_flat_workgroup_size: 128 +; CHECK: .name: min_128_max_128 +; CHECK: .max_flat_workgroup_size: 2048 +; CHECK: .name: min_1024_max_2048 +; CHECK: amdhsa.version: +; CHECK: - 1 +; CHECK: - 0 + +; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index 7fe5604c3ec724cbb4d9841f9398490cf072ecbc..2f281cab48cfc36f87ff265cac28a3f1dad5d7ab 100644 --- a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=HSAMD %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=HSAMD %s ; CHECK-LABEL: {{^}}min_64_max_64: ; CHECK: SGPRBlocks: 0 diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll index 72c983d5d9788c881ce541254ba7e4e0a5fe29d4..45ed056567c2e2463d329694a9a8418e82ab089b 100644 --- a/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -444,7 +444,7 @@ endif: ; GCN-NEXT: s_xor_b64 exec, exec, [[TEMP_MASK1]] ; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]] -; GCN: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop +; GCN: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop{{$}} ; GCN: ;;#ASMSTART ; GCN: v_nop_e64 ; GCN: v_nop_e64 diff --git a/test/CodeGen/AMDGPU/byval-frame-setup.ll b/test/CodeGen/AMDGPU/byval-frame-setup.ll index bf2f5d38babc73145f576160fd5a2ae1bb2d9c56..3a48ffdda9ed22deb0b0adc6cc268903ae866e3b 100644 --- a/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -48,8 +48,8 @@ entry: ; GCN: v_readlane_b32 ; GCN-NOT: v_readlane_b32 s32 -; GCN: buffer_load_dword v32, -; GCN: buffer_load_dword v33, +; GCN-DAG: buffer_load_dword v32, +; GCN-DAG: buffer_load_dword v33, ; GCN: s_sub_u32 s32, s32, 0xc00{{$}} ; GCN: s_setpc_b64 define void @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { diff --git a/test/CodeGen/AMDGPU/call-argument-types.ll b/test/CodeGen/AMDGPU/call-argument-types.ll index 84d327b6f37980e9919da43330d60be12dd0a2f8..67614adc9152d6426895dca0eb6be40062182629 100644 --- a/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/test/CodeGen/AMDGPU/call-argument-types.ll @@ -61,11 +61,11 @@ declare void @external_void_func_v16i8(<16 x i8>) #0 ; MESA-DAG: s_mov_b64 s[0:1], s[36:37] -; GCN: v_mov_b32_e32 v0, 1{{$}} +; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4 +; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4 +; GCN-DAG: v_mov_b32_e32 v0, 1{{$}} ; MESA-DAG: s_mov_b64 s[2:3], s[38:39] -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm @@ -123,12 +123,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm: ; MESA-DAG: s_mov_b32 s33, s3{{$}} -; GCN: v_mov_b32_e32 v0, 0x7b -; HSA-DAG: s_mov_b32 s4, s33{{$}} -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4 +; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4 +; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4 +; GCN-DAG: v_mov_b32_e32 v0, 0x7b +; HSA-DAG: s_mov_b32 s4, s33{{$}} ; GCN-DAG: s_mov_b32 s32, s33{{$}} ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -144,11 +144,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: buffer_load_sbyte v0 -; GCN: s_mov_b32 s4, s33 -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4 +; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 +; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4 +; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s3 ; GCN: s_waitcnt vmcnt(0) @@ -165,10 +165,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; HSA-DAG: s_mov_b32 s33, s9{{$}} ; GCN-DAG: buffer_load_ubyte v0 -; GCN: s_mov_b32 s4, s33 -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4 +; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4 +; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4 ; GCN-DAG: s_mov_b32 s32, s33 @@ -197,10 +196,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: buffer_load_sshort v0 -; GCN: s_mov_b32 s4, s33 -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4 +; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4 +; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4 ; GCN-DAG: s_mov_b32 s32, s33 @@ -217,11 +215,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; MESA-DAG: s_mov_b32 s33, s3{{$}} -; GCN-DAG: buffer_load_ushort v0 -; GCN: s_mov_b32 s4, s33 -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4 +; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4 +; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4 ; GCN-DAG: s_mov_b32 s32, s33 @@ -237,11 +233,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm: ; MESA-DAG: s_mov_b32 s33, s3{{$}} -; GCN: v_mov_b32_e32 v0, 42 -; GCN: s_mov_b32 s4, s33 -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4 +; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4 +; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4 +; GCN-DAG: v_mov_b32_e32 v0, 42 +; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -688,7 +684,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; GCN-NOT: s_add_u32 [[SP]] ; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4 ; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8 -; GCN-NEXT: s_swappc_b64 +; GCN: s_swappc_b64 ; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16 ; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20 ; GCN-NOT: s_sub_u32 [[SP]] diff --git a/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/test/CodeGen/AMDGPU/call-graph-register-usage.ll index 21c69d9bee7d0eeb2b69ba366379cd1015d8bca5..c4c30a6675518d1eadcde687a562238ef13d931e 100644 --- a/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s ; Make sure to run a GPU with the SGPR allocation bug. diff --git a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll index 79abb96cccf16221d0011a2512c309f0d5c853ba..5060c0fed1a9bcd7a0a6aa5b24d704de92300b47 100644 --- a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}use_dispatch_ptr: ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 @@ -383,13 +383,12 @@ define void @other_arg_use_workgroup_id_z(i32 %arg0) #1 { ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 0 +; GCN-NOT: s6 ; GCN-DAG: s_mov_b32 s33, s7 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b - -; GCN-NOT: s6 -; GCN: s_mov_b32 s4, s33 -; GCN-NOT: s6 +; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s33 +; GCN-NOT: s6 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { call void @other_arg_use_workgroup_id_x(i32 555) @@ -578,16 +577,16 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 { ; GCN: s_swappc_b64 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_X]] -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_X]] -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Y]] -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Y]] -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Z]] -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Z]] -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 +; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]] +; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]] +; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[LO2:[0-9]+]], s[[LO_Y]] +; GCN-DAG: v_mov_b32_e32 v[[HI2:[0-9]+]], s[[HI_Y]] +; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO2]]:[[HI2]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[LO3:[0-9]+]], s[[LO_Z]] +; GCN-DAG: v_mov_b32_e32 v[[HI3:[0-9]+]], s[[HI_Z]] +; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO3]]:[[HI3]]{{\]}} ; GCN: ; use ; GCN: ; use [[SAVE_X]] ; GCN: ; use [[SAVE_Y]] diff --git a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 7f14a24d6da65eb931c10240d236f1a2c6d43325..750a0203c9bfc4b5a424c317eb84bc140e57b1c7 100644 --- a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt diff --git a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index 04ad3bcccd3f33ebee62871b006af8ed9b7a8479..f6c30f8af1c37244dacda586d2ce8512b1463d3d 100644 --- a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -125,11 +125,11 @@ ret: ; GCN: s_cbranch_scc1 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 -; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0xff +; VI: v_mov_b32_e32 v{{[0-9]+}}, 0xff ; GCN: BB2_2: ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 -; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0x7f +; VI: v_mov_b32_e32 v{{[0-9]+}}, 0x7f ; GCN: BB2_3: ; GCN: buffer_store_short diff --git a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll index 789decb45f27b6727d0699ad225a09e35d707222..faa468c974c0829afaf8aa06dbb65d3928b9d55a 100644 --- a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll +++ b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll @@ -32,7 +32,6 @@ bb2: ; GCN-LABEL: {{^}}preserve_condition_undef_flag: ; GCN-NOT: vcc -; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) { bb0: %tmp = icmp sgt i32 %arg1, 4 @@ -40,7 +39,7 @@ bb0: %tmp4 = select i1 %undef, float %arg, float 1.000000e+00 %tmp5 = fcmp ogt float %arg2, 0.000000e+00 %tmp6 = fcmp olt float %arg2, 1.000000e+00 - %tmp7 = fcmp olt float %arg, %tmp4 + %tmp7 = fcmp olt float %arg, undef %tmp8 = and i1 %tmp5, %tmp6 %tmp9 = and i1 %tmp8, %tmp7 br i1 %tmp9, label %bb1, label %bb2 diff --git a/test/CodeGen/AMDGPU/code-object-v3.ll b/test/CodeGen/AMDGPU/code-object-v3.ll index 3f13cebc1583889b62c3019ec32ea29b4e1adbb4..d84325539e81918b393b8d5a0b000c3ddf654b91 100644 --- a/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/test/CodeGen/AMDGPU/code-object-v3.ll @@ -3,6 +3,8 @@ ; ALL-ASM-LABEL: {{^}}fadd: +; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_version +; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_isa ; OSABI-AMDHSA-ASM-NOT: .amdgpu_hsa_kernel ; OSABI-AMDHSA-ASM-NOT: .amd_kernel_code_t @@ -57,7 +59,8 @@ ; OSABI-AMDHSA-ELF: {{[0-9]+}}: 0000000000000000 64 OBJECT GLOBAL DEFAULT {{[0-9]+}} fadd.kd ; OSABI-AMDHSA-ELF: {{[0-9]+}}: 0000000000000040 64 OBJECT GLOBAL DEFAULT {{[0-9]+}} fsub.kd -; OSABI-AMDHSA-ELF-NOT: Displaying notes found +; OSABI-AMDHSA-ELF: Displaying notes found at file offset +; OSABI-AMDHSA-ELF: AMDGPU 0x{{[0-9a-f]+}} NT_AMDGPU_METADATA (AMDGPU Metadata) define amdgpu_kernel void @fadd( float addrspace(1)* %r, diff --git a/test/CodeGen/AMDGPU/computeKnownBits-scalar-to-vector-crash.ll b/test/CodeGen/AMDGPU/computeKnownBits-scalar-to-vector-crash.ll new file mode 100644 index 0000000000000000000000000000000000000000..06049b77b2136735264895b53bd79adcbe14124e --- /dev/null +++ b/test/CodeGen/AMDGPU/computeKnownBits-scalar-to-vector-crash.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=amdgcn -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s + +; CHECK: s_waitcnt +define <2 x i16> @main(<2 x float>) #0 { + %2 = bitcast <2 x float> %0 to <4 x i16> + %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <2 x i32> + %4 = extractelement <4 x i16> %2, i32 0 + %5 = insertelement <2 x i16> %3, i16 %4, i32 0 + ret <2 x i16> %5 +} + diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll index 09d4b2c8bd7744969280786861f94228761761e2..8611cd080e15d05752ea79ad7a0eb5ffd109a200 100644 --- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll +++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll @@ -107,7 +107,7 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) { ; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} ; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} ; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]] -; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]], v[[VREG1_LO]] +; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) { diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 0b28d8912a3d08b6581aa3155db431b5a7d330cd..d8126fa635af8367d45aa9d06ec7b25906e40094 100644 --- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -36,10 +36,10 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias ; GCN-LABEL: {{^}}load_v3i8_to_v3f32: ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: v_cvt_f32_ubyte3_e32 -; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]] -; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] +; GCN-DAG: v_cvt_f32_ubyte2_e32 v[[HIRESULT:[0-9]+]], [[VAL]] +; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[VAL]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] -; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +; GCN: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid diff --git a/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll index 5bcdaa0fc0abb6b2489b409a528e4fc9aedcdcb7..9e1c58bcef7e280c5cca26d89280073f72246535 100644 --- a/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll +++ b/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll @@ -1,19 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -O0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; GCN-LABEL: {{^}}eq_t: ; GCN-DAG: s_load_dword [[X:s[0-9]+]] -; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]] -; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}} +; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], 1.0{{$}} ; GCN-NOT: 0xddd5 ; GCN-NOT: v_cndmask_b32 ; GCN-NOT: v_cmp_eq_u32 -; GCN-NOT: v_cndmask_b32 -; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]] -; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0 -; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]] -; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC]] +; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], 2.0, 4.0, [[CC]] ; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}} define amdgpu_kernel void @eq_t(float %x) { %c1 = fcmp olt float %x, 1.0 @@ -26,18 +19,11 @@ define amdgpu_kernel void @eq_t(float %x) { ; GCN-LABEL: {{^}}ne_t: ; GCN-DAG: s_load_dword [[X:s[0-9]+]] -; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]] -; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}} +; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], 1.0{{$}} ; GCN-NOT: 0xddd5 ; GCN-NOT: v_cndmask_b32 ; GCN-NOT: v_cmp_eq_u32 -; GCN-NOT: v_cndmask_b32 -; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]] -; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0 -; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]] -; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VFOUR]], [[VTWO]], [[CC]] +; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], 4.0, 2.0, [[CC]] ; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}} define amdgpu_kernel void @ne_t(float %x) { %c1 = fcmp olt float %x, 1.0 @@ -50,18 +36,11 @@ define amdgpu_kernel void @ne_t(float %x) { ; GCN-LABEL: {{^}}eq_f: ; GCN-DAG: s_load_dword [[X:s[0-9]+]] -; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]] -; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}} +; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], 1.0{{$}} ; GCN-NOT: 0xddd5 ; GCN-NOT: v_cndmask_b32 ; GCN-NOT: v_cmp_eq_u32 -; GCN-NOT: v_cndmask_b32 -; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]] -; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0 -; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]] -; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VFOUR]], [[VTWO]], [[CC]] +; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], 4.0, 2.0, [[CC]] ; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}} define amdgpu_kernel void @eq_f(float %x) { %c1 = fcmp olt float %x, 1.0 @@ -74,18 +53,11 @@ define amdgpu_kernel void @eq_f(float %x) { ; GCN-LABEL: {{^}}ne_f: ; GCN-DAG: s_load_dword [[X:s[0-9]+]] -; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]] -; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}} +; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], 1.0{{$}} ; GCN-NOT: 0xddd5 ; GCN-NOT: v_cndmask_b32 ; GCN-NOT: v_cmp_eq_u32 -; GCN-NOT: v_cndmask_b32 -; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]] -; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0 -; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]] -; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC]] +; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], 2.0, 4.0, [[CC]] ; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}} define amdgpu_kernel void @ne_f(float %x) { %c1 = fcmp olt float %x, 1.0 @@ -97,18 +69,8 @@ define amdgpu_kernel void @ne_f(float %x) { } ; GCN-LABEL: {{^}}different_constants: -; GCN-DAG: s_load_dword [[X:s[0-9]+]] -; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]] -; GCN-DAG: v_cmp_lt_f32_e{{32|64}} [[CC1:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}} -; GCN-DAG: v_cndmask_b32_e{{32|64}} [[CND1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]] -; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC2:s\[[0-9]+:[0-9]+\]|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]] -; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0 -; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]] -; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC2]] -; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}} +; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 2.0 +; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}} define amdgpu_kernel void @different_constants(float %x) { %c1 = fcmp olt float %x, 1.0 %s1 = select i1 %c1, i32 56789, i32 1 diff --git a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll index 46d81e5706592cc301a419ec688db11fb225f292..b416537b9f8d818d680a8a430b6529ca547edee6 100644 --- a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll +++ b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll @@ -1,5 +1,5 @@ -; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s --check-prefix=NOATTR +; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck %s --check-prefix=NOATTR target datalayout = "A5" ; CHECK: debug_wavefront_private_segment_offset_sgpr = [[SOFF:[0-9]+]] diff --git a/test/CodeGen/AMDGPU/default-fp-mode.ll b/test/CodeGen/AMDGPU/default-fp-mode.ll index 6867afee1706e3ddeea466e2339d6c764ce0cf9e..94d518e63ce242479c709b76b894b49cf1e950f0 100644 --- a/test/CodeGen/AMDGPU/default-fp-mode.ll +++ b/test/CodeGen/AMDGPU/default-fp-mode.ll @@ -86,12 +86,10 @@ define amdgpu_kernel void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, ; GCN: IeeeMode: 0 define amdgpu_gs void @kill_gs_const() { main_body: - %0 = icmp ule i32 0, 3 - %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %1) - %2 = icmp ule i32 3, 0 - %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %3) + %cmp0 = icmp ule i32 0, 3 + call void @llvm.amdgcn.kill(i1 %cmp0) + %cmp1 = icmp ule i32 3, 0 + call void @llvm.amdgcn.kill(i1 %cmp1) ret void } @@ -100,12 +98,12 @@ main_body: define amdgpu_ps float @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(4)* byval, [17 x <16 x i8>] addrspace(4)* byval, [17 x <4 x i32>] addrspace(4)* byval, [34 x <8 x i32>] addrspace(4)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) { entry: %tmp0 = fcmp olt float %13, 0.0 - call void @llvm.AMDGPU.kill(float %14) + call void @llvm.amdgcn.kill(i1 %tmp0) %tmp1 = select i1 %tmp0, float 1.0, float 0.0 ret float %tmp1 } -declare void @llvm.AMDGPU.kill(float) +declare void @llvm.amdgcn.kill(i1) attributes #0 = { nounwind "target-cpu"="tahiti" } attributes #1 = { nounwind "target-cpu"="fiji" } diff --git a/test/CodeGen/AMDGPU/dpp_combine.ll b/test/CodeGen/AMDGPU/dpp_combine.ll new file mode 100644 index 0000000000000000000000000000000000000000..36356a72a772705fed2eb772e75141b282010dcc --- /dev/null +++ b/test/CodeGen/AMDGPU/dpp_combine.ll @@ -0,0 +1,185 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine -verify-machineinstrs < %s | FileCheck %s + +; VOP2 with literal cannot be combined +; CHECK-LABEL: {{^}}dpp_combine_i32_literal: +; CHECK: v_mov_b32_dpp [[OLD:v[0-9]+]], {{v[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x1 bound_ctrl:0 +; CHECK: v_add_u32_e32 {{v[0-9]+}}, vcc, 42, [[OLD]] +define amdgpu_kernel void @dpp_combine_i32_literal(i32 addrspace(1)* %out, i32 %in) { + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 2, i32 1, i1 1) #0 + %res = add nsw i32 %dpp, 42 + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_bz: +; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_i32_bz(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0 + %res = add nsw i32 %dpp, %x + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_boff_undef: +; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +define amdgpu_kernel void @dpp_combine_i32_boff_undef(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 0) #0 + %res = add nsw i32 %dpp, %x + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_boff_0: +; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_i32_boff_0(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %in, i32 1, i32 1, i32 1, i1 0) #0 + %res = add nsw i32 %dpp, %x + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_boff_max: +; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], -2 +; CHECK: v_max_i32_dpp [[OLD]], {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +define amdgpu_kernel void @dpp_combine_i32_boff_max(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 2147483647, i32 %in, i32 1, i32 1, i32 1, i1 0) #0 + %cmp = icmp sge i32 %dpp, %x + %res = select i1 %cmp, i32 %dpp, i32 %x + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_boff_min: +; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], 1 +; CHECK: v_min_i32_dpp [[OLD]], {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +define amdgpu_kernel void @dpp_combine_i32_boff_min(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 %in, i32 1, i32 1, i32 1, i1 0) #0 + %cmp = icmp sle i32 %dpp, %x + %res = select i1 %cmp, i32 %dpp, i32 %x + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_boff_mul: +; CHECK: v_mul_i32_i24_dpp v0, v3, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +define amdgpu_kernel void @dpp_combine_i32_boff_mul(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 1, i32 %in, i32 1, i32 1, i32 1, i1 0) #0 + + %dpp.shl = shl i32 %dpp, 8 + %dpp.24 = ashr i32 %dpp.shl, 8 + %x.shl = shl i32 %x, 8 + %x.24 = ashr i32 %x.shl, 8 + %res = mul i32 %dpp.24, %x.24 + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_i32_commute: +; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_i32_commute(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 2, i32 1, i32 1, i1 1) #0 + %res = sub nsw i32 %x, %dpp + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_f32: +; CHECK: v_add_f32_dpp {{v[0-9]+}}, {{v[0-9]+}}, v0 quad_perm:[3,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_f32(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 3, i32 1, i32 1, i1 1) #0 + %dpp.f32 = bitcast i32 %dpp to float + %x.f32 = bitcast i32 %x to float + %res.f32 = fadd float %x.f32, %dpp.f32 + %res = bitcast float %res.f32 to i32 + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_test_f32_mods: +; CHECK: v_mul_f32_dpp {{v[0-9]+}}, |{{v[0-9]+}}|, -v0 quad_perm:[0,1,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_test_f32_mods(i32 addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 4, i32 1, i32 1, i1 1) #0 + + %x.f32 = bitcast i32 %x to float + %x.f32.neg = fsub float -0.000000e+00, %x.f32 + + %dpp.f32 = bitcast i32 %dpp to float + %dpp.f32.cmp = fcmp fast olt float %dpp.f32, 0.000000e+00 + %dpp.f32.sign = select i1 %dpp.f32.cmp, float -1.000000e+00, float 1.000000e+00 + %dpp.f32.abs = fmul fast float %dpp.f32, %dpp.f32.sign + + %res.f32 = fmul float %x.f32.neg, %dpp.f32.abs + %res = bitcast float %res.f32 to i32 + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_mac: +; CHECK: v_mac_f32_dpp v0, {{v[0-9]+}}, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_mac(float addrspace(1)* %out, i32 %in) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0 + %dpp.f32 = bitcast i32 %dpp to float + %x.f32 = bitcast i32 %x to float + %y.f32 = bitcast i32 %y to float + + %mult = fmul float %dpp.f32, %y.f32 + %res = fadd float %mult, %x.f32 + store float %res, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_sequence: +define amdgpu_kernel void @dpp_combine_sequence(i32 addrspace(1)* %out, i32 %in, i1 %cmp) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0 + br i1 %cmp, label %bb1, label %bb2 +bb1: +; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 + %resadd = add nsw i32 %dpp, %x + br label %bb3 +bb2: +; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 + %ressub = sub nsw i32 %x, %dpp + br label %bb3 +bb3: + %res = phi i32 [%resadd, %bb1], [%ressub, %bb2] + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}dpp_combine_sequence_negative: +; CHECK: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +define amdgpu_kernel void @dpp_combine_sequence_negative(i32 addrspace(1)* %out, i32 %in, i1 %cmp) { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0 + br i1 %cmp, label %bb1, label %bb2 +bb1: + %resadd = add nsw i32 %dpp, %x + br label %bb3 +bb2: + %ressub = sub nsw i32 2, %dpp ; break seq + br label %bb3 +bb3: + %res = phi i32 [%resadd, %bb1], [%ressub, %bb2] + store i32 %res, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.y() +declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0 + +attributes #0 = { nounwind readnone convergent } diff --git a/test/CodeGen/AMDGPU/dpp_combine_subregs.mir b/test/CodeGen/AMDGPU/dpp_combine_subregs.mir new file mode 100644 index 0000000000000000000000000000000000000000..83f992f492a7bb72b6e1a7a8e0fe69680ee1bb26 --- /dev/null +++ b/test/CodeGen/AMDGPU/dpp_combine_subregs.mir @@ -0,0 +1,143 @@ +# RUN: llc -march=amdgcn -mcpu=tonga -run-pass=gcn-dpp-combine -o - %s | FileCheck %s + +# test if $old definition is correctly tracked through subreg manipulation pseudos + +--- +# CHECK-LABEL: name: mul_old_subreg +# CHECK: %7:vgpr_32 = V_MUL_I32_I24_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec + +name: mul_old_subreg +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: vreg_64 } + - { id: 5, class: vreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$vgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vreg_64 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_e32 42, implicit $exec + %4 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %5 = INSERT_SUBREG %4, %1, %subreg.sub1 ; %5.sub0 is taken from %4 + %6:vgpr_32 = V_MOV_B32_dpp %5.sub0, %1, 1, 1, 1, 0, implicit $exec + %7:vgpr_32 = V_MUL_I32_I24_e32 %6, %0.sub1, implicit $exec +... + +# CHECK-LABEL: name: add_old_subreg +# CHECK: [[OLD:\%[0-9]+]]:vgpr_32 = IMPLICIT_DEF +# CHECK: %5:vgpr_32 = V_ADD_U32_dpp [[OLD]], %1, %0.sub1, 1, 1, 1, 1, implicit $exec + +name: add_old_subreg +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: vgpr_32 } + +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$vgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vreg_64 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %3:vreg_64 = INSERT_SUBREG %0, %2, %subreg.sub1 ; %3.sub1 is inserted + %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec + %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec +... + +# CHECK-LABEL: name: add_old_subreg_undef +# CHECK: %5:vgpr_32 = V_ADD_U32_dpp %3.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec + +name: add_old_subreg_undef +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: vgpr_32 } + +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$vgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vreg_64 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %3:vreg_64 = REG_SEQUENCE %2, %subreg.sub0 ; %3.sub1 is undef + %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec + %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec +... + +# CHECK-LABEL: name: add_f32_e64 +# CHECK: %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec +# CHECK: %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec +# CHECK: %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 1, 1, 1, implicit $exec +# CHECK: %7:vgpr_32 = V_ADD_F32_dpp %2, 1, %1, 2, %0, 1, 1, 1, 1, implicit $exec +# CHECK: %9:vgpr_32 = V_ADD_F32_e64 4, %8, 8, %0, 0, 0, implicit $exec + +name: add_f32_e64 +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: vgpr_32 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + - { id: 8, class: vgpr_32 } + - { id: 9, class: vgpr_32 } + +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$vgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec + + ; this shouldn't be combined as omod is set + %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec + + %5:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec + + ; this should be combined as all modifiers are default + %6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $exec + + ; this should be combined as modifiers other than abs|neg are default + %7:vgpr_32 = V_ADD_F32_e64 1, %5, 2, %0, 0, 0, implicit $exec + + %8:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec + + ; this shouldn't be combined as modifiers aren't abs|neg + %9:vgpr_32 = V_ADD_F32_e64 4, %8, 8, %0, 0, 0, implicit $exec +... diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll index 7cb070c12b65df4f44069de79814992893004971..03436f6b3a3c2ce048451362950b3f385076313f 100644 --- a/test/CodeGen/AMDGPU/ds_write2.ll +++ b/test/CodeGen/AMDGPU/ds_write2.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 @@ -31,8 +31,8 @@ define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, flo ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 @@ -177,8 +177,8 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 @@ -362,8 +362,8 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll index 54f2500afab4f7a366b4f042dd7b1f6e83d30a92..2a405352f25352f86c19acc8b9d313f4446d22ee 100644 --- a/test/CodeGen/AMDGPU/ds_write2st64.ll +++ b/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s @lds = addrspace(3) global [512 x float] undef, align 4 @@ -30,8 +30,8 @@ define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} @@ -59,8 +59,8 @@ define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] @@ -87,8 +87,8 @@ define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrsp ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}} ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll index 626a6e2c5b82854815e57ab6bf6e31300616c660..b8bb10632acb61d6e81db8b39b275ffaa06a7d03 100644 --- a/test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -60,8 +60,7 @@ endif: ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc -; GCN-DAG: buffer_store_dword v -; GCN-DAG: buffer_store_dwordx2 +; GCN-DAG: buffer_store_dwordx3 define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <3 x i32>, <3 x i32> addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/elf-notes.ll b/test/CodeGen/AMDGPU/elf-notes.ll index b81292bfdb96d8e15edf7c1c6602722d217d5b12..43e569de65cfc110307157bc23157f1f0e56d41e 100644 --- a/test/CodeGen/AMDGPU/elf-notes.ll +++ b/test/CodeGen/AMDGPU/elf-notes.ll @@ -1,12 +1,12 @@ -; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s -; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s -; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ELF --check-prefix=GFX802 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ELF --check-prefix=GFX802 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=iceland < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -filetype=obj < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ELF --check-prefix=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK --check-prefix=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-unknown -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=GCN --check-prefix=OSABI-UNK-ELF --check-prefix=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA --check-prefix=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=GCN --check-prefix=OSABI-HSA-ELF --check-prefix=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=iceland -mattr=-code-object-v3 < %s | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL --check-prefix=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx802 -filetype=obj -mattr=-code-object-v3 < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=GCN --check-prefix=OSABI-PAL-ELF --check-prefix=GFX802 %s ; RUN: llc -march=r600 < %s | FileCheck --check-prefix=R600 %s ; OSABI-UNK-NOT: .hsa_code_object_version diff --git a/test/CodeGen/AMDGPU/elf.metadata.ll b/test/CodeGen/AMDGPU/elf.metadata.ll new file mode 100644 index 0000000000000000000000000000000000000000..097310a167392b43a3b2dbf395d9d37285091bd5 --- /dev/null +++ b/test/CodeGen/AMDGPU/elf.metadata.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -march=amdgcn -mcpu=fiji -filetype=obj | llvm-readobj -symbols -s -sd - | FileCheck %s + +; CHECK: Section { +; CHECK: Name: .AMDGPU.metadata.info_1 +; CHECK: Type: SHT_PROGBITS (0x1) +; CHECK: Flags [ (0x0) +; CHECK: Size: 16 +; CHECK: SectionData ( +; CHECK: 0000: 414D4431 414D4431 414D4431 414D4431 |AMD1AMD1AMD1AMD1| +; CHECK: ) +; CHECK: } + +; CHECK: Section { +; CHECK: Name: .AMDGPU.metadata.info_2 +; CHECK: Type: SHT_PROGBITS (0x1) +; CHECK: Flags [ (0x0) +; CHECK: Size: 16 +; CHECK: SectionData ( +; CHECK: 0000: 414D4432 414D4432 414D4432 414D4432 |AMD2AMD2AMD2AMD2| +; CHECK: ) +; CHECK: } + +; CHECK: Section { +; CHECK: Name: .AMDGPU.metadata.info_3 +; CHECK: Type: SHT_PROGBITS (0x1) +; CHECK: Flags [ (0x0) +; CHECK: Size: 16 +; CHECK: SectionData ( +; CHECK: 0000: 414D4433 414D4433 414D4433 414D4433 |AMD3AMD3AMD3AMD3| +; CHECK: ) +; CHECK: } + +; CHECK: Symbol { +; CHECK: Name: metadata_info_var_1 +; CHECK: Size: 16 +; CHECK: Binding: Local +; CHECK: Section: .AMDGPU.metadata.info_1 +; CHECK: } + +; CHECK: Symbol { +; CHECK: Name: metadata_info_var_2 +; CHECK: Size: 16 +; CHECK: Binding: Global +; CHECK: Section: .AMDGPU.metadata.info_2 +; CHECK: } + +; CHECK: Symbol { +; CHECK: Name: metadata_info_var_3 +; CHECK: Size: 16 +; CHECK: Binding: Global +; CHECK: Section: .AMDGPU.metadata.info_3 +; CHECK: } + +@metadata_info_var_1 = internal global [4 x i32][i32 826559809, i32 826559809, i32 826559809, i32 826559809], align 1, section ".AMDGPU.metadata.info_1" +@metadata_info_var_2 = constant [4 x i32][i32 843337025, i32 843337025, i32 843337025, i32 843337025], align 1, section ".AMDGPU.metadata.info_2" +@metadata_info_var_3 = global [4 x i32][i32 860114241, i32 860114241, i32 860114241, i32 860114241], align 1, section ".AMDGPU.metadata.info_3" diff --git a/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/test/CodeGen/AMDGPU/extract_vector_dynelt.ll new file mode 100644 index 0000000000000000000000000000000000000000..a2b68cc4214a9a24a3d19bfafa9e38aa77541a34 --- /dev/null +++ b/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -0,0 +1,321 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s + +; GCN-LABEL: {{^}}float4_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2.0, [[V1]], [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4.0, [[V2]], [[C3]] +; GCN: store_dword v[{{[0-9:]+}}], [[V3]] +define amdgpu_kernel void @float4_extelt(float addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <4 x float> , i32 %sel + store float %ext, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}int4_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2, [[V1]], [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4, [[V2]], [[C3]] +; GCN: store_dword v[{{[0-9:]+}}], [[V3]] +define amdgpu_kernel void @int4_extelt(i32 addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <4 x i32> , i32 %sel + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}double4_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3 +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]] +; GCN: store_dwordx2 v[{{[0-9:]+}}] +define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <4 x double> , i32 %sel + store double %ext, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}half4_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00 +; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200 +; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 4 +; GCN: s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]] +; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]] +; GCN: store_short v[{{[0-9:]+}}], v[[VRL]] +define amdgpu_kernel void @half4_extelt(half addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <4 x half> , i32 %sel + store half %ext, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}float2_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]] +; GCN: store_dword v[{{[0-9:]+}}], [[V1]] +define amdgpu_kernel void @float2_extelt(float addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <2 x float> , i32 %sel + store float %ext, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}double2_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] +; GCN: store_dwordx2 v[{{[0-9:]+}}] +define amdgpu_kernel void @double2_extelt(double addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <2 x double> , i32 %sel + store double %ext, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}half8_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 +; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4 +; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5 +; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6 +; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7 +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]] +; GCN: store_short v[{{[0-9:]+}}], [[V7]] +define amdgpu_kernel void @half8_extelt(half addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <8 x half> , i32 %sel + store half %ext, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}short8_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 +; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4 +; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5 +; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6 +; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7 +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]] +; GCN: store_short v[{{[0-9:]+}}], [[V7]] +define amdgpu_kernel void @short8_extelt(i16 addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <8 x i16> , i32 %sel + store i16 %ext, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}float8_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 +; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4 +; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5 +; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6 +; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7 +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]] +; GCN: store_dword v[{{[0-9:]+}}], [[V7]] +define amdgpu_kernel void @float8_extelt(float addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <8 x float> , i32 %sel + store float %ext, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}float16_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: s_mov_b32 m0, +; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41100000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000 +; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]] +; GCN: store_dword v[{{[0-9:]+}}], [[RES]] +define amdgpu_kernel void @float16_extelt(float addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <16 x float> , i32 %sel + store float %ext, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}double16_extelt: +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: store_dword +define amdgpu_kernel void @double16_extelt(double addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <16 x double> , i32 %sel + store double %ext, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}byte8_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x4030201 +; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x8070605 +; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 3 +; GCN: s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]] +; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]] +; GCN: store_byte v[{{[0-9:]+}}], v[[VRL]] +define amdgpu_kernel void @byte8_extelt(i8 addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <8 x i8> , i32 %sel + store i8 %ext, i8 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}byte16_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3 +; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4 +; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5 +; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6 +; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7 +; GCN-DAG: v_cmp_ne_u32_e64 [[C8:[^,]+]], [[IDX]], 8 +; GCN-DAG: v_cmp_ne_u32_e64 [[C9:[^,]+]], [[IDX]], 9 +; GCN-DAG: v_cmp_ne_u32_e64 [[C10:[^,]+]], [[IDX]], 10 +; GCN-DAG: v_cmp_ne_u32_e64 [[C11:[^,]+]], [[IDX]], 11 +; GCN-DAG: v_cmp_ne_u32_e64 [[C12:[^,]+]], [[IDX]], 12 +; GCN-DAG: v_cmp_ne_u32_e64 [[C13:[^,]+]], [[IDX]], 13 +; GCN-DAG: v_cmp_ne_u32_e64 [[C14:[^,]+]], [[IDX]], 14 +; GCN-DAG: v_cmp_ne_u32_e64 [[C15:[^,]+]], [[IDX]], 15 +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V8:v[0-9]+]], {{[^,]+}}, [[V7]], [[C8]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V9:v[0-9]+]], {{[^,]+}}, [[V8]], [[C8]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V10:v[0-9]+]], {{[^,]+}}, [[V9]], [[C10]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V11:v[0-9]+]], {{[^,]+}}, [[V10]], [[C11]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V12:v[0-9]+]], {{[^,]+}}, [[V11]], [[C12]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V13:v[0-9]+]], {{[^,]+}}, [[V12]], [[C13]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V14:v[0-9]+]], {{[^,]+}}, [[V13]], [[C14]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V15:v[0-9]+]], {{[^,]+}}, [[V14]], [[C15]] +; GCN: store_byte v[{{[0-9:]+}}], [[V15]] +define amdgpu_kernel void @byte16_extelt(i8 addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <16 x i8> , i32 %sel + store i8 %ext, i8 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}bit4_extelt: +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN-DAG: buffer_store_byte [[ZERO]], +; GCN-DAG: buffer_store_byte [[ONE]], +; GCN-DAG: buffer_store_byte [[ZERO]], +; GCN-DAG: buffer_store_byte [[ONE]], +; GCN: buffer_load_ubyte [[LOAD:v[0-9]+]], +; GCN: v_and_b32_e32 [[RES:v[0-9]+]], 1, [[LOAD]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[RES]] +define amdgpu_kernel void @bit4_extelt(i32 addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <4 x i1> , i32 %sel + %zext = zext i1 %ext to i32 + store i32 %zext, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}bit128_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 1, 0, +; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f +; GCN-DAG: v_cmp_ne_u32_e32 [[CL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} [[VL:v[0-9]+]], 0, v{{[0-9]+}}, [[CL]] +; GCN: v_and_b32_e32 [[RES:v[0-9]+]], 1, [[VL]] +; GCN: store_dword v[{{[0-9:]+}}], [[RES]] +define amdgpu_kernel void @bit128_extelt(i32 addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <128 x i1> , i32 %sel + %zext = zext i1 %ext to i32 + store i32 %zext, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll index 203c813f4fb452e5f5ab1556444dd62999f7959e..1dc85cab8fc8a622378699af78d73b927882bb7e 100644 --- a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll +++ b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; GCN-LABEL: {{^}}extract_vector_elt_v3f64_2: ; GCN: buffer_load_dwordx4 @@ -13,6 +13,14 @@ define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64: +; GCN-NOT: buffer_load +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x double> %foo, i32 %elt store volatile double %dynelt, double addrspace(1)* %out @@ -20,6 +28,17 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %ou } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64: +; GCN-NOT: buffer_load +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3 +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] +; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <4 x double> %foo, i32 %elt store volatile double %dynelt, double addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll index 96c16d320153a21a9990b9e50b2e2db9c499b8ef..62144ba0437632b8fb6aca8ef32b08f6970aaae5 100644 --- a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll +++ b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll @@ -30,6 +30,11 @@ define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64: +; GCN-NOT: buffer_load +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <2 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out @@ -37,6 +42,12 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2: +; GCN: buffer_load_dwordx4 +; GCN-NOT: buffer_load +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 { %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo %or = or <2 x i64> %load, %arst @@ -46,6 +57,14 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64: +; GCN-NOT: buffer_load +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out @@ -53,6 +72,17 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64: +; GCN-NOT: buffer_load +; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3 +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] +; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] +; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <4 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fabs.ll b/test/CodeGen/AMDGPU/fabs.ll index ba72969b82617e16c452934a0cdbaa38e83b7046..f96019dba6dcc99ea446106959b1f2206ae5ec40 100644 --- a/test/CodeGen/AMDGPU/fabs.ll +++ b/test/CodeGen/AMDGPU/fabs.ll @@ -11,7 +11,8 @@ ; R600-NOT: AND ; R600: |PV.{{[XYZW]}}| -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff +; VI: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float %fabs = call float @fabs(float %bc) @@ -23,7 +24,8 @@ define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) { ; R600-NOT: AND ; R600: |PV.{{[XYZW]}}| -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff +; VI: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) @@ -34,7 +36,8 @@ define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) { ; FUNC-LABEL: {{^}}s_fabs_f32: ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff +; VI: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) { %fabs = call float @llvm.fabs.f32(float %in) store float %fabs, float addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll index ba26ed2383296e34057dda632980f0c73e45a13a..da852af3f2303dd87b4e3550e663ffcc5b8a71a8 100644 --- a/test/CodeGen/AMDGPU/fceil64.ll +++ b/test/CodeGen/AMDGPU/fceil64.ll @@ -17,8 +17,7 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone ; are not always followed. ; SI-DAG: s_add_i32 [[SEXP0:s[0-9]+]], [[SEXP]], 0xfffffc01 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP0]] -; SI-DAG: s_not_b64 -; SI-DAG: s_and_b64 +; SI-DAG: s_andn2_b64 ; SI-DAG: cmp_gt_i32 ; SI-DAG: cndmask_b32 ; SI-DAG: cndmask_b32 diff --git a/test/CodeGen/AMDGPU/fdot2.ll b/test/CodeGen/AMDGPU/fdot2.ll index 35364d340601e86c576d4b3db3052d7a4aa9c698..da573abe86a52bfe83c2a6b4a08ddda5b50a9ff5 100644 --- a/test/CodeGen/AMDGPU/fdot2.ll +++ b/test/CodeGen/AMDGPU/fdot2.ll @@ -8,16 +8,16 @@ ; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions ; are not converted from f16 to f32. ; GCN-LABEL: {{^}}dotproduct_f16 -; GFX900: v_fma_legacy_f16 -; GCN900: v_fma_legacy_f16 +; GFX900: v_fma_f16 +; GFX900: v_fma_f16 ; GFX906: v_mul_f16_e32 ; GFX906: v_mul_f16_e32 -; GFX906-UNSAFE: v_fma_legacy_f16 +; GFX906-UNSAFE: v_fma_f16 ; GFX906-CONTRACT: v_mac_f16_e32 -; GFX906-DENORM-CONTRACT: v_fma_legacy_f16 +; GFX906-DENORM-CONTRACT: v_fma_f16 define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1, <2 x half> addrspace(1)* %src2, half addrspace(1)* nocapture %dst) { @@ -45,7 +45,7 @@ entry: ; and the vectors are of type <2 x half> ; GCN-LABEL: {{^}}dotproduct_f16_f32 ; GFX900: v_mad_mix_f32 -; GCN900: v_mad_mix_f32 +; GFX900: v_mad_mix_f32 ; GFX906: v_mad_f32 ; GFX906: v_mac_f32_e32 @@ -85,7 +85,7 @@ entry: ; and the vectors are of type <2 x half> ; GCN-LABEL: {{^}}dotproduct_diffvecorder ; GFX900: v_mad_mix_f32 -; GCN900: v_mad_mix_f32 +; GFX900: v_mad_mix_f32 ; GFX906: v_mad_f32 ; GFX906: v_mac_f32_e32 @@ -159,7 +159,7 @@ entry: ; GCN-LABEL: {{^}}NotAdotproduct ; GFX900: v_mad_mix_f32 -; GCN900: v_mad_mix_f32 +; GFX900: v_mad_mix_f32 ; GFX906: v_mad_f32 ; GFX906: v_mac_f32_e32 @@ -196,7 +196,7 @@ entry: ; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct ; GFX900: v_mad_mix_f32 -; GCN900: v_mad_mix_f32 +; GFX900: v_mad_mix_f32 ; GFX906: v_mad_f32 ; GFX906: v_mac_f32_e32 @@ -229,4 +229,4 @@ entry: %acc2 = fadd float %mul1, %acc1 store float %acc2, float addrspace(1)* %dst, align 4 ret void -} \ No newline at end of file +} diff --git a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll index b2ac534a7d655d82acc96c90e2de5b0d43a8649f..17f557b3a6cd59605f49b6672f848f6830dd7a9a 100644 --- a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll +++ b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=HSA-NOADDR64 -check-prefix=ALL %s ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll index a7664c399fbb0c505ffa1d82e71a65b4a0fd7f32..38909d3e3e9c1a0840fe70b1a5753694ec6a66c9 100644 --- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -7,9 +7,9 @@ ; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-XNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-XNACK -check-prefix=GCN %s ; GCN-LABEL: {{^}}no_vcc_no_flat: ; HSA-CI: is_xnack_enabled = 0 diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll index c72dab08e38676b5c91c3548582bc5907d20de06..0ff5d9652c1047bada4f4b480d4884e52732e92c 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: @@ -35,6 +35,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x ; R600: -PV ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; VI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) diff --git a/test/CodeGen/AMDGPU/fpext-free.ll b/test/CodeGen/AMDGPU/fpext-free.ll index 0a504b3e03e4e833ec1a9bfd2b7b7e31d3b597ca..1bd4caa180a4b853e0d4f9f9f5c04d1266f9e7aa 100644 --- a/test/CodeGen/AMDGPU/fpext-free.ll +++ b/test/CodeGen/AMDGPU/fpext-free.ll @@ -171,7 +171,7 @@ entry: ; GCN-LABEL: {{^}}fadd_fpext_fmuladd_f16_to_f32: ; GFX9: v_mul_f16 -; GFX9: v_fma_legacy_f16 +; GFX9: v_fma_f16 ; GFX9: v_cvt_f32_f16 ; GFX9: v_add_f32_e32 define float @fadd_fpext_fmuladd_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 { @@ -185,7 +185,7 @@ entry: ; GCN-LABEL: {{^}}fadd_fpext_fma_f16_to_f32: ; GFX9: v_mul_f16 -; GFX9: v_fma_legacy_f16 +; GFX9: v_fma_f16 ; GFX9: v_cvt_f32_f16 ; GFX9: v_add_f32_e32 define float @fadd_fpext_fma_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 { @@ -199,7 +199,7 @@ entry: ; GCN-LABEL: {{^}}fadd_fpext_fma_f16_to_f32_commute: ; GFX9: v_mul_f16 -; GFX9: v_fma_legacy_f16 +; GFX9: v_fma_f16 ; GFX9: v_cvt_f32_f16 ; GFX9: v_add_f32_e32 define float @fadd_fpext_fma_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 { @@ -322,7 +322,7 @@ entry: ; GCN-LABEL: {{^}}fsub_fpext_muladd_mul_f16_to_f32: ; GFX9: v_mul_f16 -; GFX9: v_fma_legacy_f16 +; GFX9: v_fma_f16 ; GFX9: v_cvt_f32_f16 ; GFX9: v_sub_f32 ; GCN: s_setpc_b64 @@ -363,7 +363,7 @@ entry: ; GCN-LABEL: {{^}}fsub_fpext_muladd_mul_f16_to_f32_commute: ; GCN: s_waitcnt ; GFX9-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX9-NEXT: v_fma_legacy_f16 v1, v1, v2, v3 +; GFX9-NEXT: v_fma_f16 v1, v1, v2, v3 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 diff --git a/test/CodeGen/AMDGPU/frame-index-elimination.ll b/test/CodeGen/AMDGPU/frame-index-elimination.ll index 44a69185f43c65e58d4d5891b35996159e826e99..114710d1e92a2d93d43040b3eb4a79f451b5007f 100644 --- a/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -167,13 +167,13 @@ ret: ; GCN-DAG: s_movk_i32 s6, 0x204 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 -; CI: v_add_i32_e64 v0, s[6:7], s6, [[SCALED]] +; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s[6:7], s6, [[SCALED]] ; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 -; GFX9: v_add_u32_e32 v0, s6, [[SCALED]] +; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], s6, [[SCALED]] -; GCN: v_mul_lo_i32 v0, v0, 9 -; GCN: ds_write_b32 v0, v0 +; GCN: v_mul_lo_i32 [[VZ]], [[VZ]], 9 +; GCN: ds_write_b32 v0, [[VZ]] define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { %alloca0 = alloca [128 x i32], align 4, addrspace(5) %alloca1 = alloca [8 x i32], align 4, addrspace(5) @@ -191,13 +191,13 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { ; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x204 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6 -; CI: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]] +; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]] ; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]] -; GFX9: v_add_u32_e32 v0, [[OFFSET]], [[SCALED]] +; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[OFFSET]], [[SCALED]] -; GCN: v_mul_lo_i32 v0, v0, 9 -; GCN: ds_write_b32 v0, v0 +; GCN: v_mul_lo_i32 [[VZ]], [[VZ]], 9 +; GCN: ds_write_b32 v0, [[VZ]] define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 { %alloca0 = alloca [128 x i32], align 4, addrspace(5) %alloca1 = alloca [8 x i32], align 4, addrspace(5) diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll index 6fc4c8b7d243b737f768a6cadef22df56658f1cb..226125335c38dfc3fd4c5432f16a5e402c459e89 100644 --- a/test/CodeGen/AMDGPU/ftrunc.f64.ll +++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -27,8 +27,7 @@ define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrsp ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 ; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]] -; SI-DAG: s_not_b64 -; SI-DAG: s_and_b64 +; SI-DAG: s_andn2_b64 ; SI-DAG: cmp_gt_i32 ; SI-DAG: cndmask_b32 ; SI-DAG: cndmask_b32 diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll index 7fb47e08ea58c51b6e2d36c7e59b8b4952033a96..b2fd9f6acf9d95939ec929d20c1f5822f52180ec 100644 --- a/test/CodeGen/AMDGPU/gep-address-space.ll +++ b/test/CodeGen/AMDGPU/gep-address-space.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @use_gep_address_space([1024 x i32] addrspace(3)* %arr ; CHECK-LABEL: {{^}}use_gep_address_space_large_offset: ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on ; SI, which is why it is being OR'd with the base pointer. -; SI: s_or_b32 +; SI: s_bitset1_b32 ; CI: s_add_i32 ; CHECK: ds_write_b32 define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { diff --git a/test/CodeGen/AMDGPU/gfx902-without-xnack.ll b/test/CodeGen/AMDGPU/gfx902-without-xnack.ll index 445e112a3016a40711333714460076ed47af999e..8577382cff5a303f433423eae1d4ed0b206a886f 100644 --- a/test/CodeGen/AMDGPU/gfx902-without-xnack.ll +++ b/test/CodeGen/AMDGPU/gfx902-without-xnack.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-xnack < %s | FileCheck %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-code-object-v3,-xnack < %s | FileCheck %s ; CHECK: .hsa_code_object_isa 9,0,2,"AMD","AMDGPU" define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { diff --git a/test/CodeGen/AMDGPU/global-load-store-atomics.mir b/test/CodeGen/AMDGPU/global-load-store-atomics.mir new file mode 100644 index 0000000000000000000000000000000000000000..8ac7c3e14a172d337e706fd662208de6474f4ff8 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-load-store-atomics.mir @@ -0,0 +1,249 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-fixup-vector-isel -amdgpu-enable-global-sgpr-addr %s -o - | FileCheck -check-prefix=GCN %s + +# Coverage tests for GLOBAL_* to their _SADDR equivalent. + +# GCN-LABEL: name: global_load_store_atomics +# GCN: GLOBAL_LOAD_DWORD_SADDR +# GCN: GLOBAL_STORE_DWORD_SADDR +# GCN: GLOBAL_LOAD_DWORDX2_SADDR +# GCN: GLOBAL_STORE_DWORDX2_SADDR +# GCN: GLOBAL_LOAD_DWORDX3_SADDR +# GCN: GLOBAL_STORE_DWORDX3_SADDR +# GCN: GLOBAL_LOAD_DWORDX4_SADDR +# GCN: GLOBAL_STORE_DWORDX4_SADDR +# GCN: GLOBAL_LOAD_SSHORT_SADDR +# GCN: GLOBAL_STORE_SHORT_SADDR +# GCN: GLOBAL_LOAD_USHORT_SADDR +# GCN: GLOBAL_STORE_SHORT_SADDR +# GCN: GLOBAL_LOAD_UBYTE_SADDR +# GCN: GLOBAL_STORE_BYTE_SADDR +# GCN: GLOBAL_LOAD_SBYTE_SADDR +# GCN: GLOBAL_STORE_BYTE_SADDR +# GCN: GLOBAL_LOAD_SBYTE_D16_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_UBYTE_D16_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SBYTE_D16_HI_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_UBYTE_D16_HI_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SHORT_D16_HI_SADDR +# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SHORT_D16_SADDR +# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR + +# GCN: GLOBAL_ATOMIC_XOR_SADDR_RTN +# GCN: GLOBAL_ATOMIC_XOR_SADDR % +# GCN: GLOBAL_ATOMIC_SMIN_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SMIN_SADDR % +# GCN: GLOBAL_ATOMIC_AND_SADDR_RTN +# GCN: GLOBAL_ATOMIC_AND_SADDR % +# GCN: GLOBAL_ATOMIC_SWAP_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SWAP_SADDR % +# GCN: GLOBAL_ATOMIC_SMAX_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SMAX_SADDR % +# GCN: GLOBAL_ATOMIC_UMIN_SADDR_RTN +# GCN: GLOBAL_ATOMIC_UMIN_SADDR % +# GCN: GLOBAL_ATOMIC_UMAX_SADDR_RTN +# GCN: GLOBAL_ATOMIC_UMAX_SADDR % +# GCN: GLOBAL_ATOMIC_OR_SADDR_RTN +# GCN: GLOBAL_ATOMIC_OR_SADDR % +# GCN: GLOBAL_ATOMIC_ADD_SADDR_RTN +# GCN: GLOBAL_ATOMIC_ADD_SADDR % +# GCN: GLOBAL_ATOMIC_SUB_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SUB_SADDR % +# GCN: GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN +# GCN: GLOBAL_ATOMIC_CMPSWAP_SADDR % +# GCN: GLOBAL_ATOMIC_INC_SADDR_RTN +# GCN: GLOBAL_ATOMIC_INC_SADDR % +# GCN: GLOBAL_ATOMIC_DEC_SADDR_RTN +# GCN: GLOBAL_ATOMIC_DEC_SADDR % + +# GCN: GLOBAL_ATOMIC_OR_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_OR_X2_SADDR % +# GCN: GLOBAL_ATOMIC_XOR_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_XOR_X2_SADDR % +# GCN: GLOBAL_ATOMIC_AND_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_AND_X2_SADDR % +# GCN: GLOBAL_ATOMIC_ADD_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_ADD_X2_SADDR % +# GCN: GLOBAL_ATOMIC_SUB_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SUB_X2_SADDR % +# GCN: GLOBAL_ATOMIC_DEC_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_DEC_X2_SADDR % +# GCN: GLOBAL_ATOMIC_INC_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_INC_X2_SADDR % +# GCN: GLOBAL_ATOMIC_SMIN_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SMIN_X2_SADDR % +# GCN: GLOBAL_ATOMIC_SWAP_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SWAP_X2_SADDR % +# GCN: GLOBAL_ATOMIC_SMAX_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SMAX_X2_SADDR % +# GCN: GLOBAL_ATOMIC_UMIN_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_UMIN_X2_SADDR % +# GCN: GLOBAL_ATOMIC_UMAX_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_UMAX_X2_SADDR % +# GCN: GLOBAL_ATOMIC_CMPSWAP_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_CMPSWAP_X2_SADDR % + +name: global_load_store_atomics +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 36, 0 :: (dereferenceable invariant load 8 ) + %5:sreg_32_xm0 = S_MOV_B32 2 + %6:vgpr_32 = V_LSHLREV_B32_e64 killed %5, %0, implicit $exec + %7:sreg_32_xm0 = S_MOV_B32 0 + %15:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %14:vreg_64 = REG_SEQUENCE killed %6, %subreg.sub0, killed %15, %subreg.sub1 + %21:sgpr_32 = COPY %4.sub0 + %22:vgpr_32 = COPY %14.sub0 + %23:sgpr_32 = COPY %4.sub1 + %24:vgpr_32 = COPY %14.sub1 + %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21, %22, implicit $exec + %25:vgpr_32 = COPY %23 + %18:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 %25, %24, killed %19, implicit $exec + %16:vreg_64 = REG_SEQUENCE %17, %subreg.sub0, %18, %subreg.sub1 + %11:vreg_64 = COPY %16 + + %10:vgpr_32 = GLOBAL_LOAD_DWORD %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORD %11, %10, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %40:vreg_64 = GLOBAL_LOAD_DWORDX2 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX2 %11, %40, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %41:vreg_96 = GLOBAL_LOAD_DWORDX3 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX3 %11, %41, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %42:vreg_128 = GLOBAL_LOAD_DWORDX4 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX4 %11, %42, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %43:vgpr_32 = GLOBAL_LOAD_SSHORT %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT %11, %43, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %44:vgpr_32 = GLOBAL_LOAD_USHORT %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT %11, %44, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %45:vgpr_32 = GLOBAL_LOAD_UBYTE %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE %11, %45, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %46:vgpr_32 = GLOBAL_LOAD_SBYTE %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE %11, %46, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %47:vgpr_32 = GLOBAL_LOAD_SBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %47, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %48:vgpr_32 = GLOBAL_LOAD_UBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %48, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %49:vgpr_32 = GLOBAL_LOAD_SBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %49, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %50:vgpr_32 = GLOBAL_LOAD_UBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %50, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %51:vgpr_32 = GLOBAL_LOAD_SHORT_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT_D16_HI %11, %51, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %52:vgpr_32 = GLOBAL_LOAD_SHORT_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT_D16_HI %11, %52, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + + %53:vgpr_32 = GLOBAL_ATOMIC_XOR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %53, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_XOR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %54:vgpr_32 = GLOBAL_ATOMIC_SMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %54, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %55:vgpr_32 = GLOBAL_ATOMIC_AND_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %55, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_AND %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %56:vgpr_32 = GLOBAL_ATOMIC_SWAP_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %56, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SWAP %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %57:vgpr_32 = GLOBAL_ATOMIC_SMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %57, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %58:vgpr_32 = GLOBAL_ATOMIC_UMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %58, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_UMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %59:vgpr_32 = GLOBAL_ATOMIC_UMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %59, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_UMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %60:vgpr_32 = GLOBAL_ATOMIC_OR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %60, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_OR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %61:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %61, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_ADD %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %62:vgpr_32 = GLOBAL_ATOMIC_SUB_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %62, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SUB %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %63:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %63, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_CMPSWAP %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %64:vgpr_32 = GLOBAL_ATOMIC_INC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %64, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_INC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %65:vgpr_32 = GLOBAL_ATOMIC_DEC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %65, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_DEC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %66:vreg_64 = GLOBAL_ATOMIC_OR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %66, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_OR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %67:vreg_64 = GLOBAL_ATOMIC_XOR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %67, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_XOR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %68:vreg_64 = GLOBAL_ATOMIC_AND_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %68, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_AND_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %69:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %69, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_ADD_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %70:vreg_64 = GLOBAL_ATOMIC_SUB_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %70, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SUB_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %71:vreg_64 = GLOBAL_ATOMIC_DEC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %71, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_DEC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %72:vreg_64 = GLOBAL_ATOMIC_INC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %72, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_INC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %73:vreg_64 = GLOBAL_ATOMIC_SMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %73, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %74:vreg_64 = GLOBAL_ATOMIC_SWAP_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %74, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SWAP_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %75:vreg_64 = GLOBAL_ATOMIC_SMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %75, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %76:vreg_64 = GLOBAL_ATOMIC_UMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %76, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_UMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %77:vreg_64 = GLOBAL_ATOMIC_UMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %77, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_UMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %79:sreg_128 = REG_SEQUENCE %4, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3 + %80:vreg_128 = COPY %79 + + %78:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %78, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_CMPSWAP_X2 %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + S_ENDPGM +... diff --git a/test/CodeGen/AMDGPU/global-saddr.ll b/test/CodeGen/AMDGPU/global-saddr.ll new file mode 100644 index 0000000000000000000000000000000000000000..b21fd9852267908909c299eb983f6502a58ed6be --- /dev/null +++ b/test/CodeGen/AMDGPU/global-saddr.ll @@ -0,0 +1,102 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GFX9 %s + +; Test for a conv2d like sequence of loads. + +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}} +; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} + +define hidden amdgpu_kernel void @simpleSaddrs(i64 addrspace(1)* %dst_image, i64 addrspace(1)* %src_image ) { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep = getelementptr i64, i64 addrspace(1)* %src_image, i64 %idx + %ptr0 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1 + %load0 = load i64, i64 addrspace(1)* %ptr0 + %ptr1 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 2 + %load1 = load i64, i64 addrspace(1)* %ptr1 + %ptr2 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 3 + %load2 = load i64, i64 addrspace(1)* %ptr2 + %ptr3 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 4 + %load3 = load i64, i64 addrspace(1)* %ptr3 + %ptr4 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -4 + %load4 = load i64, i64 addrspace(1)* %ptr4 + %ptr5 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -3 + %load5 = load i64, i64 addrspace(1)* %ptr5 + %ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2 + %load6 = load i64, i64 addrspace(1)* %ptr6 + %ptr7 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1 + %load7 = load i64, i64 addrspace(1)* %ptr7 + %ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 0 + %load8 = load i64, i64 addrspace(1)* %ptr8 + %add0 = add i64 %load1, %load0 + %add1 = add i64 %load3, %load2 + %add2 = add i64 %load5, %load4 + %add3 = add i64 %load7, %load6 + %add4 = add i64 %add0, %load8 + %add5 = add i64 %add2, %add1 + %add6 = add i64 %add4, %add3 + %add7 = add i64 %add6, %add5 + %gep9 = getelementptr i64, i64 addrspace(1)* %dst_image, i64 %idx + %ptr9 = getelementptr inbounds i64, i64 addrspace(1)* %gep9, i64 1 + store volatile i64 %add7, i64 addrspace(1)* %ptr9 + +; Test various offset boundaries. +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}} + %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511 + %load11 = load i64, i64 addrspace(1)* %gep11 + %gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023 + %load12 = load i64, i64 addrspace(1)* %gep12 + %gep13 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255 + %load13 = load i64, i64 addrspace(1)* %gep13 + %add11 = add i64 %load11, %load12 + %add12 = add i64 %add11, %load13 + store volatile i64 %add12, i64 addrspace(1)* undef + +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}} + %gep21 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1024 + %load21 = load i64, i64 addrspace(1)* %gep21 + %gep22 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2048 + %load22 = load i64, i64 addrspace(1)* %gep22 + %gep23 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -512 + %load23 = load i64, i64 addrspace(1)* %gep23 + %add21 = add i64 %load22, %load21 + %add22 = add i64 %add21, %load23 + store volatile i64 %add22, i64 addrspace(1)* undef + +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}} + %gep31 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 257 + %load31 = load i64, i64 addrspace(1)* %gep31 + %gep32 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 256 + %load32 = load i64, i64 addrspace(1)* %gep32 + %gep33 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255 + %load33 = load i64, i64 addrspace(1)* %gep33 + %add34 = add i64 %load32, %load31 + %add35 = add i64 %add34, %load33 + store volatile i64 %add35, i64 addrspace(1)* undef + ret void +} + +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}} +; GFX9-NEXT: s_waitcnt +; NGFX9-NOT: global_load_dword + +define amdgpu_cs void @_amdgpu_cs_main(i64 inreg %arg) { +bb: + %tmp1 = inttoptr i64 %arg to <4 x i64> addrspace(1)* + %tmp2 = load <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16 + store volatile <4 x i64> %tmp2, <4 x i64> addrspace(1)* undef + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind readnone speculatable } diff --git a/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/test/CodeGen/AMDGPU/hsa-fp-mode.ll index b1901cf894b08ef4f02b02ea800b83c50cd9bae1..a454fa02579be15dbae02c9c46c944621d4e18eb 100644 --- a/test/CodeGen/AMDGPU/hsa-fp-mode.ll +++ b/test/CodeGen/AMDGPU/hsa-fp-mode.ll @@ -70,10 +70,10 @@ define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, doub ret void } -attributes #0 = { nounwind "target-cpu"="kaveri" } -attributes #1 = { nounwind "target-cpu"="fiji" } -attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-fp16-denormals" } -attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-fp16-denormals" } -attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" } -attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" } -attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-dx10-clamp" } +attributes #0 = { nounwind "target-cpu"="kaveri" "target-features"="-code-object-v3" } +attributes #1 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3" } +attributes #2 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,+fp64-fp16-denormals" } +attributes #3 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,-fp64-fp16-denormals" } +attributes #4 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,+fp64-fp16-denormals" } +attributes #5 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,-fp64-fp16-denormals" } +attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3,-dx10-clamp" } diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll index d117cf59ee15c238a6ee58c02e09b41ec94bd46d..76a17215b7fa1224728bb0357c1323176a826090 100644 --- a/test/CodeGen/AMDGPU/hsa-func.ll +++ b/test/CodeGen/AMDGPU/hsa-func.ll @@ -1,9 +1,9 @@ -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA-CI %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA-VI %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | FileCheck --check-prefix=HSA %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | FileCheck --check-prefix=HSA-CI %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=carrizo | FileCheck --check-prefix=HSA %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=carrizo | FileCheck --check-prefix=HSA-VI %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF ; The SHT_NOTE section contains the output from the .hsa_code_object_* ; directives. diff --git a/test/CodeGen/AMDGPU/hsa-metadata-deduce-ro-arg-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-deduce-ro-arg-v3.ll new file mode 100644 index 0000000000000000000000000000000000000000..15b6286bba77d6c85eb40b6f2c6f0019ecc82e5d --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-metadata-deduce-ro-arg-v3.ll @@ -0,0 +1,33 @@ +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s + +; CHECK: .symbol: test_ro_arg.kd +; CHECK: .name: test_ro_arg +; CHECK: .args: +; CHECK-NEXT: - .type_name: 'float*' +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: in +; CHECK-NEXT: .access: read_only +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .is_const: true +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .is_restrict: true +; CHECK-NEXT: .value_type: f32 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .type_name: 'float*' +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: out +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: f32 +; CHECK-NEXT: .address_space: global + +define amdgpu_kernel void @test_ro_arg(float addrspace(1)* noalias readonly %in, float addrspace(1)* %out) + !kernel_arg_addr_space !0 !kernel_arg_access_qual !1 !kernel_arg_type !2 + !kernel_arg_base_type !2 !kernel_arg_type_qual !3 { + ret void +} + +!0 = !{i32 1, i32 1} +!1 = !{!"none", !"none"} +!2 = !{!"float*", !"float*"} +!3 = !{!"const restrict", !""} diff --git a/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel-v3.ll new file mode 100644 index 0000000000000000000000000000000000000000..d72f3f6d745f503a8df8477b859110043ff35ad7 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel-v3.ll @@ -0,0 +1,101 @@ +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +; CHECK: --- +; CHECK: amdhsa.kernels: +; CHECK: .symbol: test_non_enqueue_kernel_caller.kd +; CHECK: .name: test_non_enqueue_kernel_caller +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NOT: .value_kind: hidden_default_queue +; CHECK-NOT: .value_kind: hidden_completion_action +define amdgpu_kernel void @test_non_enqueue_kernel_caller(i8 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_enqueue_kernel_caller.kd +; CHECK: .name: test_enqueue_kernel_caller +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_none +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_default_queue +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_completion_action +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 +; CHECK-NOT: amdhsa.printf: + +attributes #0 = { "calls-enqueue-kernel" } + +!1 = !{i32 0} +!2 = !{!"none"} +!3 = !{!"char"} +!4 = !{!""} + +!opencl.ocl.version = !{!90} +!90 = !{i32 2, i32 0} + + +; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll b/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll index 77624aaafad122007be6918140aee1ba3a83a047..f6e3d94b4dce16faf517549e241d3fdadcbb7d76 100644 --- a/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll +++ b/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s ; CHECK: --- ; CHECK: Version: [ 1, 0 ] diff --git a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll new file mode 100644 index 0000000000000000000000000000000000000000..85ee9b4858b4defa898193ecc5835518ebb1cab1 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll @@ -0,0 +1,1453 @@ +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +%struct.A = type { i8, float } +%opencl.image1d_t = type opaque +%opencl.image2d_t = type opaque +%opencl.image3d_t = type opaque +%opencl.queue_t = type opaque +%opencl.pipe_t = type opaque +%struct.B = type { i32 addrspace(1)*} +%opencl.clk_event_t = type opaque + +@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* + +; CHECK: --- +; CHECK: amdhsa.kernels: +; CHECK: .symbol: test_char.kd +; CHECK: .name: test_char +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK-NOT: .value_kind: hidden_default_queue +; CHECK-NOT: .value_kind: hidden_completion_action +define amdgpu_kernel void @test_char(i8 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_ushort2.kd +; CHECK: .name: test_ushort2 +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: ushort2 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: u16 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_ushort2(<2 x i16> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !10 + !kernel_arg_base_type !10 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_int3.kd +; CHECK: .name: test_int3 +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: int3 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 16 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_int3(<3 x i32> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !11 + !kernel_arg_base_type !11 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_ulong4.kd +; CHECK: .name: test_ulong4 +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: ulong4 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 32 +; CHECK-NEXT: .value_type: u64 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_ulong4(<4 x i64> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !12 + !kernel_arg_base_type !12 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_half8.kd +; CHECK: .name: test_half8 +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: half8 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 16 +; CHECK-NEXT: .value_type: f16 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_half8(<8 x half> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !13 + !kernel_arg_base_type !13 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_float16.kd +; CHECK: .name: test_float16 +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: float16 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 64 +; CHECK-NEXT: .value_type: f32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 88 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_float16(<16 x float> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !14 + !kernel_arg_base_type !14 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_double16.kd +; CHECK: .name: test_double16 +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: double16 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 128 +; CHECK-NEXT: .value_type: f64 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 128 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 136 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 144 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 152 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_double16(<16 x double> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !15 + !kernel_arg_base_type !15 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_pointer.kd +; CHECK: .name: test_pointer +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: 'int addrspace(5)*' +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_pointer(i32 addrspace(1)* %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !16 + !kernel_arg_base_type !16 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_image.kd +; CHECK: .name: test_image +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: image2d_t +; CHECK-NEXT: .value_kind: image +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_image(%opencl.image2d_t addrspace(1)* %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !17 + !kernel_arg_base_type !17 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_sampler.kd +; CHECK: .name: test_sampler +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: sampler_t +; CHECK-NEXT: .value_kind: sampler +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_sampler(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !18 + !kernel_arg_base_type !18 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_queue.kd +; CHECK: .name: test_queue +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: queue_t +; CHECK-NEXT: .value_kind: queue +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_queue(%opencl.queue_t addrspace(1)* %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !19 + !kernel_arg_base_type !19 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_struct.kd +; CHECK: .name: test_struct +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: struct A +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: .address_space: private +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_struct(%struct.A addrspace(5)* byval %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_i128.kd +; CHECK: .name: test_i128 +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: i128 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 16 +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_i128(i128 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !21 + !kernel_arg_base_type !21 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_multi_arg.kd +; CHECK: .name: test_multi_arg +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: int +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .type_name: short2 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 4 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i16 +; CHECK-NEXT: .name: b +; CHECK-NEXT: - .type_name: char3 +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .name: c +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c) + !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !24 + !kernel_arg_base_type !24 !kernel_arg_type_qual !25 { + ret void +} + +; CHECK: .symbol: test_addr_space.kd +; CHECK: .name: test_addr_space +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: 'int addrspace(5)*' +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: g +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .type_name: 'int addrspace(5)*' +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: c +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .address_space: constant +; CHECK-NEXT: - .type_name: 'int addrspace(5)*' +; CHECK-NEXT: .value_kind: dynamic_shared_pointer +; CHECK-NEXT: .name: l +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .pointee_align: 4 +; CHECK-NEXT: .address_space: local +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g, + i32 addrspace(4)* %c, + i32 addrspace(3)* %l) + !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51 + !kernel_arg_base_type !51 !kernel_arg_type_qual !25 { + ret void +} + +; CHECK: .symbol: test_type_qual.kd +; CHECK: .name: test_type_qual +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: 'int addrspace(5)*' +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .is_volatile: true +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .type_name: 'int addrspace(5)*' +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: b +; CHECK-NEXT: .is_const: true +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .is_restrict: true +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .type_name: 'int addrspace(5)*' +; CHECK-NEXT: .value_kind: pipe +; CHECK-NEXT: .name: c +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .is_pipe: true +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_type_qual(i32 addrspace(1)* %a, + i32 addrspace(1)* %b, + %opencl.pipe_t addrspace(1)* %c) + !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !51 + !kernel_arg_base_type !51 !kernel_arg_type_qual !70 { + ret void +} + +; CHECK: .symbol: test_access_qual.kd +; CHECK: .name: test_access_qual +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: image1d_t +; CHECK-NEXT: .value_kind: image +; CHECK-NEXT: .name: ro +; CHECK-NEXT: .access: read_only +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .type_name: image2d_t +; CHECK-NEXT: .value_kind: image +; CHECK-NEXT: .name: wo +; CHECK-NEXT: .access: write_only +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .type_name: image3d_t +; CHECK-NEXT: .value_kind: image +; CHECK-NEXT: .name: rw +; CHECK-NEXT: .access: read_write +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_access_qual(%opencl.image1d_t addrspace(1)* %ro, + %opencl.image2d_t addrspace(1)* %wo, + %opencl.image3d_t addrspace(1)* %rw) + !kernel_arg_addr_space !60 !kernel_arg_access_qual !61 !kernel_arg_type !62 + !kernel_arg_base_type !62 !kernel_arg_type_qual !25 { + ret void +} + +; CHECK: .symbol: test_vec_type_hint_half.kd +; CHECK: .name: test_vec_type_hint_half +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: int +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK: .vec_type_hint: half +define amdgpu_kernel void @test_vec_type_hint_half(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !26 { + ret void +} + +; CHECK: .symbol: test_vec_type_hint_float.kd +; CHECK: .name: test_vec_type_hint_float +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: int +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK: .vec_type_hint: float +define amdgpu_kernel void @test_vec_type_hint_float(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !27 { + ret void +} + +; CHECK: .symbol: test_vec_type_hint_double.kd +; CHECK: .name: test_vec_type_hint_double +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: int +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK: .vec_type_hint: double +define amdgpu_kernel void @test_vec_type_hint_double(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !28 { + ret void +} + +; CHECK: .symbol: test_vec_type_hint_char.kd +; CHECK: .name: test_vec_type_hint_char +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: int +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK: .vec_type_hint: char +define amdgpu_kernel void @test_vec_type_hint_char(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !29 { + ret void +} + +; CHECK: .symbol: test_vec_type_hint_short.kd +; CHECK: .name: test_vec_type_hint_short +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: int +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK: .vec_type_hint: short +define amdgpu_kernel void @test_vec_type_hint_short(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !30 { + ret void +} + +; CHECK: .symbol: test_vec_type_hint_long.kd +; CHECK: .name: test_vec_type_hint_long +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: int +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK: .vec_type_hint: long +define amdgpu_kernel void @test_vec_type_hint_long(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !31 { + ret void +} + +; CHECK: .symbol: test_vec_type_hint_unknown.kd +; CHECK: .name: test_vec_type_hint_unknown +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: int +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK: .vec_type_hint: unknown +define amdgpu_kernel void @test_vec_type_hint_unknown(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !32 { + ret void +} + +; CHECK: .reqd_workgroup_size: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 4 +; CHECK: .symbol: test_reqd_wgs_vec_type_hint.kd +; CHECK: .name: test_reqd_wgs_vec_type_hint +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: int +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK: .vec_type_hint: int +define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5 + !reqd_work_group_size !6 { + ret void +} + +; CHECK: .symbol: test_wgs_hint_vec_type_hint.kd +; CHECK: .workgroup_size_hint: +; CHECK-NEXT: - 8 +; CHECK-NEXT: - 16 +; CHECK-NEXT: - 32 +; CHECK: .name: test_wgs_hint_vec_type_hint +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: int +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK: .vec_type_hint: uint4 +define amdgpu_kernel void @test_wgs_hint_vec_type_hint(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !7 + !work_group_size_hint !8 { + ret void +} + +; CHECK: .symbol: test_arg_ptr_to_ptr.kd +; CHECK: .name: test_arg_ptr_to_ptr +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: 'int addrspace(5)* addrspace(5)*' +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_arg_ptr_to_ptr(i32 addrspace(5)* addrspace(1)* %a) + !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !80 + !kernel_arg_base_type !80 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_arg_struct_contains_ptr.kd +; CHECK: .name: test_arg_struct_contains_ptr +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: struct B +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: .address_space: private +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B addrspace(5)* byval %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82 + !kernel_arg_base_type !82 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_arg_vector_of_ptr.kd +; CHECK: .name: test_arg_vector_of_ptr +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: 'global int addrspace(5)* __attribute__((ext_vector_type(2)))' +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 16 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_arg_vector_of_ptr(<2 x i32 addrspace(1)*> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !83 + !kernel_arg_base_type !83 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_arg_unknown_builtin_type.kd +; CHECK: .name: test_arg_unknown_builtin_type +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: clk_event_t +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_arg_unknown_builtin_type( + %opencl.clk_event_t addrspace(1)* %a) + !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !84 + !kernel_arg_base_type !84 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_pointee_align.kd +; CHECK: .name: test_pointee_align +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: 'long addrspace(5)*' +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .type_name: 'char addrspace(5)*' +; CHECK-NEXT: .value_kind: dynamic_shared_pointer +; CHECK-NEXT: .name: b +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .pointee_align: 1 +; CHECK-NEXT: .address_space: local +; CHECK-NEXT: - .type_name: 'char2 addrspace(5)*' +; CHECK-NEXT: .value_kind: dynamic_shared_pointer +; CHECK-NEXT: .name: c +; CHECK-NEXT: .offset: 12 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .pointee_align: 2 +; CHECK-NEXT: .address_space: local +; CHECK-NEXT: - .type_name: 'char3 addrspace(5)*' +; CHECK-NEXT: .value_kind: dynamic_shared_pointer +; CHECK-NEXT: .name: d +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .pointee_align: 4 +; CHECK-NEXT: .address_space: local +; CHECK-NEXT: - .type_name: 'char4 addrspace(5)*' +; CHECK-NEXT: .value_kind: dynamic_shared_pointer +; CHECK-NEXT: .name: e +; CHECK-NEXT: .offset: 20 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .pointee_align: 4 +; CHECK-NEXT: .address_space: local +; CHECK-NEXT: - .type_name: 'char8 addrspace(5)*' +; CHECK-NEXT: .value_kind: dynamic_shared_pointer +; CHECK-NEXT: .name: f +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .pointee_align: 8 +; CHECK-NEXT: .address_space: local +; CHECK-NEXT: - .type_name: 'char16 addrspace(5)*' +; CHECK-NEXT: .value_kind: dynamic_shared_pointer +; CHECK-NEXT: .name: g +; CHECK-NEXT: .offset: 28 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .pointee_align: 16 +; CHECK-NEXT: .address_space: local +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a, + i8 addrspace(3)* %b, + <2 x i8> addrspace(3)* %c, + <3 x i8> addrspace(3)* %d, + <4 x i8> addrspace(3)* %e, + <8 x i8> addrspace(3)* %f, + <16 x i8> addrspace(3)* %g) + !kernel_arg_addr_space !91 !kernel_arg_access_qual !92 !kernel_arg_type !93 + !kernel_arg_base_type !93 !kernel_arg_type_qual !94 { + ret void +} + +; CHECK: .symbol: __test_block_invoke_kernel.kd +; CHECK: .device_enqueue_symbol: __test_block_invoke_kernel_runtime_handle +; CHECK: .name: __test_block_invoke_kernel +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: __block_literal +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 25 +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: .name: arg +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @__test_block_invoke_kernel( + <{ i32, i32, i8*, i8 addrspace(1)*, i8 }> %arg) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110 + !kernel_arg_base_type !110 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: test_enqueue_kernel_caller.kd +; CHECK: .name: test_enqueue_kernel_caller +; CHECK: .language: OpenCL C +; CHECK: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .args: +; CHECK-NEXT: - .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .name: a +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_printf_buffer +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_default_queue +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_completion_action +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: .symbol: unknown_addrspace_kernarg.kd +; CHECK: .name: unknown_addrspace_kernarg +; CHECK: .args: +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i32 +; CHECK-NEXT: .name: ptr +define amdgpu_kernel void @unknown_addrspace_kernarg(i32 addrspace(12345)* %ptr) #0 { + ret void +} + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 +; CHECK: amdhsa.printf: +; CHECK-NEXT: - '1:1:4:%d\n' +; CHECK-NEXT: - '2:1:8:%g\n' + +attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } +attributes #1 = { "calls-enqueue-kernel" } + +!llvm.printf.fmts = !{!100, !101} + +!1 = !{i32 0} +!2 = !{!"none"} +!3 = !{!"int"} +!4 = !{!""} +!5 = !{i32 undef, i32 1} +!6 = !{i32 1, i32 2, i32 4} +!7 = !{<4 x i32> undef, i32 0} +!8 = !{i32 8, i32 16, i32 32} +!9 = !{!"char"} +!10 = !{!"ushort2"} +!11 = !{!"int3"} +!12 = !{!"ulong4"} +!13 = !{!"half8"} +!14 = !{!"float16"} +!15 = !{!"double16"} +!16 = !{!"int addrspace(5)*"} +!17 = !{!"image2d_t"} +!18 = !{!"sampler_t"} +!19 = !{!"queue_t"} +!20 = !{!"struct A"} +!21 = !{!"i128"} +!22 = !{i32 0, i32 0, i32 0} +!23 = !{!"none", !"none", !"none"} +!24 = !{!"int", !"short2", !"char3"} +!25 = !{!"", !"", !""} +!26 = !{half undef, i32 1} +!27 = !{float undef, i32 1} +!28 = !{double undef, i32 1} +!29 = !{i8 undef, i32 1} +!30 = !{i16 undef, i32 1} +!31 = !{i64 undef, i32 1} +!32 = !{i32 addrspace(5)*undef, i32 1} +!50 = !{i32 1, i32 2, i32 3} +!51 = !{!"int addrspace(5)*", !"int addrspace(5)*", !"int addrspace(5)*"} +!60 = !{i32 1, i32 1, i32 1} +!61 = !{!"read_only", !"write_only", !"read_write"} +!62 = !{!"image1d_t", !"image2d_t", !"image3d_t"} +!70 = !{!"volatile", !"const restrict", !"pipe"} +!80 = !{!"int addrspace(5)* addrspace(5)*"} +!81 = !{i32 1} +!82 = !{!"struct B"} +!83 = !{!"global int addrspace(5)* __attribute__((ext_vector_type(2)))"} +!84 = !{!"clk_event_t"} +!opencl.ocl.version = !{!90} +!90 = !{i32 2, i32 0} +!91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3} +!92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"} +!93 = !{!"long addrspace(5)*", !"char addrspace(5)*", !"char2 addrspace(5)*", !"char3 addrspace(5)*", !"char4 addrspace(5)*", !"char8 addrspace(5)*", !"char16 addrspace(5)*"} +!94 = !{!"", !"", !"", !"", !"", !"", !""} +!100 = !{!"1:1:4:%d\5Cn"} +!101 = !{!"2:1:8:%g\5Cn"} +!110 = !{!"__block_literal"} + +; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll index 485e02da7d9008f7774118bce0ad6d5244301246..4dce2bf832edabffb4d570e65c67ffa9475d96d4 100644 --- a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -1,9 +1,9 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s %struct.A = type { i8, float } %opencl.image1d_t = type opaque diff --git a/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll new file mode 100644 index 0000000000000000000000000000000000000000..fa8a182cca94bfd0ce9ffa27a7c231833597512e --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll @@ -0,0 +1,72 @@ +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s + +; CHECK: --- +; CHECK: amdhsa.kernels: +; CHECK: .symbol: test.kd +; CHECK: .name: test +; CHECK: .args: +; CHECK-NEXT: - .value_kind: global_buffer +; CHECK-NEXT: .name: r +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: f16 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: global_buffer +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: f16 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: global_buffer +; CHECK-NEXT: .name: b +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: f16 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_global_offset_x +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_y +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_global_offset_z +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .value_kind: hidden_none +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_none +; CHECK-NEXT: .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +; CHECK-NEXT: - .value_kind: hidden_none +; CHECK-NEXT: .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: .address_space: global +define amdgpu_kernel void @test( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fadd half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 + +!opencl.ocl.version = !{!0} +!0 = !{i32 2, i32 0} diff --git a/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll index ed2e79684c6de511cd37922b2e8bab05f1ada46d..6dbc1e2523d249caaf40954671c97aa6aa6def4f 100644 --- a/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll +++ b/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s ; CHECK: --- ; CHECK: Version: [ 1, 0 ] diff --git a/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll new file mode 100644 index 0000000000000000000000000000000000000000..7a9c810541ef8f90d48dd51069dac5cbfd35d3ee --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll @@ -0,0 +1,95 @@ +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s + +%opencl.image1d_t = type opaque +%opencl.image1d_array_t = type opaque +%opencl.image1d_buffer_t = type opaque +%opencl.image2d_t = type opaque +%opencl.image2d_array_t = type opaque +%opencl.image2d_array_depth_t = type opaque +%opencl.image2d_array_msaa_t = type opaque +%opencl.image2d_array_msaa_depth_t = type opaque +%opencl.image2d_depth_t = type opaque +%opencl.image2d_msaa_t = type opaque +%opencl.image2d_msaa_depth_t = type opaque +%opencl.image3d_t = type opaque + +; CHECK: --- +; CHECK: amdhsa.kernels: +; CHECK: .symbol: test.kd +; CHECK: .name: test +; CHECK: .args: +; CHECK: - .type_name: image1d_t +; CHECK: .value_kind: image +; CHECK: .name: a +; CHECK: .size: 8 +; CHECK: - .type_name: image1d_array_t +; CHECK: .value_kind: image +; CHECK: .name: b +; CHECK: .size: 8 +; CHECK: - .type_name: image1d_buffer_t +; CHECK: .value_kind: image +; CHECK: .name: c +; CHECK: .size: 8 +; CHECK: - .type_name: image2d_t +; CHECK: .value_kind: image +; CHECK: .name: d +; CHECK: .size: 8 +; CHECK: - .type_name: image2d_array_t +; CHECK: .value_kind: image +; CHECK: .name: e +; CHECK: .size: 8 +; CHECK: - .type_name: image2d_array_depth_t +; CHECK: .value_kind: image +; CHECK: .name: f +; CHECK: .size: 8 +; CHECK: - .type_name: image2d_array_msaa_t +; CHECK: .value_kind: image +; CHECK: .name: g +; CHECK: .size: 8 +; CHECK: - .type_name: image2d_array_msaa_depth_t +; CHECK: .value_kind: image +; CHECK: .name: h +; CHECK: .size: 8 +; CHECK: - .type_name: image2d_depth_t +; CHECK: .value_kind: image +; CHECK: .name: i +; CHECK: .size: 8 +; CHECK: - .type_name: image2d_msaa_t +; CHECK: .value_kind: image +; CHECK: .name: j +; CHECK: .size: 8 +; CHECK: - .type_name: image2d_msaa_depth_t +; CHECK: .value_kind: image +; CHECK: .name: k +; CHECK: .size: 8 +; CHECK: - .type_name: image3d_t +; CHECK: .value_kind: image +; CHECK: .name: l +; CHECK: .size: 8 +define amdgpu_kernel void @test(%opencl.image1d_t addrspace(1)* %a, + %opencl.image1d_array_t addrspace(1)* %b, + %opencl.image1d_buffer_t addrspace(1)* %c, + %opencl.image2d_t addrspace(1)* %d, + %opencl.image2d_array_t addrspace(1)* %e, + %opencl.image2d_array_depth_t addrspace(1)* %f, + %opencl.image2d_array_msaa_t addrspace(1)* %g, + %opencl.image2d_array_msaa_depth_t addrspace(1)* %h, + %opencl.image2d_depth_t addrspace(1)* %i, + %opencl.image2d_msaa_t addrspace(1)* %j, + %opencl.image2d_msaa_depth_t addrspace(1)* %k, + %opencl.image3d_t addrspace(1)* %l) + !kernel_arg_type !1 !kernel_arg_base_type !1 { + ret void +} + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 + +!1 = !{!"image1d_t", !"image1d_array_t", !"image1d_buffer_t", + !"image2d_t", !"image2d_array_t", !"image2d_array_depth_t", + !"image2d_array_msaa_t", !"image2d_array_msaa_depth_t", + !"image2d_depth_t", !"image2d_msaa_t", !"image2d_msaa_depth_t", + !"image3d_t"} diff --git a/test/CodeGen/AMDGPU/hsa-metadata-images.ll b/test/CodeGen/AMDGPU/hsa-metadata-images.ll index 00dee3b6c69420ffb757fef4422da539523eb48c..fd0159984297409d3c76239fe82dd7964a0878b4 100644 --- a/test/CodeGen/AMDGPU/hsa-metadata-images.ll +++ b/test/CodeGen/AMDGPU/hsa-metadata-images.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s %opencl.image1d_t = type opaque %opencl.image1d_array_t = type opaque diff --git a/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll new file mode 100644 index 0000000000000000000000000000000000000000..fefb22e939e29e7ab0f7c1d621b24bd2480e302c --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll @@ -0,0 +1,11 @@ +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s + +; Make sure llc does not crash for invalid opencl version metadata. + +; CHECK: --- +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 +; CHECK: ... + +!opencl.ocl.version = !{} diff --git a/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-2-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-2-v3.ll new file mode 100644 index 0000000000000000000000000000000000000000..0d9f920ade1a418f692305b1c656bc40595fe7d8 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-2-v3.ll @@ -0,0 +1,12 @@ +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s + +; Make sure llc does not crash for invalid opencl version metadata. + +; CHECK: --- +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 +; CHECK: ... + +!opencl.ocl.version = !{!0} +!0 = !{} diff --git a/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll new file mode 100644 index 0000000000000000000000000000000000000000..5382e66b6beaf46010e36f9aae3a4d76bea00613 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll @@ -0,0 +1,12 @@ +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s + +; Make sure llc does not crash for invalid opencl version metadata. + +; CHECK: --- +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 +; CHECK: ... + +!opencl.ocl.version = !{!0} +!0 = !{i32 1} diff --git a/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll new file mode 100644 index 0000000000000000000000000000000000000000..1c23b2e72058af3190d4cbf03dc52f129b120a07 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -0,0 +1,146 @@ +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s +; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s + +@var = addrspace(1) global float 0.0 + +; CHECK: --- +; CHECK: amdhsa.kernels: + +; CHECK: - .max_flat_workgroup_size: 256 +; CHECK: .kernarg_segment_size: 24 +; CHECK: .private_segment_fixed_size: 0 +; CHECK: .wavefront_size: 64 +; CHECK: .symbol: test.kd +; CHECK: .name: test +; CHECK: .sgpr_count: 8 +; CHECK: .kernarg_segment_align: 8 +; CHECK: .vgpr_count: 6 +; CHECK: .group_segment_fixed_size: 0 +define amdgpu_kernel void @test( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fadd half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + +; CHECK: .symbol: num_spilled_sgprs.kd +; CHECK: .name: num_spilled_sgprs +; GFX700: .sgpr_spill_count: 40 +; GFX803: .sgpr_spill_count: 24 +; GFX900: .sgpr_spill_count: 24 +define amdgpu_kernel void @num_spilled_sgprs( + i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], + i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32], + i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, [8 x i32], + i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, [8 x i32], + i32 addrspace(1)* %out8, i32 addrspace(1)* %out9, [8 x i32], + i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, [8 x i32], + i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, [8 x i32], + i32 addrspace(1)* %oute, i32 addrspace(1)* %outf, [8 x i32], + i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32], + i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32], + i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32], + i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 { +entry: + store i32 %in0, i32 addrspace(1)* %out0 + store i32 %in1, i32 addrspace(1)* %out1 + store i32 %in2, i32 addrspace(1)* %out2 + store i32 %in3, i32 addrspace(1)* %out3 + store i32 %in4, i32 addrspace(1)* %out4 + store i32 %in5, i32 addrspace(1)* %out5 + store i32 %in6, i32 addrspace(1)* %out6 + store i32 %in7, i32 addrspace(1)* %out7 + store i32 %in8, i32 addrspace(1)* %out8 + store i32 %in9, i32 addrspace(1)* %out9 + store i32 %ina, i32 addrspace(1)* %outa + store i32 %inb, i32 addrspace(1)* %outb + store i32 %inc, i32 addrspace(1)* %outc + store i32 %ind, i32 addrspace(1)* %outd + store i32 %ine, i32 addrspace(1)* %oute + store i32 %inf, i32 addrspace(1)* %outf + ret void +} + +; CHECK: .symbol: num_spilled_vgprs.kd +; CHECK: .name: num_spilled_vgprs +; CHECK: .vgpr_spill_count: 14 +define amdgpu_kernel void @num_spilled_vgprs() #1 { + %val0 = load volatile float, float addrspace(1)* @var + %val1 = load volatile float, float addrspace(1)* @var + %val2 = load volatile float, float addrspace(1)* @var + %val3 = load volatile float, float addrspace(1)* @var + %val4 = load volatile float, float addrspace(1)* @var + %val5 = load volatile float, float addrspace(1)* @var + %val6 = load volatile float, float addrspace(1)* @var + %val7 = load volatile float, float addrspace(1)* @var + %val8 = load volatile float, float addrspace(1)* @var + %val9 = load volatile float, float addrspace(1)* @var + %val10 = load volatile float, float addrspace(1)* @var + %val11 = load volatile float, float addrspace(1)* @var + %val12 = load volatile float, float addrspace(1)* @var + %val13 = load volatile float, float addrspace(1)* @var + %val14 = load volatile float, float addrspace(1)* @var + %val15 = load volatile float, float addrspace(1)* @var + %val16 = load volatile float, float addrspace(1)* @var + %val17 = load volatile float, float addrspace(1)* @var + %val18 = load volatile float, float addrspace(1)* @var + %val19 = load volatile float, float addrspace(1)* @var + %val20 = load volatile float, float addrspace(1)* @var + %val21 = load volatile float, float addrspace(1)* @var + %val22 = load volatile float, float addrspace(1)* @var + %val23 = load volatile float, float addrspace(1)* @var + %val24 = load volatile float, float addrspace(1)* @var + %val25 = load volatile float, float addrspace(1)* @var + %val26 = load volatile float, float addrspace(1)* @var + %val27 = load volatile float, float addrspace(1)* @var + %val28 = load volatile float, float addrspace(1)* @var + %val29 = load volatile float, float addrspace(1)* @var + %val30 = load volatile float, float addrspace(1)* @var + + store volatile float %val0, float addrspace(1)* @var + store volatile float %val1, float addrspace(1)* @var + store volatile float %val2, float addrspace(1)* @var + store volatile float %val3, float addrspace(1)* @var + store volatile float %val4, float addrspace(1)* @var + store volatile float %val5, float addrspace(1)* @var + store volatile float %val6, float addrspace(1)* @var + store volatile float %val7, float addrspace(1)* @var + store volatile float %val8, float addrspace(1)* @var + store volatile float %val9, float addrspace(1)* @var + store volatile float %val10, float addrspace(1)* @var + store volatile float %val11, float addrspace(1)* @var + store volatile float %val12, float addrspace(1)* @var + store volatile float %val13, float addrspace(1)* @var + store volatile float %val14, float addrspace(1)* @var + store volatile float %val15, float addrspace(1)* @var + store volatile float %val16, float addrspace(1)* @var + store volatile float %val17, float addrspace(1)* @var + store volatile float %val18, float addrspace(1)* @var + store volatile float %val19, float addrspace(1)* @var + store volatile float %val20, float addrspace(1)* @var + store volatile float %val21, float addrspace(1)* @var + store volatile float %val22, float addrspace(1)* @var + store volatile float %val23, float addrspace(1)* @var + store volatile float %val24, float addrspace(1)* @var + store volatile float %val25, float addrspace(1)* @var + store volatile float %val26, float addrspace(1)* @var + store volatile float %val27, float addrspace(1)* @var + store volatile float %val28, float addrspace(1)* @var + store volatile float %val29, float addrspace(1)* @var + store volatile float %val30, float addrspace(1)* @var + + ret void +} + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 + +attributes #0 = { "amdgpu-num-sgpr"="14" } +attributes #1 = { "amdgpu-num-vgpr"="20" } diff --git a/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 3dc3f320db851bc772342f3af3631242ca58daba..b5b6aa450bfbc6214dfe1538e1399b0a10a76fd7 100644 --- a/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s @var = addrspace(1) global float 0.0 diff --git a/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll b/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll index fab086e6cb11bb84c16a546cfcf6064ea587e929..7eacdc1cdaba0068e473c1dc2e5da7f9b07aff17 100644 --- a/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll +++ b/test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s target datalayout = "A5" declare void @llvm.dbg.declare(metadata, metadata, metadata) @@ -32,7 +32,7 @@ entry: ret void, !dbg !25 } -attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx800" "target-features"="+16-bit-insts,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops,+amdgpu-debugger-reserve-regs,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx800" "target-features"="+16-bit-insts,-code-object-v3,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops,+amdgpu-debugger-reserve-regs,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.dbg.cu = !{!0} !opencl.ocl.version = !{!3} diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll index e937aaca66f21db299434bce43d2981ab1c1c695..4cc9e3c112a28f90d66a0923ce18d9e6d0d4ae80 100644 --- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll +++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll @@ -1,29 +1,29 @@ -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx600 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI600 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx601 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI601 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx700 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx701 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx702 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI702 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx703 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx704 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=bonaire | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=mullins | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=hawaii | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kabini | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris10 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris11 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx801 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx802 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx803 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx902 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX902 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx904 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx906 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx909 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx600 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI600 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx601 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-SI601 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx700 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx701 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx702 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI702 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx703 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx704 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=bonaire -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI704 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=mullins -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=hawaii -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kabini -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI703 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global,-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global,-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris10 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris11 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx801 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx802 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx803 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx902 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX902 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx904 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx906 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx909 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s ; HSA: .hsa_code_object_version 2,1 ; HSA-SI600: .hsa_code_object_isa 6,0,0,"AMD","AMDGPU" diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index 0b19fbe7d70ce3faa648991db3c6ca192db6a76e..e23b2d922a37ebb2b64ebd2179b4aa0b397b81cd 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -1,9 +1,9 @@ -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | FileCheck --check-prefix=HSA %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-CI %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3 | FileCheck --check-prefix=HSA %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-VI %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj -mattr=-code-object-v3 | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF ; The SHT_NOTE section contains the output from the .hsa_code_object_* ; directives. diff --git a/test/CodeGen/AMDGPU/idot8.ll b/test/CodeGen/AMDGPU/idot8.ll index e0cd2ad506be985e24f3a0ca36129cb49b2f7e26..edb88f1912efc322ff9b6ae66c8eb6c2e47990f3 100644 --- a/test/CodeGen/AMDGPU/idot8.ll +++ b/test/CodeGen/AMDGPU/idot8.ll @@ -3656,67 +3656,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s0, 0 -; GFX7-NEXT: s_mov_b32 s12, s0 -; GFX7-NEXT: s_mov_b32 s14, s0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s38, s[4:5], 0x0 -; GFX7-NEXT: s_mov_b32 s16, s0 -; GFX7-NEXT: s_mov_b32 s18, s0 +; GFX7-NEXT: s_load_dword s9, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s13, s1, 4 -; GFX7-NEXT: s_lshl_b32 s15, s1, 12 -; GFX7-NEXT: s_lshl_b32 s17, s1, 16 -; GFX7-NEXT: s_lshl_b32 s19, s1, 20 -; GFX7-NEXT: s_ashr_i64 s[10:11], s[12:13], 60 +; GFX7-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 +; GFX7-NEXT: s_lshl_b32 s11, s1, 4 +; GFX7-NEXT: s_ashr_i64 s[14:15], s[10:11], 60 +; GFX7-NEXT: s_lshl_b32 s11, s1, 12 +; GFX7-NEXT: s_ashr_i64 s[16:17], s[10:11], 60 +; GFX7-NEXT: s_lshl_b32 s11, s1, 16 +; GFX7-NEXT: s_ashr_i64 s[18:19], s[10:11], 60 +; GFX7-NEXT: s_lshl_b32 s11, s1, 20 ; GFX7-NEXT: s_lshl_b32 s13, s1, 8 -; GFX7-NEXT: s_ashr_i64 s[8:9], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s21, s1, 24 +; GFX7-NEXT: s_ashr_i64 s[20:21], s[10:11], 60 +; GFX7-NEXT: s_lshl_b32 s11, s1, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 28 -; GFX7-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 -; GFX7-NEXT: s_mov_b32 s1, s2 -; GFX7-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 4 +; GFX7-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX7-NEXT: s_lshl_b32 s1, s9, 4 ; GFX7-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 8 +; GFX7-NEXT: s_lshl_b32 s1, s9, 8 ; GFX7-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 12 +; GFX7-NEXT: s_lshl_b32 s1, s9, 12 ; GFX7-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 16 +; GFX7-NEXT: s_lshl_b32 s1, s9, 16 ; GFX7-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 20 +; GFX7-NEXT: s_lshl_b32 s1, s9, 20 ; GFX7-NEXT: s_ashr_i64 s[34:35], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 24 +; GFX7-NEXT: s_lshl_b32 s1, s9, 24 ; GFX7-NEXT: s_ashr_i64 s[36:37], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s2, 28 -; GFX7-NEXT: s_mov_b32 s20, s0 -; GFX7-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s38 -; GFX7-NEXT: v_mad_i32_i24 v0, s22, v0, v1 -; GFX7-NEXT: s_ashr_i64 s[20:21], s[20:21], 60 +; GFX7-NEXT: s_lshl_b32 s1, s9, 28 +; GFX7-NEXT: s_ashr_i64 s[24:25], s[8:9], 60 +; GFX7-NEXT: s_ashr_i64 s[8:9], s[0:1], 60 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_i32_i24 v0, s0, v0, v1 +; GFX7-NEXT: s_ashr_i64 s[22:23], s[10:11], 60 ; GFX7-NEXT: v_mov_b32_e32 v1, s36 -; GFX7-NEXT: v_mad_i32_i24 v0, s20, v1, v0 -; GFX7-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 +; GFX7-NEXT: v_mad_i32_i24 v0, s22, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s34 -; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX7-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 +; GFX7-NEXT: v_mad_i32_i24 v0, s20, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s32 -; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX7-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s30 -; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX7-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX7-NEXT: v_mov_b32_e32 v1, s28 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s26 -; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s24 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3724,67 +3717,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s8, 0 -; GFX8-NEXT: s_mov_b32 s10, s8 -; GFX8-NEXT: s_mov_b32 s12, s8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s9, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s36, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, s8 -; GFX8-NEXT: s_mov_b32 s16, s8 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s11, s9, 4 -; GFX8-NEXT: s_lshl_b32 s13, s9, 8 -; GFX8-NEXT: s_lshl_b32 s15, s9, 16 -; GFX8-NEXT: s_lshl_b32 s17, s9, 20 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], 60 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[12:13], 60 -; GFX8-NEXT: s_lshl_b32 s13, s9, 12 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s19, s9, 24 -; GFX8-NEXT: s_lshl_b32 s9, s9, 28 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 -; GFX8-NEXT: s_mov_b32 s9, s2 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 4 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 8 -; GFX8-NEXT: s_ashr_i64 s[26:27], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 12 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 16 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 20 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 24 -; GFX8-NEXT: s_ashr_i64 s[34:35], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s2, 28 -; GFX8-NEXT: s_mov_b32 s18, s8 -; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s36 -; GFX8-NEXT: v_mad_i32_i24 v0, s20, v0, v1 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 -; GFX8-NEXT: v_mov_b32_e32 v1, s34 -; GFX8-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 -; GFX8-NEXT: v_mov_b32_e32 v1, s32 -; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX8-NEXT: v_mov_b32_e32 v1, s30 -; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX8-NEXT: v_mov_b32_e32 v1, s28 -; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s26 -; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s24 -; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s22 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s1, s5, 4 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s5, 16 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s5, 20 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s5, 24 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s5, 28 +; GFX8-NEXT: s_lshl_b32 s9, s5, 8 +; GFX8-NEXT: s_lshl_b32 s11, s5, 12 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 4 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 8 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 12 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 16 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 20 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 24 +; GFX8-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s1, s7, 28 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s32 +; GFX8-NEXT: v_mad_i32_i24 v2, s18, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s30 +; GFX8-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s26 +; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s24 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -3792,67 +3778,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s9, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s36, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s16, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s11, s9, 4 -; GFX9-NEXT: s_lshl_b32 s13, s9, 8 -; GFX9-NEXT: s_lshl_b32 s15, s9, 16 -; GFX9-NEXT: s_lshl_b32 s17, s9, 20 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], 60 -; GFX9-NEXT: s_ashr_i64 s[10:11], s[12:13], 60 -; GFX9-NEXT: s_lshl_b32 s13, s9, 12 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s19, s9, 24 -; GFX9-NEXT: s_lshl_b32 s9, s9, 28 -; GFX9-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 -; GFX9-NEXT: s_mov_b32 s9, s2 -; GFX9-NEXT: s_ashr_i64 s[22:23], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 4 -; GFX9-NEXT: s_ashr_i64 s[24:25], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 8 -; GFX9-NEXT: s_ashr_i64 s[26:27], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 12 -; GFX9-NEXT: s_ashr_i64 s[28:29], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 16 -; GFX9-NEXT: s_ashr_i64 s[30:31], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 20 -; GFX9-NEXT: s_ashr_i64 s[32:33], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 24 -; GFX9-NEXT: s_ashr_i64 s[34:35], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s2, 28 -; GFX9-NEXT: s_mov_b32 s18, s8 -; GFX9-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s36 -; GFX9-NEXT: v_mad_i32_i24 v0, s20, v0, v1 -; GFX9-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 -; GFX9-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 -; GFX9-NEXT: v_mov_b32_e32 v1, s32 -; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX9-NEXT: v_mov_b32_e32 v1, s30 -; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX9-NEXT: v_mov_b32_e32 v1, s28 -; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s26 -; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX9-NEXT: s_load_dword s5, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s7, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s1, s5, 4 +; GFX9-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s5, 16 +; GFX9-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s5, 20 +; GFX9-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s5, 24 +; GFX9-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s5, 28 +; GFX9-NEXT: s_lshl_b32 s9, s5, 8 +; GFX9-NEXT: s_lshl_b32 s11, s5, 12 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 4 +; GFX9-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 8 +; GFX9-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 12 +; GFX9-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 16 +; GFX9-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 20 +; GFX9-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 24 +; GFX9-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 +; GFX9-NEXT: s_lshl_b32 s1, s7, 28 +; GFX9-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mad_i32_i24 v2, s4, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s32 +; GFX9-NEXT: v_mad_i32_i24 v2, s18, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s30 +; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX9-NEXT: v_mov_b32_e32 v3, s26 +; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2 +; GFX9-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX9-NEXT: v_mov_b32_e32 v3, s24 +; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s20 +; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -3860,67 +3839,60 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s8, 0 -; GFX9-DL-NEXT: s_mov_b32 s10, s8 -; GFX9-DL-NEXT: s_mov_b32 s12, s8 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s9, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s36, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, s8 -; GFX9-DL-NEXT: s_mov_b32 s16, s8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshl_b32 s11, s9, 4 -; GFX9-DL-NEXT: s_lshl_b32 s13, s9, 8 -; GFX9-DL-NEXT: s_lshl_b32 s15, s9, 16 -; GFX9-DL-NEXT: s_lshl_b32 s17, s9, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[10:11], 60 -; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[12:13], 60 -; GFX9-DL-NEXT: s_lshl_b32 s13, s9, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s19, s9, 24 -; GFX9-DL-NEXT: s_lshl_b32 s9, s9, 28 -; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 -; GFX9-DL-NEXT: s_mov_b32 s9, s2 -; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 8 -; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 24 -; GFX9-DL-NEXT: s_ashr_i64 s[34:35], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s2, 28 -; GFX9-DL-NEXT: s_mov_b32 s18, s8 -; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s36 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s20, v0, v1 -; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s32 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s30 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s28 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s26 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX9-DL-NEXT: s_load_dword s5, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s7, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 20 +; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 24 +; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 28 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 8 +; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 8 +; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 20 +; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 24 +; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 +; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s32 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s18, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s30 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s26 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v3, v2 +; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s24 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s20 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll b/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll new file mode 100644 index 0000000000000000000000000000000000000000..c8f75518518be3124bc27e67875a5dc1a7110e98 --- /dev/null +++ b/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll @@ -0,0 +1,83 @@ +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s + +; indexing of vectors. + +; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll +; to avoid gfx9 scheduling induced issues. + + +; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: +; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] +; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 + +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]] +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] + +; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; GCN: s_and_saveexec_b64 vcc, vcc + +; MOVREL: s_mov_b32 m0, [[READLANE]] +; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]] + +; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst +; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]] +; IDXMODE: s_set_gpr_idx_off + +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN: s_cbranch_execnz [[LOOP0]] + +; FIXME: Redundant copy +; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]] + +; GCN: s_mov_b64 [[MASK]], exec + +; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: +; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; GCN: s_and_saveexec_b64 vcc, vcc + +; MOVREL: s_mov_b32 m0, [[READLANE]] +; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63 + +; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst +; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63 +; IDXMODE: s_set_gpr_idx_off + +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN: s_cbranch_execnz [[LOOP1]] + +; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: + +; GCN: buffer_store_dword [[INS0]] +define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id.ext = zext i32 %id to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext + %idx0 = load volatile i32, i32 addrspace(1)* %gep + %idx1 = add i32 %idx0, 1 + %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() + %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0 + %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1 + store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0 + %cmp = icmp eq i32 %id, 0 + br i1 %cmp, label %bb1, label %bb2 + +bb1: + store volatile i32 %live.out.val, i32 addrspace(1)* undef + br label %bb2 + +bb2: + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare void @llvm.amdgcn.s.barrier() #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind convergent } diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll index 63384f5e445083cef0c57a7831d0e5cac27db966..693d06ce68d5cade67dfe7dfc60d2daac8925ba4 100644 --- a/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll +++ b/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll @@ -9,11 +9,14 @@ ; CHECK: s_load_dword [[IN:s[0-9]+]] ; CHECK: s_mov_b32 m0, [[IN]] ; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]] -; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]: -define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { +; CHECK: buffer_store_dwordx4 +; CHECK: buffer_store_dwordx4 +; CHECK: buffer_store_dwordx4 +; CHECK: buffer_store_dwordx4 +define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) { entry: - %ins = insertelement <4 x float> , float 5.0, i32 %in - store <4 x float> %ins, <4 x float> addrspace(1)* %out + %ins = insertelement <16 x float> , float 17.0, i32 %in + store <16 x float> %ins, <16 x float> addrspace(1)* %out ret void } diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll b/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll new file mode 100644 index 0000000000000000000000000000000000000000..544ab990fee8457de8356d44d8a8f42434eee699 --- /dev/null +++ b/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll @@ -0,0 +1,86 @@ +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s + +; Tests for indirect addressing on SI, which is implemented using dynamic +; indexing of vectors. + +; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll +; to avoid gfx9 scheduling induced issues. + + +; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: +; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] +; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 + +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]] +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] + +; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; GCN: s_and_saveexec_b64 vcc, vcc + +; MOVREL: s_mov_b32 m0, [[READLANE]] +; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]] + +; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst +; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]] +; IDXMODE: s_set_gpr_idx_off + +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN: s_cbranch_execnz [[LOOP0]] + +; FIXME: Redundant copy +; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]] + +; GCN: s_mov_b64 [[MASK]], exec + +; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: +; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; GCN: s_and_saveexec_b64 vcc, vcc + +; MOVREL: s_mov_b32 m0, [[READLANE]] +; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63 + +; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst +; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63 +; IDXMODE: s_set_gpr_idx_off + +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN: s_cbranch_execnz [[LOOP1]] + +; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: + +; GCN: buffer_store_dword [[INS0]] +define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id.ext = zext i32 %id to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext + %idx0 = load volatile i32, i32 addrspace(1)* %gep + %idx1 = add i32 %idx0, 1 + %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() + %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0 + %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1 + store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0 + %cmp = icmp eq i32 %id, 0 + br i1 %cmp, label %bb1, label %bb2 + +bb1: + store volatile i32 %live.out.val, i32 addrspace(1)* undef + br label %bb2 + +bb2: + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare void @llvm.amdgcn.s.barrier() #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind convergent } diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 8e02303377c001354b7acbe1d24ffa2e2e2490f4..081f623ac13d5f43bb79297ddedc80fb0772a9a0 100644 --- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -22,7 +22,7 @@ define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) { entry: %idx = add i32 %in, 1 - %elt = extractelement <4 x float> , i32 %idx + %elt = extractelement <16 x float> , i32 %idx store float %elt, float addrspace(1)* %out ret void } @@ -44,11 +44,11 @@ entry: ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) { +define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <16 x i32> %or.val) { entry: %idx = add i32 %in, 1 - %vec = or <4 x i32> %or.val, - %elt = extractelement <4 x i32> %vec, i32 %idx + %vec = or <16 x i32> %or.val, + %elt = extractelement <16 x i32> %vec, i32 %idx store i32 %elt, i32 addrspace(1)* %out ret void } @@ -68,7 +68,7 @@ entry: ; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { entry: - %elt = extractelement <4 x float> , i32 %in + %elt = extractelement <16 x float> , i32 %in store float %elt, float addrspace(1)* %out ret void } @@ -79,15 +79,15 @@ entry: ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE: v_mov_b32_e32 v2, 2 -; IDXMODE: v_mov_b32_e32 v3, 3 +; IDXMODE: v_mov_b32_e32 v14, 15 +; IDXMODE: v_mov_b32_e32 v15, 16 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) { entry: %index = add i32 %offset, -512 - %value = extractelement <4 x i32> , i32 %index + %value = extractelement <16 x i32> , i32 %index store i32 %value, i32 addrspace(1)* %out ret void } @@ -102,14 +102,26 @@ entry: ; IDXMODE: v_mov_b32_e32 v1, ; IDXMODE: v_mov_b32_e32 v2, ; IDXMODE: v_mov_b32_e32 v3, +; IDXMODE: v_mov_b32_e32 v4, +; IDXMODE: v_mov_b32_e32 v5, +; IDXMODE: v_mov_b32_e32 v6, +; IDXMODE: v_mov_b32_e32 v7, +; IDXMODE: v_mov_b32_e32 v8, +; IDXMODE: v_mov_b32_e32 v9, +; IDXMODE: v_mov_b32_e32 v10, +; IDXMODE: v_mov_b32_e32 v11, +; IDXMODE: v_mov_b32_e32 v12, +; IDXMODE: v_mov_b32_e32 v13, +; IDXMODE: v_mov_b32_e32 v14, +; IDXMODE: v_mov_b32_e32 v15, ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) { +define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) { entry: %index = add i32 %offset, -512 - %or = or <4 x i32> %vec0, %vec1 - %value = extractelement <4 x i32> %or, i32 %index + %or = or <16 x i32> %vec0, %vec1 + %value = extractelement <16 x i32> %or, i32 %index store i32 %value, i32 addrspace(1)* %out ret void } @@ -138,7 +150,7 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -512 - %value = extractelement <4 x i32> , i32 %index + %value = extractelement <16 x i32> , i32 %index store i32 %value, i32 addrspace(1)* %out ret void } @@ -170,15 +182,16 @@ entry: ; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0 ; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000 ; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0 -; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x40a00000 +; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000 +; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000 ; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]] ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}} -define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) { entry: - %0 = add i32 %in, 1 - %1 = insertelement <4 x float> , float 5.0, i32 %0 - store <4 x float> %1, <4 x float> addrspace(1)* %out + %add = add i32 %in, 1 + %ins = insertelement <16 x float> , float 17.0, i32 %add + store <16 x float> %ins, <16 x float> addrspace(1)* %out ret void } @@ -193,27 +206,27 @@ entry: ; IDXMODE-NEXT: s_set_gpr_idx_off ; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]: -define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) { entry: - %0 = insertelement <4 x float> , float 5.0, i32 %in - store <4 x float> %0, <4 x float> addrspace(1)* %out + %ins = insertelement <16 x float> , float 17.0, i32 %in + store <16 x float> %ins, <16 x float> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}insert_neg_offset_sgpr: ; The offset depends on the register that holds the first element of the vector. ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; MOVREL: v_movreld_b32_e32 v0, 5 +; MOVREL: v_movreld_b32_e32 v0, 16 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst -; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 +; IDXMODE-NEXT: v_mov_b32_e32 v0, 16 ; IDXMODE-NEXT: s_set_gpr_idx_off -define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) { +define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, i32 %offset) { entry: %index = add i32 %offset, -512 - %value = insertelement <4 x i32> , i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out + %value = insertelement <16 x i32> , i32 16, i32 %index + store <16 x i32> %value, <16 x i32> addrspace(1)* %out ret void } @@ -229,11 +242,11 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off -define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) { +define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) { entry: %index = add i32 %offset, -512 - %value = insertelement <4 x i32> %vec, i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out + %value = insertelement <16 x i32> %vec, i32 5, i32 %index + store <16 x i32> %value, <16 x i32> addrspace(1)* %out ret void } @@ -244,6 +257,18 @@ entry: ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}} ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}} ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}} ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: @@ -251,23 +276,23 @@ entry: ; GCN: s_and_saveexec_b64 vcc, vcc ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00 -; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5 +; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 33 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5 +; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 33 ; IDXMODE: s_set_gpr_idx_off ; GCN: s_cbranch_execnz [[LOOPBB]] ; GCN: s_mov_b64 exec, [[SAVEEXEC]] ; GCN: buffer_store_dword -define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -512 - %value = insertelement <4 x i32> , i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out + %value = insertelement <16 x i32> , i32 33, i32 %index + store <16 x i32> %value, <16 x i32> addrspace(1)* %out ret void } @@ -277,6 +302,18 @@ entry: ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}} ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}} ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}} +; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}} ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}} ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec @@ -293,12 +330,12 @@ entry: ; IDXMODE: s_set_gpr_idx_off ; GCN: s_cbranch_execnz -define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -16 - %value = insertelement <4 x i32> , i32 500, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out + %value = insertelement <16 x i32> , i32 500, i32 %index + store <16 x i32> %value, <16 x i32> addrspace(1)* %out ret void } @@ -364,9 +401,9 @@ entry: %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext %idx0 = load volatile i32, i32 addrspace(1)* %gep %idx1 = add i32 %idx0, 1 - %val0 = extractelement <4 x i32> , i32 %idx0 + %val0 = extractelement <16 x i32> , i32 %idx0 %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" () - %val1 = extractelement <4 x i32> , i32 %idx1 + %val1 = extractelement <16 x i32> , i32 %idx1 store volatile i32 %val0, i32 addrspace(1)* %out0 store volatile i32 %val1, i32 addrspace(1)* %out0 %cmp = icmp eq i32 %id, 0 @@ -380,76 +417,9 @@ bb2: ret void } -; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: -; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}} -; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] -; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 - -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]] -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}} -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:3]], s{{[0-9]+}} -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] - -; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] -; GCN: s_and_saveexec_b64 vcc, vcc - -; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]] - -; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst -; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]] -; IDXMODE: s_set_gpr_idx_off - -; GCN-NEXT: s_xor_b64 exec, exec, vcc -; GCN: s_cbranch_execnz [[LOOP0]] - -; FIXME: Redundant copy -; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]] - -; GCN: s_mov_b64 [[MASK]], exec - -; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] -; GCN: s_and_saveexec_b64 vcc, vcc - -; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63 - -; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst -; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63 -; IDXMODE: s_set_gpr_idx_off - -; GCN-NEXT: s_xor_b64 exec, exec, vcc -; GCN: s_cbranch_execnz [[LOOP1]] - -; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: - -; GCN: buffer_store_dword [[INS0]] -define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { -entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 - %id.ext = zext i32 %id to i64 - %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext - %idx0 = load volatile i32, i32 addrspace(1)* %gep - %idx1 = add i32 %idx0, 1 - %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() - %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0 - %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1 - store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0 - %cmp = icmp eq i32 %id, 0 - br i1 %cmp, label %bb1, label %bb2 - -bb1: - store volatile i32 %live.out.val, i32 addrspace(1)* undef - br label %bb2 - -bb2: - ret void -} +; Moved subtest for insert_vgpr_offset_multiple_in_block to separate file to +; avoid very different schedule induced isses with gfx9. +; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll ; GCN-LABEL: {{^}}insert_adjacent_blocks: @@ -483,10 +453,10 @@ bb7: ; preds = %bb4, %bb1 ; GCN: s_load_dword [[ARG:s[0-9]+]] ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000 -; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd ; MOVREL: s_waitcnt ; MOVREL: s_add_i32 m0, [[ARG]], -16 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0 +; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0 ; MOVREL: s_mov_b32 m0, -1 @@ -508,13 +478,13 @@ bb7: ; preds = %bb4, %bb1 define amdgpu_kernel void @multi_same_block(i32 %arg) #0 { bb: %tmp1 = add i32 %arg, -16 - %tmp2 = insertelement <6 x float> , float 4.000000e+00, i32 %tmp1 + %tmp2 = insertelement <9 x float> , float 4.000000e+00, i32 %tmp1 %tmp3 = add i32 %arg, -16 - %tmp4 = insertelement <6 x float> , float -4.0, i32 %tmp3 - %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32> - %tmp6 = extractelement <6 x i32> %tmp5, i32 1 - %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32> - %tmp8 = extractelement <6 x i32> %tmp7, i32 5 + %tmp4 = insertelement <9 x float> , float -4.0, i32 %tmp3 + %tmp5 = bitcast <9 x float> %tmp2 to <9 x i32> + %tmp6 = extractelement <9 x i32> %tmp5, i32 1 + %tmp7 = bitcast <9 x float> %tmp4 to <9 x i32> + %tmp8 = extractelement <9 x i32> %tmp7, i32 5 store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4 store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4 ret void @@ -522,7 +492,7 @@ bb: ; offset puts outside of superegister bounaries, so clamp to 1st element. ; GCN-LABEL: {{^}}extract_largest_inbounds_offset: -; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}} +; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\].* offset:48}} ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]] ; MOVREL: s_mov_b32 m0, [[IDX]] ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]] @@ -532,11 +502,11 @@ bb: ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[EXTRACT]] -define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { +define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) { entry: - %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in - %offset = add i32 %idx, 3 - %value = extractelement <4 x i32> %ld, i32 %offset + %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in + %offset = add i32 %idx, 15 + %value = extractelement <16 x i32> %ld, i32 %offset store i32 %value, i32 addrspace(1)* %out ret void } @@ -544,20 +514,20 @@ entry: ; GCN-LABEL: {{^}}extract_out_of_bounds_offset: ; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}} ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]] -; MOVREL: s_add_i32 m0, [[IDX]], 4 +; MOVREL: s_add_i32 m0, [[IDX]], 16 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] -; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 4 +; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[EXTRACT]] -define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { +define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) { entry: - %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in - %offset = add i32 %idx, 4 - %value = extractelement <4 x i32> %ld, i32 %offset + %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in + %offset = add i32 %idx, 16 + %value = extractelement <16 x i32> %ld, i32 %offset store i32 %value, i32 addrspace(1)* %out ret void } @@ -565,7 +535,7 @@ entry: ; Test that the or is folded into the base address register instead of ; added to m0 -; GCN-LABEL: {{^}}extractelement_v4i32_or_index: +; GCN-LABEL: {{^}}extractelement_v16i32_or_index: ; GCN: s_load_dword [[IDX_IN:s[0-9]+]] ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]] ; GCN-NOT: [[IDX_SHL]] @@ -576,17 +546,17 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off -define amdgpu_kernel void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) { +define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) { entry: - %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in + %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in %idx.shl = shl i32 %idx.in, 2 %idx = or i32 %idx.shl, 1 - %value = extractelement <4 x i32> %ld, i32 %idx + %value = extractelement <16 x i32> %ld, i32 %idx store i32 %value, i32 addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}insertelement_v4f32_or_index: +; GCN-LABEL: {{^}}insertelement_v16f32_or_index: ; GCN: s_load_dword [[IDX_IN:s[0-9]+]] ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]] ; GCN-NOT: [[IDX_SHL]] @@ -597,11 +567,11 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off -define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind { +define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind { %idx.shl = shl i32 %idx.in, 2 %idx = or i32 %idx.shl, 1 - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx + store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 ret void } @@ -636,9 +606,9 @@ bb2: ; preds = %bb4, %bb bb4: ; preds = %bb2 %vgpr = load volatile i32, i32 addrspace(1)* undef - %tmp5 = insertelement <8 x i32> undef, i32 undef, i32 %vgpr - %tmp6 = insertelement <8 x i32> %tmp5, i32 %arg1, i32 %vgpr - %tmp7 = extractelement <8 x i32> %tmp6, i32 0 + %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr + %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr + %tmp7 = extractelement <16 x i32> %tmp6, i32 0 br label %bb2 bb8: ; preds = %bb2 diff --git a/test/CodeGen/AMDGPU/infinite-loop.ll b/test/CodeGen/AMDGPU/infinite-loop.ll index 5005b781ca3af75233a2155ca2696501687cd757..75ad58df43b347a52932d403868246fb90ca997a 100644 --- a/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/test/CodeGen/AMDGPU/infinite-loop.ll @@ -32,8 +32,8 @@ loop: ; SI: s_cbranch_execz [[RET:BB[0-9]+_[0-9]+]] ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7 -; SI: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop ; SI: s_and_b64 vcc, exec, -1 +; SI: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop ; SI: s_waitcnt lgkmcnt(0) ; SI: buffer_store_dword [[REG]] ; SI: s_cbranch_vccnz [[LOOP]] diff --git a/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir b/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir new file mode 100644 index 0000000000000000000000000000000000000000..9427b5cd25420fac63ce0749c6efd11166bc11e4 --- /dev/null +++ b/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir @@ -0,0 +1,320 @@ +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +--- +# GCN-LABEL: name: and_execz_mov_vccz +# GCN-NOT: S_MOV_ +# GCN-NOT: S_AND_ +# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec +name: and_execz_mov_vccz +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $sgpr0_sgpr1 = S_MOV_B64 -1 + $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_imm_vccz +# GCN-NOT: S_AND_ +# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec +name: and_execz_imm_vccz +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execnz_imm_vccnz +# GCN-NOT: S_AND_ +# GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec +name: and_execnz_imm_vccnz +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_imm_vccz_live_scc +# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def $scc +# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec +name: and_execz_imm_vccz_live_scc +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $vcc = S_AND_B64 $exec, -1, implicit-def $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_mov_vccz_live_scc +# GCN-NOT: S_MOV_ +# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def $scc +# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec +name: and_execz_mov_vccz_live_scc +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $sgpr0_sgpr1 = S_MOV_B64 -1 + $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_mov_vccz_live_sreg +# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1 +# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec +name: and_execz_mov_vccz_live_sreg +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $sgpr0_sgpr1 = S_MOV_B64 -1 + $vcc = S_AND_B64 $exec, $sgpr0_sgpr1, implicit-def dead $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_mov_vccz_live_sreg_commute +# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1 +# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec +name: and_execz_mov_vccz_live_sreg_commute +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $sgpr0_sgpr1 = S_MOV_B64 -1 + $vcc = S_AND_B64 $sgpr0_sgpr1, $exec, implicit-def dead $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_mov_vccz_live_scc_commute +# GCN-NOT: S_MOV_ +# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def $scc +# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec +name: and_execz_mov_vccz_live_scc_commute +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $sgpr0_sgpr1 = S_MOV_B64 -1 + $vcc = S_AND_B64 killed $sgpr0_sgpr1, $exec, implicit-def $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_mov_vccz_commute +# GCN-NOT: S_MOV_ +# GCN-NOT: S_AND_ +# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec +name: and_execz_mov_vccz_commute +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $sgpr0_sgpr1 = S_MOV_B64 -1 + $vcc = S_AND_B64 killed $sgpr0_sgpr1, $exec, implicit-def dead $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_mov_exec_vccz +# GCN: $exec = S_MOV_B64 -1 +# GCN-NEXT: S_ENDPGM +name: and_execz_mov_exec_vccz +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $exec = S_MOV_B64 -1 + $vcc = S_AND_B64 $exec, $exec, implicit-def dead $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_mov_exec_vccnz +# GCN: $exec = S_MOV_B64 -1 +# GCN-NEXT: S_BRANCH %bb.1{{$}} +name: and_execz_mov_exec_vccnz +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $exec = S_MOV_B64 -1 + $vcc = S_AND_B64 $exec, $exec, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_mov_vccz_reads_sreg_early +# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1 +# GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr1 +# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec +name: and_execz_mov_vccz_reads_sreg_early +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $sgpr0_sgpr1 = S_MOV_B64 -1 + $sgpr2 = S_MOV_B32 $sgpr1 + $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_mov_vccz_reads_sreg_late +# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1 +# GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr1 +# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec +name: and_execz_mov_vccz_reads_sreg_late +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $sgpr0_sgpr1 = S_MOV_B64 -1 + $vcc = S_AND_B64 $exec, $sgpr0_sgpr1, implicit-def dead $scc + $sgpr2 = S_MOV_B32 $sgpr1 + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +# GCN-LABEL: name: and_execz_mov_vccz_reads_writes_sreg_early +# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1 +# GCN-NEXT: $sgpr1 = S_MOV_B32 $sgpr0 +# GCN-NEXT: $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc +name: and_execz_mov_vccz_reads_writes_sreg_early +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $sgpr0_sgpr1 = S_MOV_B64 -1 + $sgpr1 = S_MOV_B32 $sgpr0 + $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_mov_vccz_reads_cond +# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc +# GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo +# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec +name: and_execz_mov_vccz_reads_cond +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $sgpr0_sgpr1 = S_MOV_B64 -1 + $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc + $sgpr2 = S_MOV_B32 $vcc_lo + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_mov_vccz_modifies_sreg +# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1 +# GCN-NEXT: $sgpr0 = S_MOV_B32 0 +# GCN-NEXT: $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc +name: and_execz_mov_vccz_modifies_sreg +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $sgpr0_sgpr1 = S_MOV_B64 -1 + $sgpr0 = S_MOV_B32 0 + $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM +... +--- +# GCN-LABEL: name: and_execz_imm_vccz_liveout_scc +# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def $scc +# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec +# GCN-NEXT S_ENDPGM implicit $scc +name: and_execz_imm_vccz_liveout_scc +body: | + bb.0: + S_NOP 0 + + bb.1: + S_NOP 0 + + bb.2: + $vcc = S_AND_B64 $exec, -1, implicit-def $scc + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_ENDPGM implicit $scc +... diff --git a/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/test/CodeGen/AMDGPU/insert_vector_dynelt.ll new file mode 100644 index 0000000000000000000000000000000000000000..80309b40e17335da2f5f0294b3ed1670c3d9b5d0 --- /dev/null +++ b/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -0,0 +1,312 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s + +; GCN-LABEL: {{^}}float4_inselt: +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3 +; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]] +; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]] +define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) { +entry: + %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel + store <4 x float> %v, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}float4_inselt_undef: +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN-NOT: v_cmp_ +; GCN-NOT: v_cndmask_ +; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]] +; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]] +; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]] +define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) { +entry: + %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel + store <4 x float> %v, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}int4_inselt: +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3 +; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1, v{{[0-9]+}}, [[CC1]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC2]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC3]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1, v{{[0-9]+}}, [[CC4]] +; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]] +define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) { +entry: + %v = insertelement <4 x i32> %vec, i32 1, i32 %sel + store <4 x i32> %v, <4 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}float2_inselt: +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]] +; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]] +define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) { +entry: + %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel + store <2 x float> %v, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}float8_inselt: +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3 +; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX:s[0-9]+]], 7 +; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4 +; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]] +; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]] +; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]] +define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) { +entry: + %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel + store <8 x float> %v, <8 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}float16_inselt: +; GCN: v_movreld_b32 +define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) { +entry: + %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel + store <16 x float> %v, <16 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}half4_inselt: +; GCN-NOT: v_cndmask_b32 +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4 +; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c00 +define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) { +entry: + %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel + store <4 x half> %v, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}half2_inselt: +; GCN-NOT: v_cndmask_b32 +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4 +; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]] +; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) { +entry: + %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel + store <2 x half> %v, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}half8_inselt: +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0 +; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1 +; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2 +; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3 +; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4 +; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5 +; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6 +; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_or_b32_sdwa +; GCN-DAG: v_or_b32_sdwa +; GCN-DAG: v_or_b32_sdwa +; GCN-DAG: v_or_b32_sdwa +define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) { +entry: + %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel + store <8 x half> %v, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}short2_inselt: +; GCN-NOT: v_cndmask_b32 +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4 +; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]] +; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], 1, v{{[0-9]+}} +define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) { +entry: + %v = insertelement <2 x i16> %vec, i16 1, i32 %sel + store <2 x i16> %v, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}short4_inselt: +; GCN-NOT: v_cndmask_b32 +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4 +; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 +define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) { +entry: + %v = insertelement <4 x i16> %vec, i16 1, i32 %sel + store <4 x i16> %v, <4 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}byte8_inselt: +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3 +; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 +define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) { +entry: + %v = insertelement <8 x i8> %vec, i8 1, i32 %sel + store <8 x i8> %v, <8 x i8> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}byte16_inselt: +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0 +; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 15 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: v_or_b32_sdwa +; GCN-DAG: v_or_b32_sdwa +; GCN-DAG: v_or_b32_sdwa +; GCN-DAG: v_or_b32_sdwa +; GCN-DAG: v_or_b32_sdwa +; GCN-DAG: v_or_b32_sdwa +; GCN-DAG: v_or_b32_sdwa +; GCN-DAG: v_or_b32_sdwa +define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) { +entry: + %v = insertelement <16 x i8> %vec, i8 1, i32 %sel + store <16 x i8> %v, <16 x i8> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}double2_inselt: +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]] +define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) { +entry: + %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel + store <2 x double> %v, <2 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}double8_inselt: +; GCN-NOT: v_cndmask +; GCN: buffer_store_dword +; GCN: buffer_store_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) { +entry: + %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel + store <8 x double> %v, <8 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}bit4_inselt: +; GCN: buffer_store_byte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) { +entry: + %v = insertelement <4 x i1> %vec, i1 1, i32 %sel + store <4 x i1> %v, <4 x i1> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}bit128_inselt: +; GCN-NOT: buffer_ +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]] +; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f +; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]] +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]] +define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) { +entry: + %v = insertelement <128 x i1> %vec, i1 1, i32 %sel + store <128 x i1> %v, <128 x i1> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll index 692696ff7302b8b0483f3e9c1067d5f13cf9d51a..93ee16ea85d85369e49a75a8fdf3f81794e5c890 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -83,8 +83,11 @@ define <4 x float> @insertelement_to_sgpr() nounwind { } ; GCN-LABEL: {{^}}dynamic_insertelement_v2f32: -; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 -; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] +; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 +; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]] ; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b @@ -93,10 +96,14 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* } ; GCN-LABEL: {{^}}dynamic_insertelement_v3f32: -; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 -; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] -; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: -; GCN-DAG: buffer_store_dword v +; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 +; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]] +; GCN-DAG: buffer_store_dwordx3 v define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 @@ -104,8 +111,15 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* } ; GCN-LABEL: {{^}}dynamic_insertelement_v4f32: -; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 -; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] +; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 +; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC4]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]] ; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]: define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b @@ -114,7 +128,11 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* } ; GCN-LABEL: {{^}}dynamic_insertelement_v8f32: -; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 +; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CCL]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]] ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { @@ -136,8 +154,11 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1 } ; GCN-LABEL: {{^}}dynamic_insertelement_v2i32: -; GCN: v_movreld_b32 -; GCN: buffer_store_dwordx2 +; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5, v{{[0-9]+}}, [[CC1]] +; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { %vecins = insertelement <2 x i32> %a, i32 5, i32 %b store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 @@ -145,9 +166,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* % } ; GCN-LABEL: {{^}}dynamic_insertelement_v3i32: -; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5 -; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: -; GCN-DAG: buffer_store_dword v +; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC3]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]] +; GCN-DAG: buffer_store_dwordx3 v define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { %vecins = insertelement <3 x i32> %a, i32 5, i32 %b store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 @@ -156,8 +181,15 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* % ; GCN-LABEL: {{^}}dynamic_insertelement_v4i32: ; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}} -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] -; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]] +; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC4]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC3]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC2]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC1]] ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b @@ -166,7 +198,10 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* % } ; GCN-LABEL: {{^}}dynamic_insertelement_v8i32: -; GCN: v_movreld_b32 +; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]] ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { @@ -229,8 +264,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] -; VI: s_not_b32 [[NOT_MASK:s[0-9]+]], [[SHIFTED_MASK]] -; VI: s_and_b32 [[AND_NOT_MASK:s[0-9]+]], [[NOT_MASK]], [[LOAD]] +; VI: s_andn2_b32 [[AND_NOT_MASK:s[0-9]+]], [[LOAD]], [[SHIFTED_MASK]] ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]] ; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16 @@ -269,8 +303,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff ; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]] -; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}} -; VI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[NOT_MASK]], [[VEC]] +; VI: s_andn2_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[VEC]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}} ; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5 ; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]] ; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]] @@ -288,24 +321,13 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* % ; GCN: s_load_dwordx4 ; GCN: s_load_dword s -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte - -; GCN: buffer_store_byte +; GCN-NOT: buffer_store_byte + +; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 15 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]] +; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]] + ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { %vecins = insertelement <16 x i8> %a, i8 5, i32 %b @@ -343,23 +365,18 @@ endif: ; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}} ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}} -; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}} - ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000 -; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]] -; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0 - -; Increment to next element folded into base register, but FileCheck -; can't do math expressions - -; FIXME: Should be able to manipulate m0 directly instead of s_lshl_b32 + copy to m0 - -; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC2]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC1]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]] ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm @@ -371,8 +388,12 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1) ; GCN-LABEL: {{^}}dynamic_insertelement_v2i64: -; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 5 -; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0 +; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1 +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]] ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm @@ -383,34 +404,41 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* % } ; GCN-LABEL: {{^}}dynamic_insertelement_v3i64: +; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2 +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC3]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1 +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]] define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { %vecins = insertelement <3 x i64> %a, i64 5, i32 %b store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32 ret void } -; FIXME: Should be able to do without stack access. The used stack -; space is also 2x what should be required. - ; GCN-LABEL: {{^}}dynamic_insertelement_v4f64: -; Stack store - -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}} - -; Write element -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}} - -; Stack reload -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}} +; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40200000 +; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC4]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC4]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC3]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC2]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]] +; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0 +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC1]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]] -; Store result ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -; GCN: ScratchSize: 64 +; GCN: ScratchSize: 0 define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { %vecins = insertelement <4 x double> %a, double 8.0, i32 %b diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index d9b253328196b8cdf410b8a84265e50956e75302..b4fb59983cbbe934cb8083274cef57a991ab60ae 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -444,40 +444,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac ret void } -; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: -; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 - -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] - -; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] -; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] - -; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] -; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] - -; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 - %tid.ext = sext i32 %tid to i64 - %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext - %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext - %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext - %idx = load i32, i32 addrspace(1)* %idx.gep - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep - %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx - store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep - ret void -} - ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234 -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]] ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll new file mode 100644 index 0000000000000000000000000000000000000000..e35cf7a349ac572da1bd3985d7b81259205c0718 --- /dev/null +++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll @@ -0,0 +1,36 @@ +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 + +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] + +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll new file mode 100644 index 0000000000000000000000000000000000000000..07ee65526c9610c2badeda24bb719fb8dab72c23 --- /dev/null +++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll @@ -0,0 +1,36 @@ +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: + +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] + +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 + +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll index 11067522f8573f11303d908cc7abd17811a0f540..6b6473124bb9f468b7f6beec31d3d22a609f71a6 100644 --- a/test/CodeGen/AMDGPU/kernel-args.ll +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -1,19 +1,19 @@ ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-GFX9,FUNC %s ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s ; FUNC-LABEL: {{^}}i8_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff -; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 -; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff +; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff ; EGCM: VTX_READ_8{{.*}} #3 @@ -25,13 +25,13 @@ define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) noun } ; FUNC-LABEL: {{^}}i8_zext_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 -; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff +; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff ; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, @@ -49,15 +49,15 @@ define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zero } ; FUNC-LABEL: {{^}}i8_sext_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 -; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]] -; HSA-VI: flat_store_dword +; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-GFX9: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]] +; HSA-GFX9: global_store_dword ; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, @@ -75,17 +75,17 @@ define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 sign } ; FUNC-LABEL: {{^}}i16_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff -; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 -; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} -; HSA-VI: flat_store_dword +; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} +; HSA-GFX9: global_store_dword ; EGCM: VTX_READ_16 ; EGCM: KC0[2].Y @@ -96,15 +96,15 @@ define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) no } ; FUNC-LABEL: {{^}}i16_zext_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 -; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} -; HSA-VI: flat_store_dword +; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} +; HSA-GFX9: global_store_dword ; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, @@ -121,16 +121,16 @@ define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 ze } ; FUNC-LABEL: {{^}}i16_sext_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 -; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]] -; HSA-VI: flat_store_dword +; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-GFX9: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]] +; HSA-GFX9: global_store_dword ; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, @@ -147,13 +147,13 @@ define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 si } ; FUNC-LABEL: {{^}}i32_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8 +; HSA-GFX9: s_load_dword s{{[0-9]}}, s[4:5], 0x8 define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { entry: store i32 %in, i32 addrspace(1)* %out, align 4 @@ -161,12 +161,12 @@ entry: } ; FUNC-LABEL: {{^}}f32_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { entry: store float %in, float addrspace(1)* %out, align 4 @@ -174,8 +174,8 @@ entry: } ; FUNC-LABEL: {{^}}v2i8_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 @@ -189,15 +189,15 @@ entry: } ; FUNC-LABEL: {{^}}v2i16_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(1)* %out @@ -205,14 +205,14 @@ entry: } ; FUNC-LABEL: {{^}}v2i32_arg: -; HSA-VI: kernarg_segment_byte_size = 16 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 16 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c -; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 +; HSA-GFX9: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { entry: store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 @@ -220,14 +220,14 @@ entry: } ; FUNC-LABEL: {{^}}v2f32_arg: -; HSA-VI: kernarg_segment_byte_size = 16 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 16 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c -; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8 +; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8 define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { entry: store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 @@ -235,8 +235,8 @@ entry: } ; FUNC-LABEL: {{^}}v3i8_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 @@ -253,8 +253,8 @@ entry: } ; FUNC-LABEL: {{^}}v3i16_arg: -; HSA-VI: kernarg_segment_byte_size = 16 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 16 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44 ; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 @@ -271,14 +271,14 @@ entry: } ; FUNC-LABEL: {{^}}v3i32_arg: -; HSA-VI: kernarg_segment_byte_size = 32 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 32 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 -; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 +; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { entry: store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 @@ -286,14 +286,14 @@ entry: } ; FUNC-LABEL: {{^}}v3f32_arg: -; HSA-VI: kernarg_segment_byte_size = 32 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 32 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 -; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 +; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { entry: store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 @@ -301,8 +301,8 @@ entry: } ; FUNC-LABEL: {{^}}v4i8_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 @@ -317,8 +317,8 @@ entry: } ; FUNC-LABEL: {{^}}v4i16_arg: -; HSA-VI: kernarg_segment_byte_size = 16 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 16 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 @@ -334,8 +334,8 @@ entry: ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c -; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 -; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { entry: store <4 x i16> %in, <4 x i16> addrspace(1)* %out @@ -343,8 +343,8 @@ entry: } ; FUNC-LABEL: {{^}}v4i32_arg: -; HSA-VI: kernarg_segment_byte_size = 32 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 32 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W @@ -352,7 +352,7 @@ entry: ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 -; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 +; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { entry: store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 @@ -360,15 +360,15 @@ entry: } ; FUNC-LABEL: {{^}}v4f32_arg: -; HSA-VI: kernarg_segment_byte_size = 32 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 32 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 -; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 +; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { entry: store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 @@ -377,8 +377,8 @@ entry: ; FIXME: Lots of unpack and re-pack junk on VI ; FUNC-LABEL: {{^}}v8i8_arg: -; HSA-VI: kernarg_segment_byte_size = 16 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 16 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 @@ -405,8 +405,8 @@ entry: } ; FUNC-LABEL: {{^}}v8i16_arg: -; HSA-VI: kernarg_segment_byte_size = 32 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 32 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 @@ -423,7 +423,7 @@ entry: ; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34 -; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 +; HSA-GFX9: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { entry: store <8 x i16> %in, <8 x i16> addrspace(1)* %out @@ -431,8 +431,8 @@ entry: } ; FUNC-LABEL: {{^}}v8i32_arg: -; HSA-VI: kernarg_segment_byte_size = 64 -; HSA-VI: kernarg_segment_alignment = 5 +; HSA-GFX9: kernarg_segment_byte_size = 64 +; HSA-GFX9: kernarg_segment_alignment = 5 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W @@ -444,7 +444,7 @@ entry: ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 -; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20 +; HSA-GFX9: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20 define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { entry: store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 @@ -452,8 +452,8 @@ entry: } ; FUNC-LABEL: {{^}}v8f32_arg: -; HSA-VI: kernarg_segment_byte_size = 64 -; HSA-VI: kernarg_segment_alignment = 5 +; HSA-GFX9: kernarg_segment_byte_size = 64 +; HSA-GFX9: kernarg_segment_alignment = 5 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W @@ -472,8 +472,8 @@ entry: ; FIXME: Pack/repack on VI ; FUNC-LABEL: {{^}}v16i8_arg: -; HSA-VI: kernarg_segment_byte_size = 32 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 32 +; HSA-GFX9: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 @@ -508,8 +508,8 @@ entry: } ; FUNC-LABEL: {{^}}v16i16_arg: -; HSA-VI: kernarg_segment_byte_size = 64 -; HSA-VI: kernarg_segment_alignment = 5 +; HSA-GFX9: kernarg_segment_byte_size = 64 +; HSA-GFX9: kernarg_segment_alignment = 5 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 @@ -535,7 +535,7 @@ entry: ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 -; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 +; HSA-GFX9: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { entry: store <16 x i16> %in, <16 x i16> addrspace(1)* %out @@ -543,8 +543,8 @@ entry: } ; FUNC-LABEL: {{^}}v16i32_arg: -; HSA-VI: kernarg_segment_byte_size = 128 -; HSA-VI: kernarg_segment_alignment = 6 +; HSA-GFX9: kernarg_segment_byte_size = 128 +; HSA-GFX9: kernarg_segment_alignment = 6 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W @@ -563,7 +563,7 @@ entry: ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 -; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 +; HSA-GFX9: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { entry: store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 @@ -571,8 +571,8 @@ entry: } ; FUNC-LABEL: {{^}}v16f32_arg: -; HSA-VI: kernarg_segment_byte_size = 128 -; HSA-VI: kernarg_segment_alignment = 6 +; HSA-GFX9: kernarg_segment_byte_size = 128 +; HSA-GFX9: kernarg_segment_alignment = 6 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W @@ -591,7 +591,7 @@ entry: ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 -; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 +; HSA-GFX9: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { entry: store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 @@ -600,7 +600,7 @@ entry: ; FUNC-LABEL: {{^}}kernel_arg_i64: ; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24 -; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 +; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; MESA-GCN: buffer_store_dwordx2 define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { @@ -613,7 +613,7 @@ define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwi ; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 ; MESA-GCN: buffer_store_dwordx2 -; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 +; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) { entry: store double %in, double addrspace(1)* %out @@ -630,10 +630,10 @@ entry: ; } ; FUNC-LABEL: {{^}}i65_arg: -; HSA-VI: kernarg_segment_byte_size = 24 -; HSA-VI: kernarg_segment_alignment = 4 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-GFX9: kernarg_segment_byte_size = 24 +; HSA-GFX9: kernarg_segment_alignment = 4 +; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind { entry: store i65 %in, i65 addrspace(1)* %out, align 4 @@ -641,20 +641,20 @@ entry: } ; FUNC-LABEL: {{^}}i1_arg: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; GCN: s_load_dword s ; GCN: s_and_b32 -; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat|global}}_store_byte define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { store i1 %x, i1 addrspace(1)* %out, align 1 ret void } ; FUNC-LABEL: {{^}}i1_arg_zext_i32: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; GCN: s_load_dword ; SGCN: buffer_store_dword @@ -665,11 +665,11 @@ define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwi } ; FUNC-LABEL: {{^}}i1_arg_zext_i64: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; GCN: s_load_dword s -; GCN: {{buffer|flat}}_store_dwordx2 +; GCN: {{buffer|flat|global}}_store_dwordx2 define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = zext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 @@ -677,11 +677,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwi } ; FUNC-LABEL: {{^}}i1_arg_sext_i32: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; GCN: s_load_dword -; GCN: {{buffer|flat}}_store_dword +; GCN: {{buffer|flat|global}}_store_dword define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i32 store i32 %ext, i32addrspace(1)* %out, align 4 @@ -689,12 +689,12 @@ define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwi } ; FUNC-LABEL: {{^}}i1_arg_sext_i64: -; HSA-VI: kernarg_segment_byte_size = 12 -; HSA-VI: kernarg_segment_alignment = 4 +; HSA-GFX9: kernarg_segment_byte_size = 12 +; HSA-GFX9: kernarg_segment_alignment = 4 ; GCN: s_load_dword ; GCN: s_bfe_i64 -; GCN: {{buffer|flat}}_store_dwordx2 +; GCN: {{buffer|flat|global}}_store_dwordx2 define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 @@ -702,7 +702,7 @@ define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwi } ; FUNC-LABEL: {{^}}empty_struct_arg: -; HSA-VI: kernarg_segment_byte_size = 0 +; HSA-GFX9: kernarg_segment_byte_size = 0 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { ret void } @@ -718,11 +718,11 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { ; FIXME: Total argument size is computed wrong ; FUNC-LABEL: {{^}}struct_argument_alignment: -; HSA-VI: kernarg_segment_byte_size = 40 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 +; HSA-GFX9: kernarg_segment_byte_size = 40 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 +; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { %val0 = extractvalue {i32, i64} %arg0, 0 %val1 = extractvalue {i32, i64} %arg0, 1 @@ -738,11 +738,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; No padding between i8 and next struct, but round up at end to 4 byte ; multiple. ; FUNC-LABEL: {{^}}packed_struct_argument_alignment: -; HSA-VI: kernarg_segment_byte_size = 28 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 +; HSA-GFX9: kernarg_segment_byte_size = 28 +; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 +; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 @@ -756,12 +756,12 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, } ; GCN-LABEL: {{^}}struct_argument_alignment_after: -; HSA-VI: kernarg_segment_byte_size = 64 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 -; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30 +; HSA-GFX9: kernarg_segment_byte_size = 64 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 +; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 +; HSA-GFX9: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30 define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { %val0 = extractvalue {i32, i64} %arg0, 0 %val1 = extractvalue {i32, i64} %arg0, 1 @@ -776,10 +776,10 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, } ; GCN-LABEL: {{^}}array_3xi32: -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x4 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0xc define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { store volatile i16 %arg0, i16 addrspace(1)* undef store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef @@ -788,13 +788,19 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; FIXME: Why not all scalar loads? ; GCN-LABEL: {{^}}array_3xi16: -; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2 -; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0 -; HSA-VI: flat_load_ushort -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4 +; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:2 +; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4 +; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:6 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { store volatile i8 %arg0, i8 addrspace(1)* undef store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef ret void } + +; GCN-LABEL: {{^}}small_array_round_down_offset: +; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:1 +define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { + %val = extractvalue [1 x i8] %arg, 0 + store volatile i8 %val, i8 addrspace(1)* undef + ret void +} diff --git a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll index a1bb6c28e7402bed39aa12da3631becacaf2cd3b..b7344cfb33c63c975e3b9fc49b3020d446217a37 100644 --- a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s ; Repeat of some problematic tests in kernel-args.ll, with the IR ; argument lowering pass disabled. Struct padding needs to be diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll index d8cf52341e39fbbb3b692a7ab96fbbd42dac1f49..0343052601f430d1637dabba3e58e722f38ac473 100644 --- a/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s ; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s ; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s -; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3,-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s ; FIXME: align on alloca seems to be ignored for private_segment_alignment diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll deleted file mode 100644 index 6374c9e02edde48eea02d7b7cc421cad2c5b3242..0000000000000000000000000000000000000000 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}kill_gs_const: -; SI-NOT: v_cmpx_le_f32 -; SI: s_mov_b64 exec, 0 -define amdgpu_gs void @kill_gs_const() { -main_body: - %tmp = icmp ule i32 0, 3 - %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %tmp1) - %tmp2 = icmp ule i32 3, 0 - %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %tmp3) - ret void -} - -; SI-LABEL: {{^}}kill_vcc_implicit_def: -; SI-NOT: v_cmp_gt_f32_e32 vcc, -; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} -; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] -define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(4)* byval %arg, [17 x <16 x i8>] addrspace(4)* byval %arg1, [17 x <4 x i32>] addrspace(4)* byval %arg2, [34 x <8 x i32>] addrspace(4)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) { -entry: - %tmp0 = fcmp olt float %arg13, 0.000000e+00 - call void @llvm.AMDGPU.kill(float %arg14) - %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00 - call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 - ret void -} - -declare void @llvm.AMDGPU.kill(float) #0 -declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 - -attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll deleted file mode 100644 index 1d370aba6da3496929725f665470c1aad7cb6ee3..0000000000000000000000000000000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll +++ /dev/null @@ -1,54 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s - -; Example of a simple geometry shader loading vertex attributes from the -; ESGS ring buffer - -; FIXME: Out of bounds immediate offset crashes - -; CHECK-LABEL: {{^}}main: -; CHECK: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc -; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc - -define amdgpu_vs void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <32 x i8>] addrspace(4)* byval %arg2, [2 x <4 x i32>] addrspace(4)* byval %arg3, [17 x <4 x i32>] addrspace(4)* inreg %arg4, [17 x <4 x i32>] addrspace(4)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) { -main_body: - %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(4)* %arg3, i64 0, i32 1 - %tmp10 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 - %tmp11 = shl i32 %arg6, 2 - %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) - %tmp13 = bitcast i32 %tmp12 to float - %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) - %tmp15 = bitcast i32 %tmp14 to float - %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) - %tmp17 = bitcast i32 %tmp16 to float - %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp19 = bitcast i32 %tmp18 to float - - %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp21 = bitcast i32 %tmp20 to float - - %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp23 = bitcast i32 %tmp22 to float - - call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp13, float %tmp15, float %tmp17, float %tmp19, i1 false, i1 false) - call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp21, float %tmp23, float %tmp23, float %tmp23, i1 true, i1 false) - ret void -} - -; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0 - -declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 - -attributes #0 = { nounwind readonly } -attributes #1 = { nounwind inaccessiblememonly } - -!0 = !{!"const", !1, i32 1} -!1 = !{!"tbaa root"} diff --git a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll deleted file mode 100644 index 01b76422c03f83772c771e73149c9cebea9d988b..0000000000000000000000000000000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll +++ /dev/null @@ -1,75 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}test1: -;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 glc slc -define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) { - %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, - i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test1_idx: -;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:32 glc slc -define amdgpu_vs void @test1_idx(i32 %a1, i32 %vaddr) { - %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, - i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test1_scalar_offset: -;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, {{s[0-9]+}} idxen offset:32 glc slc -define amdgpu_vs void @test1_scalar_offset(i32 %a1, i32 %vaddr, i32 inreg %soffset) { - %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, - i32 4, i32 %vaddr, i32 %soffset, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test1_no_glc_slc: -;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 -define amdgpu_vs void @test1_no_glc_slc(i32 %a1, i32 %vaddr) { - %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, - i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 0, - i32 0, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test2: -;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 offen offset:24 glc slc -define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) { - %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, - i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test3: -;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:11, nfmt:4, 0 offen offset:16 glc slc -define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) { - %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v2i32(<4 x i32> undef, <2 x i32> %vdata, - i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test4: -;CHECK: tbuffer_store_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:4, nfmt:4, 0 offen offset:8 glc slc -define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) { - call void @llvm.SI.tbuffer.store.i32(<4 x i32> undef, i32 %vdata, - i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -declare void @llvm.SI.tbuffer.store.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll index 49ca7d405724b6693c2235b59199c0b1010e1ff6..ba9f206f1512ca0110c0bce7a746c65ca526933e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -193,6 +193,22 @@ main_body: ret void } +;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +main_body: + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + ;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged: ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 @@ -227,6 +243,20 @@ main_body: ret void } +;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) { +main_body: + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) + ret void +} + declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll index 48a30c7f57511556be18ba7223f3fdef3f7b3c87..7e2996ea92f419266d4f388c6db8ab8a44eb1584 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -147,6 +147,41 @@ define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 % ret void } +;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 +define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) { + %a1 = add i32 %a, 28 + %a2 = add i32 %a, 32 + %a3 = add i32 %a, 36 + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + ret void +} + ;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 @@ -164,12 +199,40 @@ define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, floa ;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { +define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) { call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) ret void } +;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged2: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged3: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:8 +define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + ret void +} + declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll index b6f9f951d9bda2931737f88dbd3519e2bdaa9fd4..a2f2ced72e7adc97fecc0e9bee73612dc95ab63e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s ; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index 5853d8d8e4e1d296e2b5e959b6ebbf2e6eb2d49c..ee039a392e2644658b3914b08a9b99f87a6d1475 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,OS-MESA3D,MESA,ALL %s ; RUN: llc -mtriple=amdgcn-mesa-unknown -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,MESA,ALL %s diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll index f8c60451ac73656215edb696f9200b75d027e9f4..6866d9537b3e55018df4c1edb49a491b9efdc00d 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s ; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll new file mode 100644 index 0000000000000000000000000000000000000000..eae81fee4393690e44d99399d74c3cc26d041ed9 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll @@ -0,0 +1,114 @@ +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}s_buffer_load_imm: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4 +define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) { +main_body: + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_load_index: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx2_imm: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40 +define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) { +main_body: + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0) + %bitcast = bitcast <2 x i32> %load to <2 x float> + %x = extractelement <2 x float> %bitcast, i32 0 + %y = extractelement <2 x float> %bitcast, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx2_index: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0) + %bitcast = bitcast <2 x i32> %load to <2 x float> + %x = extractelement <2 x float> %bitcast, i32 0 + %y = extractelement <2 x float> %bitcast, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx4_imm: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8 +define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) { +main_body: + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0) + %bitcast = bitcast <4 x i32> %load to <4 x float> + %x = extractelement <4 x float> %bitcast, i32 0 + %y = extractelement <4 x float> %bitcast, i32 1 + %z = extractelement <4 x float> %bitcast, i32 2 + %w = extractelement <4 x float> %bitcast, i32 3 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_loadx4_index: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0) + %bitcast = bitcast <4 x i32> %load to <4 x float> + %x = extractelement <4 x float> %bitcast, i32 0 + %y = extractelement <4 x float> %bitcast, i32 1 + %z = extractelement <4 x float> %bitcast, i32 2 + %w = extractelement <4 x float> %bitcast, i32 3 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex2: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4 +define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) { +main_body: + %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) + %x = bitcast i32 %load0 to float + %y = bitcast i32 %load1 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex4: +;CHECK-NOT: s_waitcnt; +;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8 +define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) { +main_body: + %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0) + %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0) + %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0) + %x = bitcast i32 %load0 to float + %y = bitcast i32 %load1 to float + %z = bitcast i32 %load2 to float + %w = bitcast i32 %load3 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) +declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index 9fe0c0f8fc1fda1ad9efe9641fd245aced9433d8..18abf607aea5a2ddef0df877bd05145a0a1a69dd 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s ; VI-LABEL: {{^}}dpp_test: ; VI: v_mov_b32_e32 v0, s{{[0-9]+}} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll index 349e7f0f0e8da3c9fbb82cc39277bc7632f5ea00..377785e0ca2d14d4c9c709afde3aa39f1b27f5ef 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA %s ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=UNKNOWN-OS -check-prefix=SI-MESA %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=UNKNOWN-OS -check-prefix=VI-MESA %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll index 8b80998cab6fe90f1bae91f2ba69864072555db0..13e204b03a039dd3b9def4ab44e4ce98ecf3f596 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA %s ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll index 1946e6a3618673d69a4373e6264f80d8f57ffc30..57af85b67bf3be099740af8de09b67a6d19a9a15 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll @@ -33,20 +33,17 @@ main_body: ;CHECK-LABEL: {{^}}kill: ;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1 ;CHECK: s_wqm_b64 [[WQM:[^,]+]], [[CMP]] -;FIXME: This could just be: s_and_b64 exec, exec, [[WQM]] -;CHECK: v_cndmask_b32_e64 [[KILL:[^,]+]], -1.0, 1.0, [[WQM]] -;CHECK: v_cmpx_le_f32_e32 {{[^,]+}}, 0, [[KILL]] +;CHECK: s_and_b64 exec, exec, [[WQM]] ;CHECK: s_endpgm define amdgpu_ps void @kill(i32 %v0, i32 %v1) #1 { main_body: %c = icmp eq i32 %v0, %v1 %w = call i1 @llvm.amdgcn.wqm.vote(i1 %c) - %r = select i1 %w, float 1.0, float -1.0 - call void @llvm.AMDGPU.kill(float %r) + call void @llvm.amdgcn.kill(i1 %w) ret void } -declare void @llvm.AMDGPU.kill(float) #1 +declare void @llvm.amdgcn.kill(i1) #1 declare i1 @llvm.amdgcn.wqm.vote(i1) attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/load-lo16.ll b/test/CodeGen/AMDGPU/load-lo16.ll index eec9144cda285f49631e24f46088437d87a31aba..b7512f24c207743f6142a0b389954264fa282f63 100644 --- a/test/CodeGen/AMDGPU/load-lo16.ll +++ b/test/CodeGen/AMDGPU/load-lo16.ll @@ -305,8 +305,11 @@ entry: ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 -; VI: flat_load_ubyte v{{[0-9]+}} -; VI: v_or_b32_e32 +; VI: flat_load_ubyte [[LO:v[0-9]+]] +; VI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, v2 +; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00 +; VI: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]] +; VI: flat_store_dword v[0:1], [[RES]] define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll index 87c18a7fc449fbf78100e07a99d9b709072f0a79..f0dca0780aaef82c1da1ca87aeb4603f098f564f 100644 --- a/test/CodeGen/AMDGPU/local-64.ll +++ b/test/CodeGen/AMDGPU/local-64.ll @@ -48,7 +48,7 @@ define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on ; SI, which is why it is being OR'd with the base pointer. -; SI-DAG: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 +; SI-DAG: s_bitset1_b32 [[ADDR:s[0-9]+]], 16 ; CI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 ; VI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 ; GFX9-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 diff --git a/test/CodeGen/AMDGPU/local-atomics64.ll b/test/CodeGen/AMDGPU/local-atomics64.ll index fc7f9f7615ad443a634782bb133e0eae202d7e75..af64f2b1d578ee16eec77410793042a3c6e58a74 100644 --- a/test/CodeGen/AMDGPU/local-atomics64.ll +++ b/test/CodeGen/AMDGPU/local-atomics64.ll @@ -364,7 +364,7 @@ define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) noun ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 ; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN: ds_add_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll index 3528eaff8187d6336e467e2b8e4b6df32409a9ff..df1ef1cba71c8e5c21d33ae1aba4a4c7b5289f66 100644 --- a/test/CodeGen/AMDGPU/madak.ll +++ b/test/CodeGen/AMDGPU/madak.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9 %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare float @llvm.fabs.f32(float) nounwind readnone @@ -8,8 +8,10 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN-LABEL: {{^}}madak_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -88,8 +90,10 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %o ; GCN-LABEL: {{^}}madak_inline_imm_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll b/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll new file mode 100644 index 0000000000000000000000000000000000000000..f692c763c0b94e95bf02779c6aaaaa5676b58d86 --- /dev/null +++ b/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll @@ -0,0 +1,222 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx800 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s + +; FUNC-LABEL: {{^}}system_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_acquire() { +entry: + fence acquire + ret void +} + +; FUNC-LABEL: {{^}}system_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_release() { +entry: + fence release + ret void +} + +; FUNC-LABEL: {{^}}system_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_acq_rel() { +entry: + fence acq_rel + ret void +} + +; FUNC-LABEL: {{^}}system_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_seq_cst() { +entry: + fence seq_cst + ret void +} + +; FUNC-LABEL: {{^}}singlethread_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_acquire() { +entry: + fence syncscope("singlethread") acquire + ret void +} + +; FUNC-LABEL: {{^}}singlethread_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_release() { +entry: + fence syncscope("singlethread") release + ret void +} + +; FUNC-LABEL: {{^}}singlethread_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_acq_rel() { +entry: + fence syncscope("singlethread") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}singlethread_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_seq_cst() { +entry: + fence syncscope("singlethread") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}agent_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_acquire() { +entry: + fence syncscope("agent") acquire + ret void +} + +; FUNC-LABEL: {{^}}agent_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_release() { +entry: + fence syncscope("agent") release + ret void +} + +; FUNC-LABEL: {{^}}agent_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_acq_rel() { +entry: + fence syncscope("agent") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}agent_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_seq_cst() { +entry: + fence syncscope("agent") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}workgroup_acquire: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_acquire() { +entry: + fence syncscope("workgroup") acquire + ret void +} + +; FUNC-LABEL: {{^}}workgroup_release: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_release() { +entry: + fence syncscope("workgroup") release + ret void +} + +; FUNC-LABEL: {{^}}workgroup_acq_rel: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_acq_rel() { +entry: + fence syncscope("workgroup") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}workgroup_seq_cst: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_seq_cst() { +entry: + fence syncscope("workgroup") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}wavefront_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_acquire() { +entry: + fence syncscope("wavefront") acquire + ret void +} + +; FUNC-LABEL: {{^}}wavefront_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_release() { +entry: + fence syncscope("wavefront") release + ret void +} + +; FUNC-LABEL: {{^}}wavefront_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_acq_rel() { +entry: + fence syncscope("wavefront") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}wavefront_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_seq_cst() { +entry: + fence syncscope("wavefront") seq_cst + ret void +} diff --git a/test/CodeGen/AMDGPU/memory-legalizer-load.ll b/test/CodeGen/AMDGPU/memory-legalizer-load.ll index 76ab6259b4253ea4a952b0ee132d2c6841db2114..179cb3f625d9853c69ea48e6ead5e2f8cedd47a0 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ b/test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -319,7 +319,7 @@ entry: ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} -; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( i32 addrspace(1)* %in, i32* %out) { entry: diff --git a/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll b/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll new file mode 100644 index 0000000000000000000000000000000000000000..609dc0400c490034e20fe58c51a86dafa8631e05 --- /dev/null +++ b/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll @@ -0,0 +1,222 @@ +; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s +; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s +; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx800 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s +; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s + +; FUNC-LABEL: {{^}}system_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_acquire() { +entry: + fence acquire + ret void +} + +; FUNC-LABEL: {{^}}system_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_release() { +entry: + fence release + ret void +} + +; FUNC-LABEL: {{^}}system_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_acq_rel() { +entry: + fence acq_rel + ret void +} + +; FUNC-LABEL: {{^}}system_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_seq_cst() { +entry: + fence seq_cst + ret void +} + +; FUNC-LABEL: {{^}}singlethread_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_acquire() { +entry: + fence syncscope("singlethread") acquire + ret void +} + +; FUNC-LABEL: {{^}}singlethread_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_release() { +entry: + fence syncscope("singlethread") release + ret void +} + +; FUNC-LABEL: {{^}}singlethread_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_acq_rel() { +entry: + fence syncscope("singlethread") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}singlethread_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_seq_cst() { +entry: + fence syncscope("singlethread") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}agent_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_acquire() { +entry: + fence syncscope("agent") acquire + ret void +} + +; FUNC-LABEL: {{^}}agent_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_release() { +entry: + fence syncscope("agent") release + ret void +} + +; FUNC-LABEL: {{^}}agent_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_acq_rel() { +entry: + fence syncscope("agent") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}agent_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_seq_cst() { +entry: + fence syncscope("agent") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}workgroup_acquire: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_acquire() { +entry: + fence syncscope("workgroup") acquire + ret void +} + +; FUNC-LABEL: {{^}}workgroup_release: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_release() { +entry: + fence syncscope("workgroup") release + ret void +} + +; FUNC-LABEL: {{^}}workgroup_acq_rel: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_acq_rel() { +entry: + fence syncscope("workgroup") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}workgroup_seq_cst: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_seq_cst() { +entry: + fence syncscope("workgroup") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}wavefront_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_acquire() { +entry: + fence syncscope("wavefront") acquire + ret void +} + +; FUNC-LABEL: {{^}}wavefront_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_release() { +entry: + fence syncscope("wavefront") release + ret void +} + +; FUNC-LABEL: {{^}}wavefront_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_acq_rel() { +entry: + fence syncscope("wavefront") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}wavefront_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_seq_cst() { +entry: + fence syncscope("wavefront") seq_cst + ret void +} diff --git a/test/CodeGen/AMDGPU/memory-legalizer-store.ll b/test/CodeGen/AMDGPU/memory-legalizer-store.ll index 7bec74d36c81e215b1e2a1c33c630a3cb01583eb..87c43949df6a7559e281980dec3167906e83544c 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ b/test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -240,7 +240,7 @@ entry: ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} +; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( i32* %in, i32 addrspace(1)* %out) { entry: diff --git a/test/CodeGen/AMDGPU/memory_clause.ll b/test/CodeGen/AMDGPU/memory_clause.ll index f4a66f83fd94b6af0a977c4ce3a566ef09880d37..9ae068a4340d557ed45ff2c1dcd7e7092866d6f9 100644 --- a/test/CodeGen/AMDGPU/memory_clause.ll +++ b/test/CodeGen/AMDGPU/memory_clause.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}vector_clause: ; GCN: global_load_dwordx4 @@ -105,7 +105,7 @@ bb: } ; GCN-LABEL: {{^}}vector_clause_indirect: -; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], off +; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll index 2b2f67cd1b3d59cb1829fe515eda1cfbbde27aa0..7d0c0db22b7e3750630fe230265dcd7619bf1111 100644 --- a/test/CodeGen/AMDGPU/merge-stores.ll +++ b/test/CodeGen/AMDGPU/merge-stores.ll @@ -164,8 +164,8 @@ define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float ad } ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dword +; SI-DAG: buffer_store_dwordx3 +; SI-NOT: buffer_store_dwordx2 ; SI-NOT: buffer_store_dword ; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { @@ -274,11 +274,9 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace } ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: -; SI-DAG: buffer_load_dwordx2 -; SI-DAG: buffer_load_dword v +; SI-DAG: buffer_load_dwordx3 ; GCN: s_waitcnt -; SI-DAG: buffer_store_dword v -; SI-DAG: buffer_store_dwordx2 v +; SI-DAG: buffer_store_dwordx3 v ; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 @@ -563,8 +561,7 @@ define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32: ; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx3 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 @@ -611,13 +608,11 @@ define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* ; GCN-LABEL: {{^}}copy_v3i32_align4: ; GCN-NOT: SCRATCH_RSRC_DWORD -; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-NOT: offen ; GCN: s_waitcnt vmcnt ; GCN-NOT: offen -; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: ScratchSize: 0{{$}} define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { @@ -644,13 +639,11 @@ define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %ou ; GCN-LABEL: {{^}}copy_v3f32_align4: ; GCN-NOT: SCRATCH_RSRC_DWORD -; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-NOT: offen ; GCN: s_waitcnt vmcnt ; GCN-NOT: offen -; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: ScratchSize: 0{{$}} define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 diff --git a/test/CodeGen/AMDGPU/mode-register.mir b/test/CodeGen/AMDGPU/mode-register.mir new file mode 100644 index 0000000000000000000000000000000000000000..bf06a01d4c3c5685507bff8ca14c457a0867ea55 --- /dev/null +++ b/test/CodeGen/AMDGPU/mode-register.mir @@ -0,0 +1,459 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-mode-register %s -o - | FileCheck %s + +--- +# check that the mode is changed to rtz from default rtn for interp f16 +# CHECK-LABEL: name: interp_f16_default +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NEXT: V_INTERP_P1LL_F16 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK-NEXT: V_ADD_F16_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: interp_f16_default + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + $m0 = S_MOV_B32 killed $sgpr2 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec + S_ENDPGM +... +--- +# check that the mode is not changed for interp f16 when the mode is already RTZ +# CHECK-LABEL: name: interp_f16_explicit_rtz +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NEXT: V_MOV_B32_e32 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK-NEXT: V_ADD_F16_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: interp_f16_explicit_rtz + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + $m0 = S_MOV_B32 killed $sgpr2 + S_SETREG_IMM32_B32 3, 2177 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec + S_ENDPGM +... +--- +# check that explicit RTN mode change is registered +# CHECK-LABEL: name: explicit_rtn +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NEXT: V_INTERP_P1LL_F16 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK-NEXT: V_ADD_F16_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: explicit_rtn + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + $m0 = S_MOV_B32 killed $sgpr2 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + S_SETREG_IMM32_B32 0, 2177 + $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec + S_ENDPGM +... +--- +# check that the mode is unchanged from RTN for F64 instruction +# CHECK-LABEL: name: rtn_default +# CHECK-LABEL: bb.0: +# CHECK-NOT: S_SETREG_IMM32_B32 +# CHECK: V_FRACT_F64 + +name: rtn_default + +body: | + bb.0: + liveins: $vgpr1_vgpr2 + $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec + S_ENDPGM +... +--- +# check that the mode is changed from RTZ to RTN for F64 instruction +# CHECK-LABEL: name: rtn_from_rtz +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NEXT: S_SETREG_IMM32_B32 0, 2177 +# CHECK-NEXT: V_FRACT_F64 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: rtn_from_rtz + +body: | + bb.0: + liveins: $vgpr1_vgpr2 + S_SETREG_IMM32_B32 3, 2177 + $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec + S_ENDPGM +... +--- +# CHECK-LABEL: name: rtz_from_rtn +# CHECK-LABEL: bb.1: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: rtz_from_rtn + +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr1_vgpr2 + $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_ENDPGM +... +--- +# check that the mode is changed from RTZ to RTN for F64 instruction +# and back again for remaining interp instruction +# CHECK-LABEL: name: interp_f16_plus_sqrt_f64 +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P2_F16 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_FRACT_F64 +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P2_F16 + +name: interp_f16_plus_sqrt_f64 + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + $m0 = S_MOV_B32 killed $sgpr2 + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec + S_ENDPGM +... +--- +# check that an explicit change to the single precision mode has no effect +# CHECK-LABEL: name: single_precision_mode_change +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P2_F16 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_FRACT_F64 +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P2_F16 + +name: single_precision_mode_change + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + $m0 = S_MOV_B32 killed $sgpr2 + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_SETREG_IMM32_B32 2, 2049 + $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec + S_ENDPGM +... +--- +# check that mode is propagated back to start of loop - first instruction is RTN but needs +# setreg as RTZ is set in loop +# CHECK-LABEL: name: loop +# CHECK-LABEL: bb.1: +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_FRACT_F64 +# CHECK-LABEL: bb.2: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P1LL_F16 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: loop + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + successors: %bb.1 + $m0 = S_MOV_B32 killed $sgpr2 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.1, %bb.3 + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_CBRANCH_VCCZ %bb.1, implicit $vcc + S_BRANCH %bb.3 + + bb.3: + S_ENDPGM +... +--- +# two back-edges to same node with different modes +# CHECK-LABEL: name: double_loop +# CHECK-NOT: S_SETREG_IMM32_B32 +# CHECK-LABEL: bb.2: +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_FRACT_F64_e32 +# CHECK-LABEL: bb.4: +# CHECK: S_SETREG_IMM32_B32 3, 2177 + +name: double_loop + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + successors: %bb.1 + $m0 = S_MOV_B32 killed $sgpr2 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + S_NOP 1 + S_BRANCH %bb.2 + + bb.2: + successors: %bb.1, %bb.3 + $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + S_CBRANCH_VCCZ %bb.1, implicit $vcc + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4 + S_NOP 1 + S_BRANCH %bb.4 + + bb.4: + successors: %bb.5 + S_NOP 1 + S_BRANCH %bb.5 + + bb.5: + successors: %bb.1, %bb.6 + S_SETREG_IMM32_B32 3, 2177 + S_CBRANCH_VCCZ %bb.1, implicit $vcc + S_BRANCH %bb.6 + + bb.6: + S_ENDPGM +... +--- +# check that mode is propagated back to start of loop and through a block that +# neither sets or uses the mode. +# CHECK-LABEL: name: loop_indirect +# CHECK_NOT: S_SETREG_IMM32_B32 +# CHECK-LABEL: bb.3: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P1LL_F16 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: loop_indirect + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + successors: %bb.1 + $m0 = S_MOV_B32 killed $sgpr2 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + S_NOP 1 + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + S_NOP 1 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.1, %bb.4 + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_CBRANCH_VCCZ %bb.1, implicit $vcc + S_BRANCH %bb.4 + + bb.4: + S_ENDPGM +... +--- +# check that multiple mode values are propagated to a block that uses the mode +# CHECK-LABEL: name: multiple_mode_direct +# CHECK-LABEL: bb.3: +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_FRACT_F64_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: multiple_mode_direct + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + successors: %bb.1 + $m0 = S_MOV_B32 killed $sgpr2 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2, %bb.3 + S_CBRANCH_VCCZ %bb.2, implicit $vcc + S_BRANCH %bb.3 + + bb.2: + successors: %bb.3 + S_SETREG_IMM32_B32 3, 2177 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4 + $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + S_BRANCH %bb.4 + + bb.4: + S_ENDPGM +... +--- +# check that multiple mode values are propagated through a block that neither +# sets or uses the mode. +# CHECK-LABEL: name: multiple_mode_indirect +# CHECK-LABEL: bb.4: +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_FRACT_F64_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: multiple_mode_indirect + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + successors: %bb.1 + $m0 = S_MOV_B32 killed $sgpr2 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2, %bb.3 + S_CBRANCH_VCCZ %bb.2, implicit $vcc + S_BRANCH %bb.3 + + bb.2: + successors: %bb.3 + S_SETREG_IMM32_B32 3, 2177 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4 + S_NOP 1 + S_BRANCH %bb.4 + + bb.4: + successors: %bb.5 + $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + S_BRANCH %bb.5 + + bb.5: + S_ENDPGM +... +--- +# CHECK-LABEL: name: pass_through_blocks +# CHECK-LABEL: bb.0: +# CHECK: V_FRACT_F64_e32 +# CHECK-NEXT: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: pass_through_blocks + +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr1_vgpr2 + $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4 + S_BRANCH %bb.4 + + bb.4: + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_ENDPGM +... +--- +# check that multiple mode values are propagated +# CHECK-LABEL: name: if_then_else +# CHECK-LABEL: bb.3: +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_FRACT_F64_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: if_then_else + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + successors: %bb.1 + $m0 = S_MOV_B32 killed $sgpr2 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2, %bb.3 + S_CBRANCH_VCCZ %bb.3, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + S_SETREG_IMM32_B32 3, 2177 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4 + $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + S_BRANCH %bb.4 + + bb.4: + S_ENDPGM +... diff --git a/test/CodeGen/AMDGPU/movreld-bug.ll b/test/CodeGen/AMDGPU/movreld-bug.ll index 0cc8a5271bd7383be5c7f15666f11b58365dc102..d07cb5f3fa63ca086e0304491d6c47ce7a38fa02 100644 --- a/test/CodeGen/AMDGPU/movreld-bug.ll +++ b/test/CodeGen/AMDGPU/movreld-bug.ll @@ -7,8 +7,8 @@ ; GCN: ; return define amdgpu_ps float @main(i32 inreg %arg) #0 { main_body: - %tmp24 = insertelement <2 x float> undef, float 0.000000e+00, i32 %arg - %tmp25 = extractelement <2 x float> %tmp24, i32 1 + %tmp24 = insertelement <16 x float> undef, float 0.000000e+00, i32 %arg + %tmp25 = extractelement <16 x float> %tmp24, i32 1 ret float %tmp25 } diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll index d5c22d22cca5ca27e0cab452bf3c6145d3c38b02..9cc48b41d6d7eaa5df9913592ae9527a48184b5a 100644 --- a/test/CodeGen/AMDGPU/mubuf.ll +++ b/test/CodeGen/AMDGPU/mubuf.ll @@ -60,7 +60,7 @@ main_body: %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0 %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1) %tmp4 = add i32 %6, 16 %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) @@ -79,7 +79,7 @@ main_body: %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0 %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1) %tmp4 = add i32 %6, 16 %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) @@ -176,7 +176,7 @@ define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { ret void } -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) attributes #0 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 679fd7c987038ba7060e19c0ec5811923ad245d9..7988a246a0f792b8d7754c4e6b9fb45cac4b6355 100644 --- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -60,11 +60,11 @@ ; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret: -; GCN: s_mov_b64 [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0 -; GCN: v_cmp_lt_i32_e32 vcc, 1, -; GCN: s_mov_b64 [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0 -; GCN: s_and_saveexec_b64 -; GCN: s_xor_b64 +; GCN-DAG: s_mov_b64 [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0 +; GCN-DAG: v_cmp_lt_i32_e32 vcc, 1, +; GCN-DAG: s_mov_b64 [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0 +; GCN-DAG: s_and_saveexec_b64 +; GCN-DAG: s_xor_b64 ; GCN: ; %LeafBlock1 ; GCN-NEXT: s_mov_b64 [[EXIT0]], exec @@ -92,8 +92,8 @@ ; GCN-NEXT: s_xor_b64 ; GCN: ; %exit1 -; GCN: ds_write_b32 -; GCN: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec +; GCN-DAG: ds_write_b32 +; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec ; GCN: ; %Flow5 ; GCN-NEXT: s_or_b64 exec, exec, diff --git a/test/CodeGen/AMDGPU/nand.ll b/test/CodeGen/AMDGPU/nand.ll new file mode 100644 index 0000000000000000000000000000000000000000..be7d9f677ec022691f8d654d936160e297ffe4bb --- /dev/null +++ b/test/CodeGen/AMDGPU/nand.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX600 %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX700 %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX801 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX900 %s + +; GCN-LABEL: {{^}}scalar_nand_i32_one_use +; GCN: s_nand_b32 +define amdgpu_kernel void @scalar_nand_i32_one_use( + i32 addrspace(1)* %r0, i32 %a, i32 %b) { +entry: + %and = and i32 %a, %b + %r0.val = xor i32 %and, -1 + store i32 %r0.val, i32 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}scalar_nand_i32_mul_use +; GCN-NOT: s_nand_b32 +; GCN: s_and_b32 +; GCN: s_not_b32 +; GCN: s_add_i32 +define amdgpu_kernel void @scalar_nand_i32_mul_use( + i32 addrspace(1)* %r0, i32 addrspace(1)* %r1, i32 %a, i32 %b) { +entry: + %and = and i32 %a, %b + %r0.val = xor i32 %and, -1 + %r1.val = add i32 %and, %a + store i32 %r0.val, i32 addrspace(1)* %r0 + store i32 %r1.val, i32 addrspace(1)* %r1 + ret void +} + +; GCN-LABEL: {{^}}scalar_nand_i64_one_use +; GCN: s_nand_b64 +define amdgpu_kernel void @scalar_nand_i64_one_use( + i64 addrspace(1)* %r0, i64 %a, i64 %b) { +entry: + %and = and i64 %a, %b + %r0.val = xor i64 %and, -1 + store i64 %r0.val, i64 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}scalar_nand_i64_mul_use +; GCN-NOT: s_nand_b64 +; GCN: s_and_b64 +; GCN: s_not_b64 +; GCN: s_add_u32 +; GCN: s_addc_u32 +define amdgpu_kernel void @scalar_nand_i64_mul_use( + i64 addrspace(1)* %r0, i64 addrspace(1)* %r1, i64 %a, i64 %b) { +entry: + %and = and i64 %a, %b + %r0.val = xor i64 %and, -1 + %r1.val = add i64 %and, %a + store i64 %r0.val, i64 addrspace(1)* %r0 + store i64 %r1.val, i64 addrspace(1)* %r1 + ret void +} + +; GCN-LABEL: {{^}}vector_nand_i32_one_use +; GCN-NOT: s_nand_b32 +; GCN: v_and_b32 +; GCN: v_not_b32 +define i32 @vector_nand_i32_one_use(i32 %a, i32 %b) { +entry: + %and = and i32 %a, %b + %r = xor i32 %and, -1 + ret i32 %r +} + +; GCN-LABEL: {{^}}vector_nand_i64_one_use +; GCN-NOT: s_nand_b64 +; GCN: v_and_b32 +; GCN: v_and_b32 +; GCN: v_not_b32 +; GCN: v_not_b32 +define i64 @vector_nand_i64_one_use(i64 %a, i64 %b) { +entry: + %and = and i64 %a, %b + %r = xor i64 %and, -1 + ret i64 %r +} diff --git a/test/CodeGen/AMDGPU/nop-data.ll b/test/CodeGen/AMDGPU/nop-data.ll index 790e31c781aea2458b9aa5fd93ee69e16766c395..4e836a398ee9db5e821c39e9b277a6a8a4976e3d 100644 --- a/test/CodeGen/AMDGPU/nop-data.ll +++ b/test/CodeGen/AMDGPU/nop-data.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - -mcpu=fiji | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - -mcpu=fiji | FileCheck %s ; CHECK: kernel0: ; CHECK-NEXT: s_endpgm diff --git a/test/CodeGen/AMDGPU/nor.ll b/test/CodeGen/AMDGPU/nor.ll new file mode 100644 index 0000000000000000000000000000000000000000..8fddd39cad3ce876d800141e1c77230660c550e5 --- /dev/null +++ b/test/CodeGen/AMDGPU/nor.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX600 %s +; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX700 %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX801 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX900 %s + +; GCN-LABEL: {{^}}scalar_nor_i32_one_use +; GCN: s_nor_b32 +define amdgpu_kernel void @scalar_nor_i32_one_use( + i32 addrspace(1)* %r0, i32 %a, i32 %b) { +entry: + %or = or i32 %a, %b + %r0.val = xor i32 %or, -1 + store i32 %r0.val, i32 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}scalar_nor_i32_mul_use +; GCN-NOT: s_nor_b32 +; GCN: s_or_b32 +; GCN: s_not_b32 +; GCN: s_add_i32 +define amdgpu_kernel void @scalar_nor_i32_mul_use( + i32 addrspace(1)* %r0, i32 addrspace(1)* %r1, i32 %a, i32 %b) { +entry: + %or = or i32 %a, %b + %r0.val = xor i32 %or, -1 + %r1.val = add i32 %or, %a + store i32 %r0.val, i32 addrspace(1)* %r0 + store i32 %r1.val, i32 addrspace(1)* %r1 + ret void +} + +; GCN-LABEL: {{^}}scalar_nor_i64_one_use +; GCN: s_nor_b64 +define amdgpu_kernel void @scalar_nor_i64_one_use( + i64 addrspace(1)* %r0, i64 %a, i64 %b) { +entry: + %or = or i64 %a, %b + %r0.val = xor i64 %or, -1 + store i64 %r0.val, i64 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}scalar_nor_i64_mul_use +; GCN-NOT: s_nor_b64 +; GCN: s_or_b64 +; GCN: s_not_b64 +; GCN: s_add_u32 +; GCN: s_addc_u32 +define amdgpu_kernel void @scalar_nor_i64_mul_use( + i64 addrspace(1)* %r0, i64 addrspace(1)* %r1, i64 %a, i64 %b) { +entry: + %or = or i64 %a, %b + %r0.val = xor i64 %or, -1 + %r1.val = add i64 %or, %a + store i64 %r0.val, i64 addrspace(1)* %r0 + store i64 %r1.val, i64 addrspace(1)* %r1 + ret void +} + +; GCN-LABEL: {{^}}vector_nor_i32_one_use +; GCN-NOT: s_nor_b32 +; GCN: v_or_b32 +; GCN: v_not_b32 +define i32 @vector_nor_i32_one_use(i32 %a, i32 %b) { +entry: + %or = or i32 %a, %b + %r = xor i32 %or, -1 + ret i32 %r +} + +; GCN-LABEL: {{^}}vector_nor_i64_one_use +; GCN-NOT: s_nor_b64 +; GCN: v_or_b32 +; GCN: v_or_b32 +; GCN: v_not_b32 +; GCN: v_not_b32 +define i64 @vector_nor_i64_one_use(i64 %a, i64 %b) { +entry: + %or = or i64 %a, %b + %r = xor i64 %or, -1 + ret i64 %r +} diff --git a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir index 576c1ce9874cb312769975f878a3315ab05b204f..f8a7b229c61b4446ed5e4c8e6b608069c5d67c81 100644 --- a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir +++ b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir @@ -1,29 +1,21 @@ # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-optimize-exec-masking -o - %s | FileCheck %s --- | - define amdgpu_kernel void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) { main_body: - %id = call i32 @llvm.amdgcn.workitem.id.x() - %cc = icmp eq i32 %id, 0 - %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %cc) - %1 = extractvalue { i1, i64 } %0, 0 - %2 = extractvalue { i1, i64 } %0, 1 - br i1 %1, label %if, label %end + br i1 undef, label %if, label %end if: ; preds = %main_body %v.if = load volatile i32, i32 addrspace(1)* undef br label %end end: ; preds = %if, %main_body - %r = phi i32 [ 4, %main_body ], [ %v.if, %if ] - call void @llvm.amdgcn.end.cf(i64 %2) - store i32 %r, i32 addrspace(1)* undef ret void } - define amdgpu_kernel void @optimize_if_and_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec(i32 %z, i32 %v) { main_body: - br i1 undef, label %if, label %end + br i1 undef, label %if, label %end if: br label %end @@ -32,9 +24,9 @@ ret void } - define amdgpu_kernel void @optimize_if_or_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_or_saveexec(i32 %z, i32 %v) { main_body: - br i1 undef, label %if, label %end + br i1 undef, label %if, label %end if: br label %end @@ -43,31 +35,20 @@ ret void } - - define amdgpu_kernel void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) { main_body: - %id = call i32 @llvm.amdgcn.workitem.id.x() - %cc = icmp eq i32 %id, 0 - %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %cc) - %1 = extractvalue { i1, i64 } %0, 0 - %2 = extractvalue { i1, i64 } %0, 1 - store i32 %id, i32 addrspace(1)* undef - br i1 %1, label %if, label %end + br i1 undef, label %if, label %end if: ; preds = %main_body - %v.if = load volatile i32, i32 addrspace(1)* undef br label %end end: ; preds = %if, %main_body - %r = phi i32 [ 4, %main_body ], [ %v.if, %if ] - call void @llvm.amdgcn.end.cf(i64 %2) - store i32 %r, i32 addrspace(1)* undef ret void } - define amdgpu_kernel void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v) { main_body: - br i1 undef, label %if, label %end + br i1 undef, label %if, label %end if: br label %end @@ -76,9 +57,9 @@ ret void } - define amdgpu_kernel void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v) { main_body: - br i1 undef, label %if, label %end + br i1 undef, label %if, label %end if: br label %end @@ -87,9 +68,9 @@ ret void } - define amdgpu_kernel void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v) { main_body: - br i1 undef, label %if, label %end + br i1 undef, label %if, label %end if: br label %end @@ -98,9 +79,9 @@ ret void } - define amdgpu_kernel void @optimize_if_unknown_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_unknown_saveexec(i32 %z, i32 %v) { main_body: - br i1 undef, label %if, label %end + br i1 undef, label %if, label %end if: br label %end @@ -109,9 +90,9 @@ ret void } - define amdgpu_kernel void @optimize_if_andn2_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_andn2_saveexec(i32 %z, i32 %v) { main_body: - br i1 undef, label %if, label %end + br i1 undef, label %if, label %end if: br label %end @@ -120,9 +101,9 @@ ret void } - define amdgpu_kernel void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v) { main_body: - br i1 undef, label %if, label %end + br i1 undef, label %if, label %end if: br label %end @@ -131,16 +112,16 @@ ret void } - ; Function Attrs: nounwind readnone - declare i32 @llvm.amdgcn.workitem.id.x() #1 - - declare { i1, i64 } @llvm.amdgcn.if(i1) - - declare void @llvm.amdgcn.end.cf(i64) + define amdgpu_kernel void @if_and_xor_read_exec_copy_subreg() { + main_body: + br i1 undef, label %if, label %end + if: ; preds = %main_body + br label %end - attributes #0 = { nounwind } - attributes #1 = { nounwind readnone } + end: ; preds = %if, %main_body + ret void + } ... --- @@ -150,28 +131,8 @@ # CHECK-NEXT: SI_MASK_BRANCH name: optimize_if_and_saveexec_xor -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true liveins: - { reg: '$vgpr0' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false body: | bb.0.main_body: liveins: $vgpr0 @@ -190,7 +151,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -198,7 +159,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM ... @@ -208,28 +169,8 @@ body: | # CHECK-NEXT: SI_MASK_BRANCH name: optimize_if_and_saveexec -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true liveins: - { reg: '$vgpr0' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false body: | bb.0.main_body: liveins: $vgpr0 @@ -247,7 +188,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -255,7 +196,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM ... @@ -265,28 +206,8 @@ body: | # CHECK-NEXT: SI_MASK_BRANCH name: optimize_if_or_saveexec -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true liveins: - { reg: '$vgpr0' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false body: | bb.0.main_body: liveins: $vgpr0 @@ -304,7 +225,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -312,40 +233,20 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM ... --- # CHECK-LABEL: name: optimize_if_and_saveexec_xor_valu_middle # CHECK: $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc -# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) +# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec # CHECK-NEXT: $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc # CHECK-NEXT: $exec = COPY killed $sgpr2_sgpr3 # CHECK-NEXT: SI_MASK_BRANCH name: optimize_if_and_saveexec_xor_valu_middle -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true liveins: - { reg: '$vgpr0' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false body: | bb.0.main_body: liveins: $vgpr0 @@ -354,7 +255,7 @@ body: | $vcc = V_CMP_EQ_I32_e64 0, killed $vgpr0, implicit $exec $vgpr0 = V_MOV_B32_e32 4, implicit $exec $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc - BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc $exec = S_MOV_B64_term killed $sgpr2_sgpr3 SI_MASK_BRANCH %bb.2, implicit $exec @@ -365,7 +266,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -373,7 +274,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM ... @@ -384,28 +285,8 @@ body: | # CHECK-NEXT: $exec = COPY $sgpr0_sgpr1 # CHECK-NEXT: SI_MASK_BRANCH %bb.2, implicit $exec name: optimize_if_and_saveexec_xor_wrong_reg -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true liveins: - { reg: '$vgpr0' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false body: | bb.0.main_body: liveins: $vgpr0 @@ -423,7 +304,7 @@ body: | bb.1.if: liveins: $sgpr0_sgpr1 , $sgpr4_sgpr5_sgpr6_sgpr7 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr4_sgpr5_sgpr6_sgpr7 @@ -431,7 +312,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM ... @@ -444,28 +325,8 @@ body: | # CHECK-NEXT: SI_MASK_BRANCH %bb.2, implicit $exec name: optimize_if_and_saveexec_xor_modify_copy_to_exec -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true liveins: - { reg: '$vgpr0' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false body: | bb.0.main_body: liveins: $vgpr0 @@ -485,7 +346,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -495,7 +356,7 @@ body: | $sgpr1 = S_MOV_B32 1 $sgpr2 = S_MOV_B32 -1 $sgpr3 = S_MOV_B32 61440 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM ... @@ -506,28 +367,8 @@ body: | # CHECK-NEXT: $exec = COPY $sgpr2_sgpr3 # CHECK-NEXT: SI_MASK_BRANCH name: optimize_if_and_saveexec_xor_live_out_setexec -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true liveins: - { reg: '$vgpr0' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false body: | bb.0.main_body: liveins: $vgpr0 @@ -546,7 +387,7 @@ body: | S_SLEEP 0, implicit $sgpr2_sgpr3 $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -554,7 +395,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM ... @@ -566,28 +407,8 @@ body: | # CHECK-NEXT: SI_MASK_BRANCH %bb.2, implicit $exec name: optimize_if_unknown_saveexec -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true liveins: - { reg: '$vgpr0' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false body: | bb.0.main_body: liveins: $vgpr0 @@ -605,7 +426,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -613,7 +434,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM ... @@ -623,28 +444,8 @@ body: | # CHECK-NEXT: SI_MASK_BRANCH name: optimize_if_andn2_saveexec -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true liveins: - { reg: '$vgpr0' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false body: | bb.0.main_body: liveins: $vgpr0 @@ -662,7 +463,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -670,7 +471,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM ... @@ -680,28 +481,8 @@ body: | # CHECK-NEXT: $exec = COPY killed $sgpr2_sgpr3 # CHECK-NEXT: SI_MASK_BRANCH %bb.2, implicit $exec name: optimize_if_andn2_saveexec_no_commute -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true liveins: - { reg: '$vgpr0' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false body: | bb.0.main_body: liveins: $vgpr0 @@ -719,7 +500,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `i32 addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -727,7 +508,45 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM ... +--- +# A read from exec copy subreg prevents optimization +# CHECK-LABEL: name: if_and_xor_read_exec_copy_subreg{{$}} +# CHECK: $sgpr0_sgpr1 = COPY $exec +# CHECK-NEXT: $sgpr4 = S_MOV_B32 $sgpr1 +name: if_and_xor_read_exec_copy_subreg +liveins: + - { reg: '$vgpr0' } +body: | + bb.0.main_body: + liveins: $vgpr0 + + $sgpr0_sgpr1 = COPY $exec + $sgpr4 = S_MOV_B32 $sgpr1 + $vcc = V_CMP_EQ_I32_e64 0, killed $vgpr0, implicit $exec + $vgpr0 = V_MOV_B32_e32 4, implicit $exec + $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc + $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc + $exec = S_MOV_B64_term killed $sgpr2_sgpr3 + SI_MASK_BRANCH %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1.if: + liveins: $sgpr0_sgpr1 + + $sgpr7 = S_MOV_B32 61440 + $sgpr6 = S_MOV_B32 -1 + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + + bb.2.end: + liveins: $vgpr0, $sgpr0_sgpr1 + + $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc + $sgpr3 = S_MOV_B32 61440 + $sgpr2 = S_MOV_B32 -1 + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec + S_ENDPGM +... diff --git a/test/CodeGen/AMDGPU/optimize-negated-cond-exec-masking.mir b/test/CodeGen/AMDGPU/optimize-negated-cond-exec-masking.mir new file mode 100644 index 0000000000000000000000000000000000000000..61e6f44348706a31a04741f410b706483d8fc728 --- /dev/null +++ b/test/CodeGen/AMDGPU/optimize-negated-cond-exec-masking.mir @@ -0,0 +1,465 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass=si-optimize-exec-masking-pre-ra -o - %s | FileCheck -check-prefix=GCN %s + +# GCN: name: negated_cond_vop2 +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop2 +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop3 +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop3 +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec + $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop2_redef_vcc1 +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec +# GCN-NEXT: V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec +# GCN-NEXT: $vcc_lo = COPY $sgpr0 +# GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop2_redef_vcc1 +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec + $vcc_lo = COPY $sgpr0 + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop2_redef_vcc2 +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec +# GCN-NEXT: V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec +# GCN-NEXT: $vcc_hi = COPY $sgpr0 +# GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop2_redef_vcc2 +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec + $vcc_hi = COPY $sgpr0 + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop3_redef_cmp +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec +# GCN-NEXT: %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec +# GCN-NEXT: %2.sub1:sreg_64_xexec = COPY $sgpr0 +# GCN-NEXT: $vcc = S_AND_B64 %2, $exec, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop3_redef_cmp +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec + %2.sub1 = COPY $sgpr0 + $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_undef_vcc +# GCN: $vcc = S_AND_B64 $exec, undef $vcc, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_undef_vcc +body: | + bb.0: + $vcc = S_AND_B64 $exec, undef $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop3_imp_vcc +# GCN: $vcc = IMPLICIT_DEF +# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, $vcc, implicit-def $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop3_imp_vcc +body: | + bb.0: + $vcc = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, $vcc, implicit $exec + %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec + $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop2_imp_vcc +# GCN: $vcc = IMPLICIT_DEF +# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, $vcc, implicit-def $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop2_imp_vcc +body: | + bb.0: + $vcc = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, $vcc, implicit $exec + V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 killed $vcc, $exec, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop3_redef_sel +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec +# GCN-NEXT: %1:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec +# GCN-NEXT: $vcc = S_AND_B64 %2, $exec, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop3_redef_sel +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + %1:vgpr_32 = COPY $vgpr0 + %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec + $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop2_used_sel +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec +# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop2_used_sel +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + $vgpr0 = COPY %1 + S_ENDPGM +... + +# GCN: name: negated_cond_vop2_used_vcc +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec +# GCN-NEXT: V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec +# GCN-NEXT: $sgpr0_sgpr1 = COPY $vcc +# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop2_used_vcc +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec + $sgpr0_sgpr1 = COPY $vcc + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop3_sel_wrong_subreg1 +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF +# GCN-NEXT: %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec +# GCN-NEXT: %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec +# GCN-NEXT: $vcc = S_AND_B64 %2, $exec, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop3_sel_wrong_subreg1 +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1.sub1 = IMPLICIT_DEF + %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec + $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop3_sel_wrong_subreg2 +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec +# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF +# GCN-NEXT: %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec +# GCN-NEXT: $vcc = S_AND_B64 %2, $exec, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop3_sel_wrong_subreg2 +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + %1.sub1 = IMPLICIT_DEF + %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec + $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop3_sel_right_subreg1 +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF +# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop3_sel_right_subreg1 +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1.sub1 = IMPLICIT_DEF + %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub0, 1, implicit $exec + $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop3_sel_right_subreg2 +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF +# GCN-NEXT: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop3_sel_right_subreg2 +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + %1.sub1 = IMPLICIT_DEF + %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub0, 1, implicit $exec + $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop3_sel_subreg_overlap +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN-NEXT: %1.sub2:vreg_128 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec +# GCN-NEXT: %1.sub2_sub3:vreg_128 = IMPLICIT_DEF +# GCN-NEXT: %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub2, 1, implicit $exec +# GCN-NEXT: $vcc = S_AND_B64 %2, $exec, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc +--- +name: negated_cond_vop3_sel_subreg_overlap +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1.sub2:vreg_128 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + %1.sub2_sub3 = IMPLICIT_DEF + %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1.sub2, 1, implicit $exec + $vcc = S_AND_B64 killed %2, $exec, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + S_BRANCH %bb.0 + + bb.2: + S_ENDPGM +... + +# GCN: name: negated_cond_vop2_dominated_blocks +# GCN: %0:sreg_64_xexec = IMPLICIT_DEF +# GCN: $vcc = S_ANDN2_B64 $exec, %0, implicit-def $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit $vcc +--- +name: negated_cond_vop2_dominated_blocks +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + + bb.1: + V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc + S_BRANCH %bb.2 + + bb.2: + S_BRANCH %bb.1 + + bb.3: + S_ENDPGM +... + +# GCN: name: negated_cond_vop2_different_blocks_cmp_and +# GCN: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec +# GCN: $vcc = S_AND_B64 $exec, %2, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit $vcc +--- +name: negated_cond_vop2_different_blocks_cmp_and +body: | + bb.0: + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + %2:sreg_64_xexec = V_CMP_NE_U32_e64 %1, 1, implicit $exec + + bb.1: + $vcc = S_AND_B64 $exec, killed %2, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc + S_BRANCH %bb.2 + + bb.2: + S_BRANCH %bb.1 + + bb.3: + S_ENDPGM +... + +# GCN: name: negated_cond_vop2_not_dominated_blocks +# GCN: V_CNDMASK_B32_e64 0, 1, +# GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc +# GCN-NEXT: S_CBRANCH_VCCNZ %bb.4, implicit $vcc +--- +name: negated_cond_vop2_not_dominated_blocks +body: | + bb.0: + $vcc = IMPLICIT_DEF + %1 = IMPLICIT_DEF + S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc + S_BRANCH %bb.1 + + bb.1: + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %0, implicit $exec + + bb.2: + V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.4, implicit killed $vcc + S_BRANCH %bb.3 + + bb.3: + S_BRANCH %bb.2 + + bb.4: + S_ENDPGM +... diff --git a/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/test/CodeGen/AMDGPU/optimize-negated-cond.ll new file mode 100644 index 0000000000000000000000000000000000000000..be5d8d4720509896ecfaca05bff29a079e30b390 --- /dev/null +++ b/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -0,0 +1,75 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}negated_cond: +; GCN: BB0_1: +; GCN: v_cmp_eq_u32_e64 [[CC:[^,]+]], +; GCN: BB0_2: +; GCN-NOT: v_cndmask_b32 +; GCN-NOT: v_cmp +; GCN: s_andn2_b64 vcc, exec, [[CC]] +; GCN: s_cbranch_vccnz BB0_4 +define amdgpu_kernel void @negated_cond(i32 addrspace(1)* %arg1) { +bb: + br label %bb1 + +bb1: + %tmp1 = load i32, i32 addrspace(1)* %arg1 + %tmp2 = icmp eq i32 %tmp1, 0 + br label %bb2 + +bb2: + %tmp3 = phi i32 [ 0, %bb1 ], [ %tmp6, %bb4 ] + %tmp4 = shl i32 %tmp3, 5 + br i1 %tmp2, label %bb3, label %bb4 + +bb3: + %tmp5 = add i32 %tmp4, 1 + br label %bb4 + +bb4: + %tmp6 = phi i32 [ %tmp5, %bb3 ], [ %tmp4, %bb2 ] + %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp6 + store i32 0, i32 addrspace(1)* %gep + %tmp7 = icmp eq i32 %tmp6, 32 + br i1 %tmp7, label %bb1, label %bb2 +} + +; GCN-LABEL: {{^}}negated_cond_dominated_blocks: +; GCN: v_cmp_eq_u32_e64 [[CC:[^,]+]], +; GCN: BB1_1: +; GCN-NOT: v_cndmask_b32 +; GCN-NOT: v_cmp +; GCN: s_andn2_b64 vcc, exec, [[CC]] +; GCN: s_cbranch_vccz BB1_3 +define amdgpu_kernel void @negated_cond_dominated_blocks(i32 addrspace(1)* %arg1) { +bb: + br label %bb2 + +bb2: + %tmp1 = load i32, i32 addrspace(1)* %arg1 + %tmp2 = icmp eq i32 %tmp1, 0 + br label %bb4 + +bb3: + ret void + +bb4: + %tmp3 = phi i32 [ 0, %bb2 ], [ %tmp7, %bb7 ] + %tmp4 = shl i32 %tmp3, 5 + br i1 %tmp2, label %bb5, label %bb6 + +bb5: + %tmp5 = add i32 %tmp4, 1 + br label %bb7 + +bb6: + %tmp6 = add i32 %tmp3, 1 + br label %bb7 + +bb7: + %tmp7 = phi i32 [ %tmp5, %bb5 ], [ %tmp6, %bb6 ] + %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp7 + store i32 0, i32 addrspace(1)* %gep + %tmp8 = icmp eq i32 %tmp7, 32 + br i1 %tmp8, label %bb3, label %bb4 +} diff --git a/test/CodeGen/AMDGPU/or3.ll b/test/CodeGen/AMDGPU/or3.ll new file mode 100644 index 0000000000000000000000000000000000000000..400d9c4b0103a8b22a05654e80ad5389dcf3925b --- /dev/null +++ b/test/CodeGen/AMDGPU/or3.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s + +; =================================================================================== +; V_OR3_B32 +; =================================================================================== + +define amdgpu_ps float @or3(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: or3: +; VI: ; %bb.0: +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: or3: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: ; return to shader part epilog + %x = or i32 %a, %b + %result = or i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +; ThreeOp instruction variant not used due to Constant Bus Limitations +; TODO: with reassociation it is possible to replace a v_or_b32_e32 with an s_or_b32 +define amdgpu_ps float @or3_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) { +; VI-LABEL: or3_vgpr_a: +; VI: ; %bb.0: +; VI-NEXT: v_or_b32_e32 v0, s2, v0 +; VI-NEXT: v_or_b32_e32 v0, s3, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: or3_vgpr_a: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX9-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX9-NEXT: ; return to shader part epilog + %x = or i32 %a, %b + %result = or i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @or3_vgpr_all2(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: or3_vgpr_all2: +; VI: ; %bb.0: +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: or3_vgpr_all2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: ; return to shader part epilog + %x = or i32 %b, %c + %result = or i32 %a, %x + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @or3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) { +; VI-LABEL: or3_vgpr_bc: +; VI: ; %bb.0: +; VI-NEXT: v_or_b32_e32 v0, s2, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: or3_vgpr_bc: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_or3_b32 v0, s2, v0, v1 +; GFX9-NEXT: ; return to shader part epilog + %x = or i32 %a, %b + %result = or i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @or3_vgpr_const(i32 %a, i32 %b) { +; VI-LABEL: or3_vgpr_const: +; VI: ; %bb.0: +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, 64, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: or3_vgpr_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_or3_b32 v0, v1, v0, 64 +; GFX9-NEXT: ; return to shader part epilog + %x = or i32 64, %b + %result = or i32 %x, %a + %bc = bitcast i32 %result to float + ret float %bc +} diff --git a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll index 6a41c3ad2e88fda950b8023afa33685b0ff25d0d..d7e38a602ffa80b9642993a02a97be410c775f53 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll @@ -1,5 +1,5 @@ -; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s -; RUN: llc -O1 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s +; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-code-object-v3,+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s +; RUN: llc -O1 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-code-object-v3,+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s ; ALL-LABEL: {{^}}promote_alloca_i32_array_array: ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}} diff --git a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll index e8dcb50a3c1fe400275464db7584aea4d3b639d5..83a608ad5f31cece6e597268a4ee418db4d14129 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s ; This shows that the amount of LDS estimate is sensitive to the order ; of the LDS globals. diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll index ebeed0dd4435b1cdf7db4d07a63945237dd602b9..28e4925f95061b4453e7b943acedbcae9a59b9bf 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll @@ -1,12 +1,14 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + ; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand( ; CHECK: %alloca = alloca i32 -; CHECK: select i1 undef, i32* undef, i32* %alloca +; CHECK: select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %alloca define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand() #0 { - %alloca = alloca i32, align 4 - %select = select i1 undef, i32* undef, i32* %alloca - store i32 0, i32* %select, align 4 + %alloca = alloca i32, align 4, addrspace(5) + %select = select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %alloca + store i32 0, i32 addrspace(5)* %select, align 4 ret void } @@ -17,11 +19,11 @@ define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand() ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4 define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 { - %alloca = alloca [16 x i32], align 4 - %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b - %select = select i1 undef, i32* %ptr0, i32* %ptr1 - store i32 0, i32* %select, align 4 + %alloca = alloca [16 x i32], align 4, addrspace(5) + %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b + %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1 + store i32 0, i32 addrspace(5)* %select, align 4 ret void } @@ -30,16 +32,16 @@ define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a ; CHECK-LABEL: @lds_promote_alloca_select_two_allocas( ; CHECK: %alloca0 = alloca i32, i32 16, align 4 ; CHECK: %alloca1 = alloca i32, i32 16, align 4 -; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a -; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b -; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1 +; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(5)* %alloca0, i32 %a +; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %alloca1, i32 %b +; CHECK: %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1 define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 { - %alloca0 = alloca i32, i32 16, align 4 - %alloca1 = alloca i32, i32 16, align 4 - %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a - %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b - %select = select i1 undef, i32* %ptr0, i32* %ptr1 - store i32 0, i32* %select, align 4 + %alloca0 = alloca i32, i32 16, align 4, addrspace(5) + %alloca1 = alloca i32, i32 16, align 4, addrspace(5) + %ptr0 = getelementptr inbounds i32, i32 addrspace(5)* %alloca0, i32 %a + %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %alloca1, i32 %b + %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1 + store i32 0, i32 addrspace(5)* %select, align 4 ret void } @@ -51,11 +53,11 @@ define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4 define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointers() #0 { - %alloca = alloca [16 x i32], align 4 - %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1 - %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3 - %select = select i1 undef, i32* %ptr0, i32* %ptr1 - store i32 0, i32* %select, align 4 + %alloca = alloca [16 x i32], align 4, addrspace(5) + %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1 + %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 3 + %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1 + store i32 0, i32 addrspace(5)* %select, align 4 ret void } @@ -68,34 +70,34 @@ define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointe ; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2 ; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4 define amdgpu_kernel void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 { - %alloca = alloca [16 x i32], align 4 - %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b - %ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c - %select0 = select i1 undef, i32* %ptr0, i32* %ptr1 - %select1 = select i1 undef, i32* %select0, i32* %ptr2 - store i32 0, i32* %select1, align 4 + %alloca = alloca [16 x i32], align 4, addrspace(5) + %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b + %ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %c + %select0 = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1 + %select1 = select i1 undef, i32 addrspace(5)* %select0, i32 addrspace(5)* %ptr2 + store i32 0, i32 addrspace(5)* %select1, align 4 ret void } define amdgpu_kernel void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 { entry: - %alloca = alloca [16 x i32], align 4 - %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b - store i32 0, i32* %ptr0 + %alloca = alloca [16 x i32], align 4, addrspace(5) + %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b + store i32 0, i32 addrspace(5)* %ptr0 br i1 undef, label %bb1, label %bb2 bb1: - %ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c - %select0 = select i1 undef, i32* undef, i32* %ptr2 - store i32 0, i32* %ptr1 + %ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %c + %select0 = select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %ptr2 + store i32 0, i32 addrspace(5)* %ptr1 br label %bb2 bb2: - %phi.ptr = phi i32* [ %ptr0, %entry ], [ %select0, %bb1 ] - %select1 = select i1 undef, i32* %phi.ptr, i32* %ptr1 - store i32 0, i32* %select1, align 4 + %phi.ptr = phi i32 addrspace(5)* [ %ptr0, %entry ], [ %select0, %bb1 ] + %select1 = select i1 undef, i32 addrspace(5)* %phi.ptr, i32 addrspace(5)* %ptr1 + store i32 0, i32 addrspace(5)* %select1, align 4 ret void } @@ -104,12 +106,12 @@ bb2: ; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null define amdgpu_kernel void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { bb: - %tmp = alloca double, align 8 - store double 0.000000e+00, double* %tmp, align 8 + %tmp = alloca double, align 8, addrspace(5) + store double 0.000000e+00, double addrspace(5)* %tmp, align 8 %tmp2 = icmp eq i32 %arg1, 0 - %tmp3 = select i1 %tmp2, double* %tmp, double* null - store double 1.000000e+00, double* %tmp3, align 8 - %tmp4 = load double, double* %tmp, align 8 + %tmp3 = select i1 %tmp2, double addrspace(5)* %tmp, double addrspace(5)* null + store double 1.000000e+00, double addrspace(5)* %tmp3, align 8 + %tmp4 = load double, double addrspace(5)* %tmp, align 8 store double %tmp4, double addrspace(1)* %arg ret void } @@ -119,12 +121,12 @@ bb: ; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}} define amdgpu_kernel void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { bb: - %tmp = alloca double, align 8 - store double 0.000000e+00, double* %tmp, align 8 + %tmp = alloca double, align 8, addrspace(5) + store double 0.000000e+00, double addrspace(5)* %tmp, align 8 %tmp2 = icmp eq i32 %arg1, 0 - %tmp3 = select i1 %tmp2, double* null, double* %tmp - store double 1.000000e+00, double* %tmp3, align 8 - %tmp4 = load double, double* %tmp, align 8 + %tmp3 = select i1 %tmp2, double addrspace(5)* null, double addrspace(5)* %tmp + store double 1.000000e+00, double addrspace(5)* %tmp3, align 8 + %tmp4 = load double, double addrspace(5)* %tmp, align 8 store double %tmp4, double addrspace(1)* %arg ret void } diff --git a/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll index 80112160412c1b96f1e005a788b74f1db64e3335..15da72db4abb7ff9dfa13d7cebf472fb6f6f67d9 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll @@ -1,15 +1,15 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s ; GCN-LABEL: {{^}}float4_alloca_store4: ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4 -; GFX-NOT: buffer_ -; GCN: v_readfirstlane_b32 -; GFX8: v_movrels_b32 -; GFX9: s_set_gpr_idx_on -; GFX9: s_set_gpr_idx_off +; GCN-NOT: buffer_ +; GCN: v_cndmask_b32 +; GCN: v_cndmask_b32 +; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0, +; GCN: store_dword v[{{[0-9:]+}}], [[RES]] ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2 ; OPT: store <4 x float> , <4 x float> addrspace(5)* %alloca, align 4 @@ -36,11 +36,15 @@ entry: ; GCN-LABEL: {{^}}float4_alloca_load4: ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_load4 -; GFX-NOT: buffer_ -; GCN: v_readfirstlane_b32 -; GFX8: v_movreld_b32 -; GFX9: s_set_gpr_idx_on -; GFX9: s_set_gpr_idx_off +; GCN-NOT: v_movrel +; GCN-NOT: buffer_ +; GCN-NOT: v_cmp_ +; GCN-NOT: v_cndmask_ +; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]] +; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]] +; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]] +; GCN: store_dwordx4 v[{{[0-9:]+}}], ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2 ; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca @@ -68,7 +72,7 @@ entry: ; GCN-LABEL: {{^}}half4_alloca_store4: ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_store4 -; GFX-NOT: buffer_ +; GCN-NOT: buffer_ ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]] @@ -98,7 +102,7 @@ entry: ; GCN-LABEL: {{^}}half4_alloca_load4: ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4 -; GFX-NOT: buffer_ +; GCN-NOT: buffer_ ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff @@ -128,7 +132,7 @@ entry: ; GCN-LABEL: {{^}}short4_alloca_store4: ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_store4 -; GFX-NOT: buffer_ +; GCN-NOT: buffer_ ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x40003 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]] @@ -158,7 +162,7 @@ entry: ; GCN-LABEL: {{^}}short4_alloca_load4: ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4 -; GFX-NOT: buffer_ +; GCN-NOT: buffer_ ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff diff --git a/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll new file mode 100644 index 0000000000000000000000000000000000000000..a9fc318ce0e110fcc7acffcc9214ed9dc1e43b2c --- /dev/null +++ b/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -0,0 +1,485 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +declare i64 @_Z13get_global_idj(i32) + +define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) { +; GCN-LABEL: clmem_read_simplified: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 7 + %idx.ext11 = and i64 %a0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)* + + %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv + %load1 = load i64, i64 addrspace(1)* %addr1, align 8 + %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256 + %load2 = load i64, i64 addrspace(1)* %addr2, align 8 + %add.1 = add i64 %load2, %load1 + + %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512 + %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8 + %add.2 = add i64 %load3, %add.1 + %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768 + %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8 + %add.3 = add i64 %load4, %add.2 + + %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024 + %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8 + %add.4 = add i64 %load5, %add.3 + %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280 + %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8 + %add.5 = add i64 %load6, %add.4 + + %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536 + %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8 + %add.6 = add i64 %load7, %add.5 + %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792 + %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8 + %add.7 = add i64 %load8, %add.6 + + store i64 %add.7, i64 addrspace(1)* %saddr, align 8 + ret void +} + +define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) { +; GCN-LABEL: clmem_read: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 17 + %idx.ext11 = and i64 %a0, 4261412864 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)* + %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv + br label %for.cond.preheader + +while.cond.loopexit: ; preds = %for.body + %dec = add nsw i32 %dec31, -1 + %tobool = icmp eq i32 %dec31, 0 + br i1 %tobool, label %while.end, label %for.cond.preheader + +for.cond.preheader: ; preds = %entry, %while.cond.loopexit + %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ] + %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ] + br label %for.body + +for.body: ; preds = %for.body, %for.cond.preheader + %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ] + %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ] + %conv3 = zext i32 %block.029 to i64 + %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3 + %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8 + %add = add i64 %load1, %sum.128 + + %add9 = or i32 %block.029, 256 + %conv3.1 = zext i32 %add9 to i64 + %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1 + %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8 + %add.1 = add i64 %load2, %add + + %add9.1 = or i32 %block.029, 512 + %conv3.2 = zext i32 %add9.1 to i64 + %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2 + %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8 + %add.2 = add i64 %l3, %add.1 + + %add9.2 = or i32 %block.029, 768 + %conv3.3 = zext i32 %add9.2 to i64 + %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3 + %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8 + %add.3 = add i64 %l4, %add.2 + + %add9.3 = or i32 %block.029, 1024 + %conv3.4 = zext i32 %add9.3 to i64 + %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4 + %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8 + %add.4 = add i64 %l5, %add.3 + + %add9.4 = or i32 %block.029, 1280 + %conv3.5 = zext i32 %add9.4 to i64 + %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5 + %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8 + %add.5 = add i64 %l6, %add.4 + + %add9.5 = or i32 %block.029, 1536 + %conv3.6 = zext i32 %add9.5 to i64 + %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6 + %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8 + %add.6 = add i64 %load7, %add.5 + + %add9.6 = or i32 %block.029, 1792 + %conv3.7 = zext i32 %add9.6 to i64 + %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7 + %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8 + %add.7 = add i64 %load8, %add.6 + + %add9.7 = or i32 %block.029, 2048 + %conv3.8 = zext i32 %add9.7 to i64 + %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8 + %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8 + %add.8 = add i64 %load9, %add.7 + + %add9.8 = or i32 %block.029, 2304 + %conv3.9 = zext i32 %add9.8 to i64 + %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9 + %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8 + %add.9 = add i64 %load10, %add.8 + + %add9.9 = or i32 %block.029, 2560 + %conv3.10 = zext i32 %add9.9 to i64 + %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10 + %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8 + %add.10 = add i64 %load11, %add.9 + + %add9.31 = add nuw nsw i32 %block.029, 8192 + %cmp.31 = icmp ult i32 %add9.31, 4194304 + br i1 %cmp.31, label %for.body, label %while.cond.loopexit + +while.end: ; preds = %while.cond.loopexit + store i64 %add.10, i64 addrspace(1)* %a1, align 8 + ret void +} + +; using 32bit address. +define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) { +; GCN-LABEL: Address32: +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-3072 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %id = shl i64 %call, 7 + %idx.ext11 = and i64 %id, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)* + + %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv + %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4 + + %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256 + %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4 + %add.1 = add i32 %load2, %load1 + + %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512 + %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4 + %add.2 = add i32 %load3, %add.1 + + %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768 + %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4 + %add.3 = add i32 %load4, %add.2 + + %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024 + %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4 + %add.4 = add i32 %load5, %add.3 + + %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280 + %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4 + %add.5 = add i32 %load6, %add.4 + + %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536 + %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4 + %add.6 = add i32 %load7, %add.5 + + %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792 + %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4 + %add.7 = add i32 %load8, %add.6 + + %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048 + %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4 + %add.8 = add i32 %load9, %add.7 + + %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304 + %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4 + %add.9 = add i32 %load10, %add.8 + + store i32 %add.9, i32 addrspace(1)* %addr, align 4 + ret void +} + +define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) { +; GCN-LABEL: Offset64: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 7 + %idx.ext11 = and i64 %a0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)* + + %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv + %load1 = load i64, i64 addrspace(1)* %addr1, align 8 + + %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400 + %load2 = load i64, i64 addrspace(1)* %addr2, align 8 + + %add1 = add i64 %load2, %load1 + + %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656 + %load3 = load i64, i64 addrspace(1)* %addr3, align 8 + + %add2 = add i64 %load3, %add1 + + %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912 + %load4 = load i64, i64 addrspace(1)* %addr4, align 8 + %add4 = add i64 %load4, %add2 + + store i64 %add4, i64 addrspace(1)* %saddr, align 8 + ret void +} + +; TODO: Support load4 as anchor instruction. +define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) { +; GCN-LABEL: p32Offset64: +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 7 + %idx.ext11 = and i64 %a0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)* + + %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv + %load1 = load i32, i32 addrspace(1)* %addr1, align 8 + + %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400 + %load2 = load i32, i32 addrspace(1)* %addr2, align 8 + + %add1 = add i32 %load2, %load1 + + %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656 + %load3 = load i32, i32 addrspace(1)* %addr3, align 8 + + %add2 = add i32 %load3, %add1 + + %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912 + %load4 = load i32, i32 addrspace(1)* %addr4, align 8 + %add4 = add i32 %load4, %add2 + + store i32 %add4, i32 addrspace(1)* %saddr, align 8 + ret void +} + +define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1, +; GCN-LABEL: DiffBase: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off + i8 addrspace(1)* %buffer2) { +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 7 + %idx.ext11 = and i64 %a0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11 + %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)* + + %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11 + %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)* + + %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512 + %load1 = load i64, i64 addrspace(1)* %addr1, align 8 + %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768 + %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8 + %add1 = add i64 %load2, %load1 + %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024 + %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8 + %add2 = add i64 %load3, %add1 + + %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280 + %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8 + + %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536 + %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8 + %add3 = add i64 %load5, %load4 + + %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792 + %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8 + %add4 = add i64 %load6, %add3 + + %add5 = add i64 %add2, %add4 + + store i64 %add5, i64 addrspace(1)* %saddr, align 8 + ret void +} + +define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) { +; GCN-LABEL: ReverseOrder: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 7 + %idx.ext11 = and i64 %a0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)* + + %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv + %load1 = load i64, i64 addrspace(1)* %addr1, align 8 + + %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792 + %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8 + %add7 = add i64 %load8, %load1 + + %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536 + %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8 + %add6 = add i64 %load7, %add7 + + %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280 + %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8 + %add5 = add i64 %load6, %add6 + + %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024 + %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8 + %add4 = add i64 %load5, %add5 + + %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768 + %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8 + %add3 = add i64 %load4, %add4 + + %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512 + %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8 + %add2 = add i64 %load3, %add3 + + %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256 + %load2 = load i64, i64 addrspace(1)* %addr2, align 8 + %add1 = add i64 %load2, %add2 + + store i64 %add1, i64 addrspace(1)* %saddr, align 8 + ret void +} + +define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) { +; GCN-LABEL: negativeoffset: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) #2 + %conv = and i64 %call, 255 + %0 = shl i64 %call, 7 + %idx.ext11 = and i64 %0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)* + + %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv + + %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656 + %load1 = load i64, i64 addrspace(1)* %addr1, align 8 + + %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912 + %load2 = load i64, i64 addrspace(1)* %addr2, align 8 + + + %add = add i64 %load2, %load1 + + store i64 %add, i64 addrspace(1)* %buffer_head, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir b/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir new file mode 100644 index 0000000000000000000000000000000000000000..b5e4032a56f7917bf0d98974d06d92cb5ba3c61c --- /dev/null +++ b/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir @@ -0,0 +1,190 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s + +# GFX9-LABEL: name: diffoporder_add +# GFX9: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, -2048, 0, 0 +# GFX9: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, 0, 0, 0 + +name: diffoporder_add +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %4:sreg_32_xm0 = COPY $sgpr101 + %5:sreg_32_xm0 = S_MOV_B32 0 + $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3 + $sgpr4 = COPY %4 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + %6:vreg_64 = COPY $vgpr0_vgpr1 + %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec + %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1 + %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec + %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec + %12:sgpr_32 = COPY %1.sub1 + %13:vgpr_32 = COPY %5 + %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec + %16:vgpr_32 = COPY %12 + %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec + %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1 + %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec + %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec + %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec + %25:sgpr_32 = S_MOV_B32 4096 + %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %25, %21, implicit $exec + %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec + %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec + %32:sgpr_32 = S_MOV_B32 6144 + %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec + %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec + %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec +... +--- + +# GFX9-LABEL: name: LowestInMiddle +# GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 11200 +# GFX9: [[BASE_LO:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]] +# GFX9: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_5]] +# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE_LO]], %subreg.sub0, [[BASE_HI]], %subreg.sub1 +# GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -3200, 0, 0 +# +# GFX9: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 6400 +# GFX9: [[BASE1_LO:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_7:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_2]] +# GFX9: [[BASE1_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_7]] +# GFX9: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE1_LO]], %subreg.sub0, [[BASE1_HI]], %subreg.sub1 +# GFX9: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE3]], 0, 0, 0, +# GFX9: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, 0, + +name: LowestInMiddle +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %4:sreg_32_xm0 = COPY $sgpr101 + %5:sreg_32_xm0 = S_MOV_B32 0 + $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3 + $sgpr4 = COPY %4 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + %6:vreg_64 = COPY $vgpr0_vgpr1 + %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec + %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1 + %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec + %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec + %12:sgpr_32 = COPY %1.sub1 + %13:vgpr_32 = COPY %5 + %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec + %16:vgpr_32 = COPY %12 + %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec + %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1 + %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec + %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec + %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec + %25:sgpr_32 = S_MOV_B32 8000 + %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec + %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec + %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec + %32:sgpr_32 = S_MOV_B32 6400 + %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec + %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec + %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec + %39:sgpr_32 = S_MOV_B32 11200 + %40:vgpr_32, %41:sreg_64_xexec = V_ADD_I32_e64 %21, %39, implicit $exec + %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, implicit $exec + %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1 + %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, implicit $exec +... +--- + +# GFX9-LABEL: name: NegativeDistance +# GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 10240 +# GFX9: [[V_ADD_I32_e64_4:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]] +# GFX9: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_5]] +# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_4]], %subreg.sub0, [[BASE_HI]], %subreg.sub1 +# GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -4096, 0, 0 +# GFX9: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -2048, 0, 0 +# GFX9: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, 0 + +name: NegativeDistance +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %4:sreg_32_xm0 = COPY $sgpr101 + %5:sreg_32_xm0 = S_MOV_B32 0 + $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3 + $sgpr4 = COPY %4 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + %6:vreg_64 = COPY $vgpr0_vgpr1 + %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec + %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1 + %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec + %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec + %12:sgpr_32 = COPY %1.sub1 + %13:vgpr_32 = COPY %5 + %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec + %16:vgpr_32 = COPY %12 + %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec + %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1 + %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec + %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec + %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec + %25:sgpr_32 = S_MOV_B32 6144 + %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec + %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec + %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec + %32:sgpr_32 = S_MOV_B32 8192 + %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec + %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec + %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec + %39:sgpr_32 = S_MOV_B32 10240 + %40:vgpr_32, %41:sreg_64_xexec = V_ADD_I32_e64 %21, %39, implicit $exec + %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, implicit $exec + %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1 + %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, implicit $exec +... +--- + +# Tests for a successful compilation. +name: assert_hit +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %4:sreg_32_xm0 = COPY $sgpr101 + %5:sreg_32_xm0 = S_MOV_B32 0 + $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3 + $sgpr4 = COPY %4 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + %6:vreg_64 = COPY $vgpr0_vgpr1 + %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec + %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1 + %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec + %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec + %12:sgpr_32 = COPY %1.sub1 + %13:vgpr_32 = COPY %5 + %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec + %16:vgpr_32 = COPY %12 + %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec + %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1 + %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec + %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec + %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec + + %25:sgpr_32 = S_MOV_B32 6144 + %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec + %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 4294967295, killed %27, implicit $exec + %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec +... diff --git a/test/CodeGen/AMDGPU/ret.ll b/test/CodeGen/AMDGPU/ret.ll index 265d37e8d641601c67579e9fab4f43752dfd5b61..b7d202638dbce46e7b79d5c2eea0ee4258589db2 100644 --- a/test/CodeGen/AMDGPU/ret.ll +++ b/test/CodeGen/AMDGPU/ret.ll @@ -17,14 +17,13 @@ bb: } ; GCN-LABEL: {{^}}vgpr_literal: -; GCN: v_mov_b32_e32 v4, v0 -; GCN: exp mrt0 v4, v4, v4, v4 done vm +; GCN: exp mrt0 v0, v0, v0, v0 done vm ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: v_mov_b32_e32 v1, 2.0 ; GCN-DAG: v_mov_b32_e32 v2, 4.0 ; GCN-DAG: v_mov_b32_e32 v3, -1.0 -; GCN: s_waitcnt expcnt(0) +; GCN-DAG: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { bb: @@ -226,15 +225,14 @@ bb: } ; GCN-LABEL: {{^}}structure_literal: -; GCN: v_mov_b32_e32 v3, v0 -; GCN: exp mrt0 v3, v3, v3, v3 done vm +; GCN: exp mrt0 v0, v0, v0, v0 done vm ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: s_mov_b32 s0, 2 ; GCN-DAG: s_mov_b32 s1, 3 ; GCN-DAG: v_mov_b32_e32 v1, 2.0 ; GCN-DAG: v_mov_b32_e32 v2, 4.0 -; GCN: s_waitcnt expcnt(0) +; GCN-DAG: s_waitcnt expcnt(0) define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { bb: call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 diff --git a/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll b/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll new file mode 100644 index 0000000000000000000000000000000000000000..6e7d24cdc3c73c9bc888a29bf61495e8b13158f5 --- /dev/null +++ b/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-OPT %s +; RUN: llc -march=amdgcn -mcpu=fiji -O0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-NOOPT %s + +; GCN-LABEL: {{^}}scalar_to_vector_i16: +; GCN-NOOPT: s_mov_b32 [[S:s[0-9]+]], 42 +; GCN-NOOPT: v_mov_b32_e32 [[V:v[0-9]+]], [[S]] +; GCN-OPT: v_mov_b32_e32 [[V:v[0-9]+]], 42 +; GCN: buffer_store_short [[V]], +define void @scalar_to_vector_i16() { + %tmp = load <2 x i16>, <2 x i16> addrspace(5)* undef + %tmp1 = insertelement <2 x i16> %tmp, i16 42, i64 0 + store <2 x i16> %tmp1, <2 x i16> addrspace(5)* undef + ret void +} + +; GCN-LABEL: {{^}}scalar_to_vector_f16: +; GCN-NOOPT: s_movk_i32 [[S:s[0-9]+]], 0x3c00 +; GCN-NOOPT: v_mov_b32_e32 [[V:v[0-9]+]], [[S]] +; GCN-OPT: v_mov_b32_e32 [[V:v[0-9]+]], 0x3c00 +; GCN: buffer_store_short [[V]], +define void @scalar_to_vector_f16() { + %tmp = load <2 x half>, <2 x half> addrspace(5)* undef + %tmp1 = insertelement <2 x half> %tmp, half 1.0, i64 0 + store <2 x half> %tmp1, <2 x half> addrspace(5)* undef + ret void +} diff --git a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll index 5edc2c5c9b713eeadd0f6b1550c2b36bd6b819c3..d93cde6a885f76fc2f9b60f8f4a45898963d9b8f 100644 --- a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -24,7 +24,7 @@ main_body: %array_vector9 = insertelement <4 x float> , float %tmp1, i32 1 %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0) call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1) %bc = bitcast <4 x float> %array_vector3 to <4 x i32> %tmp4 = extractelement <4 x i32> %bc, i32 undef @@ -46,7 +46,7 @@ main_body: } declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3 attributes #0 = { nounwind "target-cpu"="tonga" } diff --git a/test/CodeGen/AMDGPU/sdwa-op64-test.ll b/test/CodeGen/AMDGPU/sdwa-op64-test.ll new file mode 100644 index 0000000000000000000000000000000000000000..6ed12078bddb46a15e25f4a65d1a63484f554af3 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdwa-op64-test.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=FIJI,GCN %s + +; GCN-LABEL: {{^}}test_add_co_sdwa: +; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp + %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 + %tmp5 = and i32 %tmp4, 255 + %tmp6 = zext i32 %tmp5 to i64 + %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp + %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8 + %tmp9 = add nsw i64 %tmp8, %tmp6 + store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8 + ret void +} + + +; GCN-LABEL: {{^}}test_sub_co_sdwa: +; GFX9: v_sub_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9: v_subbrev_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +; FIJI: v_sub_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FIJI: v_subbrev_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp + %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 + %tmp5 = and i32 %tmp4, 255 + %tmp6 = zext i32 %tmp5 to i64 + %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp + %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8 + %tmp9 = sub nsw i64 %tmp8, %tmp6 + store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8 + ret void +} + +; GCN-LABEL: {{^}}test1_add_co_sdwa: +; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +; GFX9: v_add_co_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} +define amdgpu_kernel void @test1_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1, i64 addrspace(1)* %arg2) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp + %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 + %tmp5 = and i32 %tmp4, 255 + %tmp6 = zext i32 %tmp5 to i64 + %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp + %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8 + %tmp9 = add nsw i64 %tmp8, %tmp6 + store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8 + %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp + %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 + %tmp15 = and i32 %tmp14, 255 + %tmp16 = zext i32 %tmp15 to i64 + %tmp17 = getelementptr inbounds i64, i64 addrspace(1)* %arg2, i32 %tmp + %tmp18 = load i64, i64 addrspace(1)* %tmp17, align 8 + %tmp19 = add nsw i64 %tmp18, %tmp16 + store i64 %tmp19, i64 addrspace(1)* %tmp17, align 8 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/sdwa-ops.mir b/test/CodeGen/AMDGPU/sdwa-ops.mir new file mode 100644 index 0000000000000000000000000000000000000000..9660c88bbb027dfcc7420ad002ec7e4af304ac39 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdwa-ops.mir @@ -0,0 +1,390 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s + +# test for 3 consecutive _sdwa's +# GFX9-LABEL: name: test1_add_co_sdwa +# GFX9: V_ADD_I32_sdwa +# GFX9-NEXT: V_ADDC_U32_e32 +# GFX9: V_ADD_I32_sdwa +# GFX9-NEXT: V_ADDC_U32_e32 +# GFX9: V_ADD_I32_sdwa +# GFX9-NEXT: V_ADDC_U32_e32 +--- +name: test1_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %64:vgpr_32, dead %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %163:vgpr_32, %165:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %161, implicit $exec + %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, implicit $exec + %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %162, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + %171:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %173:vgpr_32, %175:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %171, implicit $exec + %174:vgpr_32, dead %176:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %175, implicit $exec + %172:vreg_64 = REG_SEQUENCE %173, %subreg.sub0, %174, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %172, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + +... + +# test for VCC interference on sdwa, should generate 1 xform only +# GFX9-LABEL: name: test2_add_co_sdwa +# GFX9: V_ADD_I32_sdwa +# GFX9: V_ADDC_U32_e32 +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9-NOT: V_ADDC_U32_e32 +--- +name: test2_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + + %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %163:vgpr_32, %165:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %161, implicit $exec + %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, implicit $exec + %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1 + + %64:vgpr_32, dead %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %163:vgpr_32, %165:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %161, implicit $exec + %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, implicit $exec + %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %162, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + +... + +# test for CarryOut used, should reject +# GFX9-LABEL: name: test3_add_co_sdwa +# GFX9: V_ADD_I32_e64 +# GFX9: V_ADDC_U32_e64 +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9-NOT: V_ADDC_U32_e32 +--- +name: test3_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %66, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + +... + +# test for CarryIn used more than once, should reject +# GFX9-LABEL: name: test4_add_co_sdwa +# GFX9: V_ADD_I32_e64 +# GFX9: V_ADDC_U32_e64 +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9-NOT: V_ADDC_U32_e32 +--- +name: test4_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %65, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + +... + +# test for simple example, should generate sdwa +# GFX9-LABEL: name: test5_add_co_sdwa +# GFX9: V_ADD_I32_sdwa +# GFX9: V_ADDC_U32_e32 +--- +name: test5_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + +... + +# test for V_ADD_I32_e64 only, should reject +# GFX9-LABEL: name: test6_add_co_sdwa +# GFX9: V_ADD_I32_e64 +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9-NOT: V_ADDC_U32_e32 +--- +name: test6_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %23, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + +... + +# test for V_ADDC_U32_e64 only, should reject +# GFX9-LABEL: name: test7_add_co_sdwa +# GFX9: V_ADDC_U32_e64 +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9-NOT: V_ADDC_U32_e32 +--- +name: test7_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %24:sreg_64_xexec = COPY $sgpr0_sgpr1 + + %30:vreg_64 = COPY $sgpr0_sgpr1 + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %24, implicit $exec + %62:vreg_64 = REG_SEQUENCE %23, %subreg.sub0, %23, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + +... + +# test for $vcc defined between two adds, should not generate +# GFX9-LABEL: name: test8_add_co_sdwa +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9: V_ADDC_U32_e64 +--- +name: test8_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %30:vreg_64 = COPY $sgpr0_sgpr1 + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + $vcc = COPY %30 + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec + %31:vreg_64 = COPY $vcc + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %31, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + + +... + +# test for non dead $vcc, should not generate +# GFX9-LABEL: name: test9_add_co_sdwa +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9: V_ADDC_U32_e64 +--- +name: test9_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %30:vreg_64 = COPY $sgpr0_sgpr1 + $vcc = COPY %30 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec + %31:vreg_64 = COPY $vcc + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %31, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + +... + +# test for def $vcc_lo, should not generate +# GFX9-LABEL: name: test10_add_co_sdwa +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9: V_ADDC_U32_e64 +--- +name: test10_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %30:vreg_64 = COPY $sgpr0_sgpr1 + $vcc_lo = COPY %30.sub0 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %31:vgpr_32 = COPY $vcc_lo + %32:vreg_64 = REG_SEQUENCE %31, %subreg.sub0, %23, %subreg.sub1 + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %32, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + +... + +# test for read $vcc_hi, should not generate +# GFX9-LABEL: name: test11_add_co_sdwa +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9: V_ADDC_U32_e64 +--- +name: test11_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %30:vreg_64 = COPY $sgpr0_sgpr1 + $vcc_hi = COPY %30.sub0 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + %31:vgpr_32 = COPY $vcc_hi + %32:vreg_64 = REG_SEQUENCE %31, %subreg.sub0, %23, %subreg.sub1 + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %32, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + +... + +# test for $vcc defined and used between adds, should not generate +# GFX9-LABEL: name: test12_add_co_sdwa +# GFX9-NOT: V_ADD_I32_sdwa +# GFX9: V_ADDC_U32_e64 +--- +name: test12_add_co_sdwa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } +liveins: + - { reg: '$vgpr0', virtual-reg: '%0' } + - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' } +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %22:sreg_32_xm0 = S_MOV_B32 255 + %30:vreg_64 = COPY $sgpr0_sgpr1 + %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec + %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, implicit $exec + $vcc = COPY %30 + %31:vreg_64 = COPY killed $vcc + %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, implicit $exec + %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 + GLOBAL_STORE_DWORDX2_SADDR %31, %62, %1, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll index 3c92e8e5cba3d65a4c294c2fc1d2bd96ee2944b4..6beac4904a704ef4251019c7f06faa95bc10c287 100644 --- a/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -501,7 +501,12 @@ entry: ; GCN-LABEL: {{^}}sdwa_crash_inlineasm_def: ; GCN: s_mov_b32 s{{[0-9]+}}, 0xffff ; GCN: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x10000, +; +; TODO: Why is the constant not peepholed into the v_or_b32_e32? +; +; NOSDWA: s_mov_b32 [[CONST:s[0-9]+]], 0x10000 +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, s0, +; SDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000, define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { bb: br label %bb1 diff --git a/test/CodeGen/AMDGPU/select-undef.ll b/test/CodeGen/AMDGPU/select-undef.ll new file mode 100644 index 0000000000000000000000000000000000000000..6597d6784e0c2367ba62c024bf3c1f520831c681 --- /dev/null +++ b/test/CodeGen/AMDGPU/select-undef.ll @@ -0,0 +1,45 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}select_undef_lhs: +; GCN: s_waitcnt +; GCN-NOT: v_cmp +; GCN-NOT: v_cndmask +; GCN-NEXT: s_setpc_b64 +define float @select_undef_lhs(float %val, i1 %cond) { + %undef = call float @llvm.amdgcn.rcp.f32(float undef) + %sel = select i1 %cond, float %undef, float %val + ret float %sel +} + +; GCN-LABEL: {{^}}select_undef_rhs: +; GCN: s_waitcnt +; GCN-NOT: v_cmp +; GCN-NOT: v_cndmask +; GCN-NEXT: s_setpc_b64 +define float @select_undef_rhs(float %val, i1 %cond) { + %undef = call float @llvm.amdgcn.rcp.f32(float undef) + %sel = select i1 %cond, float %val, float %undef + ret float %sel +} + +; GCN-LABEL: {{^}}select_undef_n1: +; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0 +; GCN: store_dword {{[^,]+}}, [[RES]] +define void @select_undef_n1(float addrspace(1)* %a, i32 %c) { + %cc = icmp eq i32 %c, 0 + %sel = select i1 %cc, float 1.000000e+00, float undef + store float %sel, float addrspace(1)* %a + ret void +} + +; GCN-LABEL: {{^}}select_undef_n2: +; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0 +; GCN: store_dword {{[^,]+}}, [[RES]] +define void @select_undef_n2(float addrspace(1)* %a, i32 %c) { + %cc = icmp eq i32 %c, 0 + %sel = select i1 %cc, float undef, float 1.000000e+00 + store float %sel, float addrspace(1)* %a + ret void +} + +declare float @llvm.amdgcn.rcp.f32(float) diff --git a/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir index be8b207a74083cd06d2babf67d6c174a9e0289ee..9481e98502d6e095676ca5e807adcc210213d275 100644 --- a/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ b/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -75,23 +75,22 @@ name: sgpr_spill_wrong_stack_id tracksRegLiveness: true frameInfo: - adjustsStack: false hasCalls: true body: | - bb.0.bb: - %8:sreg_32_xm0 = COPY $sgpr5 - %4:vreg_64 = IMPLICIT_DEF - %3:vgpr_32 = FLAT_LOAD_DWORD %4, 0, 0, 0, implicit $exec, implicit $flat_scr - %5:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc + bb.0: + %0:sreg_32_xm0 = COPY $sgpr5 + %1:vreg_64 = IMPLICIT_DEF + %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit $exec, implicit $flat_scr + %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 - dead $sgpr30_sgpr31 = SI_CALL %5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 - $sgpr5 = COPY %8 - %12:sreg_32_xm0 = COPY $sgpr5 + dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 + $sgpr5 = COPY %0 + %4:sreg_32_xm0 = COPY $sgpr5 ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 - $vgpr0 = COPY %3 - dead $sgpr30_sgpr31 = SI_CALL %5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0 - $sgpr5 = COPY %12 + $vgpr0 = COPY %2 + dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0 + $sgpr5 = COPY %4 ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 ... diff --git a/test/CodeGen/AMDGPU/shl_add.ll b/test/CodeGen/AMDGPU/shl_add.ll new file mode 100644 index 0000000000000000000000000000000000000000..062198a8cc8a0d0ba83bd56490225034a228c55e --- /dev/null +++ b/test/CodeGen/AMDGPU/shl_add.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s + +; =================================================================================== +; V_LSHL_ADD_U32 +; =================================================================================== + +define amdgpu_ps float @shl_add(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: shl_add: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_add: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshl_add_u32 v0, v0, v1, v2 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, %b + %result = add i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +; ThreeOp instruction variant not used due to Constant Bus Limitations +define amdgpu_ps float @shl_add_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) { +; VI-LABEL: shl_add_vgpr_a: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s3, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_add_vgpr_a: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s3, v0 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, %b + %result = add i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @shl_add_vgpr_all(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: shl_add_vgpr_all: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_add_vgpr_all: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshl_add_u32 v0, v0, v1, v2 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, %b + %result = add i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @shl_add_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) { +; VI-LABEL: shl_add_vgpr_ab: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_add_vgpr_ab: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshl_add_u32 v0, v0, v1, s2 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, %b + %result = add i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @shl_add_vgpr_const(i32 %a, i32 %b) { +; VI-LABEL: shl_add_vgpr_const: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_add_vgpr_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 3, v1 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, 3 + %result = add i32 %x, %b + %bc = bitcast i32 %result to float + ret float %bc +} diff --git a/test/CodeGen/AMDGPU/shl_or.ll b/test/CodeGen/AMDGPU/shl_or.ll new file mode 100644 index 0000000000000000000000000000000000000000..24d34efa28b1c26991364c11088c3de5a9341b03 --- /dev/null +++ b/test/CodeGen/AMDGPU/shl_or.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s + +; =================================================================================== +; V_LSHL_OR_B32 +; =================================================================================== + +define amdgpu_ps float @shl_or(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: shl_or: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_or: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshl_or_b32 v0, v0, v1, v2 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, %b + %result = or i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @shl_or_vgpr_c(i32 inreg %a, i32 inreg %b, i32 %c) { +; VI-LABEL: shl_or_vgpr_c: +; VI: ; %bb.0: +; VI-NEXT: s_lshl_b32 s0, s2, s3 +; VI-NEXT: v_or_b32_e32 v0, s0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_or_vgpr_c: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshl_b32 s0, s2, s3 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, %b + %result = or i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @shl_or_vgpr_all2(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: shl_or_vgpr_all2: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_or_vgpr_all2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshl_or_b32 v0, v0, v1, v2 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, %b + %result = or i32 %c, %x + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @shl_or_vgpr_ac(i32 %a, i32 inreg %b, i32 %c) { +; VI-LABEL: shl_or_vgpr_ac: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_or_vgpr_ac: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshl_or_b32 v0, v0, s2, v1 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, %b + %result = or i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @shl_or_vgpr_const(i32 %a, i32 %b) { +; VI-LABEL: shl_or_vgpr_const: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, 6, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_or_vgpr_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshl_or_b32 v0, v0, v1, 6 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, %b + %result = or i32 %x, 6 + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @shl_or_vgpr_const2(i32 %a, i32 %b) { +; VI-LABEL: shl_or_vgpr_const2: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_or_vgpr_const2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 6, v1 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, 6 + %result = or i32 %x, %b + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @shl_or_vgpr_const_scalar1(i32 inreg %a, i32 %b) { +; VI-LABEL: shl_or_vgpr_const_scalar1: +; VI: ; %bb.0: +; VI-NEXT: s_lshl_b32 s0, s2, 6 +; VI-NEXT: v_or_b32_e32 v0, s0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_or_vgpr_const_scalar1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshl_or_b32 v0, s2, 6, v0 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, 6 + %result = or i32 %x, %b + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @shl_or_vgpr_const_scalar2(i32 %a, i32 inreg %b) { +; VI-LABEL: shl_or_vgpr_const_scalar2: +; VI: ; %bb.0: +; VI-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; VI-NEXT: v_or_b32_e32 v0, s2, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: shl_or_vgpr_const_scalar2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 6, s2 +; GFX9-NEXT: ; return to shader part epilog + %x = shl i32 %a, 6 + %result = or i32 %x, %b + %bc = bitcast i32 %result to float + ret float %bc +} diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 7c0f5b9a4711d618eb48acd41f8b985e25bf6b98..e9a6ba9894261660eca007becf62a088cc3ce4bd 100644 --- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,5 +1,5 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) @@ -265,16 +265,15 @@ define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %o ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:28 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:44 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52 -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off{{$}} -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:20 - -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:36 -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:52 define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 diff --git a/test/CodeGen/AMDGPU/sibling-call.ll b/test/CodeGen/AMDGPU/sibling-call.ll index efc01083a3673734fcf1c68ec330d61f5530f751..260953b184c3e23e103d973e2b97c357595c2d87 100644 --- a/test/CodeGen/AMDGPU/sibling-call.ll +++ b/test/CodeGen/AMDGPU/sibling-call.ll @@ -136,8 +136,7 @@ entry: ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9: v_add_u32_e32 v0, v0, [[LOAD_0]] -; GFX9: v_add_u32_e32 v0, v0, [[LOAD_1]] +; GFX9: v_add3_u32 v0, v0, v3, v2 ; GCN-NEXT: s_setpc_b64 define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 { diff --git a/test/CodeGen/AMDGPU/simplify-libcalls.ll b/test/CodeGen/AMDGPU/simplify-libcalls.ll index 24117e2c83689377063b698540ebfd2baba62b2e..859f848d228c4448b092c389b67f7462f8ab9cda 100644 --- a/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -783,5 +783,10 @@ entry: ret void } +; GCN-PRELINK: declare float @_Z4fabsf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]] +; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY]] +; GCN-PRELINK: declare float @_Z11native_sqrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY]] + ; CGN-PRELINK: attributes #[[$NOUNWIND]] = { nounwind } +; GCN-PRELINK: attributes #[[$NOUNWIND_READONLY]] = { nounwind readonly } attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll index 42a28b9527391240b33604c5932e5d02132618b4..a02e6f8349cd3b7a623bd39f123e90fc2ffdd74b 100644 --- a/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -2,9 +2,10 @@ ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos: ; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 { - call void @llvm.AMDGPU.kill(float 0.0) + call void @llvm.amdgcn.kill(i1 true) ret void } @@ -14,7 +15,7 @@ define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 { ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 { - call void @llvm.AMDGPU.kill(float -0.0) + call void @llvm.amdgcn.kill(i1 false) ret void } @@ -27,58 +28,62 @@ define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 { ; CHECK-NEXT: ; %bb.2: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 { - call void @llvm.AMDGPU.kill(float -0.0) - call void @llvm.AMDGPU.kill(float -1.0) + call void @llvm.amdgcn.kill(i1 false) + call void @llvm.amdgcn.kill(i1 false) ret void } ; CHECK-LABEL: {{^}}test_kill_depth_var: ; CHECK-NEXT: ; %bb.0: -; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_var(float %x) #0 { - call void @llvm.AMDGPU.kill(float %x) + %cmp = fcmp olt float %x, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp) ret void } ; FIXME: Ideally only one would be emitted ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same: ; CHECK-NEXT: ; %bb.0: -; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0 ; CHECK-NEXT: ; %bb.2: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 { - call void @llvm.AMDGPU.kill(float %x) - call void @llvm.AMDGPU.kill(float %x) + %cmp = fcmp olt float %x, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp) + call void @llvm.amdgcn.kill(i1 %cmp) ret void } ; CHECK-LABEL: {{^}}test_kill_depth_var_x2: ; CHECK-NEXT: ; %bb.0: -; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1 ; CHECK-NEXT: ; %bb.2: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 { - call void @llvm.AMDGPU.kill(float %x) - call void @llvm.AMDGPU.kill(float %y) + %cmp.x = fcmp olt float %x, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp.x) + %cmp.y = fcmp olt float %y, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp.y) ret void } ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions: ; CHECK-NEXT: ; %bb.0: -; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0 ; CHECK-NEXT: s_cbranch_execnz BB6_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: exp ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: BB6_2: ; CHECK: v_mov_b32_e64 v7, -1 -; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7 +; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7 ; CHECK-NEXT: s_cbranch_execnz BB6_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: exp @@ -86,9 +91,11 @@ define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 { ; CHECK-NEXT: BB6_4: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 { - call void @llvm.AMDGPU.kill(float %x) + %cmp.x = fcmp olt float %x, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp.x) %y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={v7}"() - call void @llvm.AMDGPU.kill(float %y) + %cmp.y = fcmp olt float %y, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp.y) ret void } @@ -111,7 +118,7 @@ define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 { ; CHECK: v_nop_e64 ; CHECK: v_nop_e64 -; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7 +; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7 ; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]] ; CHECK-NEXT: ; %bb.2: ; CHECK-NEXT: exp null off, off, off, off done vm @@ -137,7 +144,8 @@ bb: v_nop_e64 v_nop_e64 v_nop_e64", "={v7}"() - call void @llvm.AMDGPU.kill(float %var) + %cmp.var = fcmp olt float %var, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp.var) br label %exit exit: @@ -162,7 +170,7 @@ exit: ; CHECK: ;;#ASMEND ; CHECK: v_mov_b32_e64 v8, -1 ; CHECK: ;;#ASMEND -; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7 +; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7 ; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]] ; CHECK-NEXT: ; %bb.2: @@ -196,7 +204,8 @@ bb: v_nop_e64 v_nop_e64", "={v7}"() %live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={v8}"() - call void @llvm.AMDGPU.kill(float %var) + %cmp.var = fcmp olt float %var, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp.var) store volatile float %live.across, float addrspace(1)* undef %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={v9}"() br label %exit @@ -221,7 +230,7 @@ exit: ; CHECK: v_mov_b32_e64 v7, -1 ; CHECK: v_nop_e64 -; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7 +; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7 ; CHECK-NEXT: ; %bb.3: ; CHECK: buffer_load_dword [[LOAD:v[0-9]+]] @@ -251,7 +260,8 @@ bb: v_nop_e64 v_nop_e64 v_nop_e64", "={v7}"() - call void @llvm.AMDGPU.kill(float %var) + %cmp.var = fcmp olt float %var, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp.var) %vgpr = load volatile i32, i32 addrspace(1)* undef %loop.cond = icmp eq i32 %vgpr, 0 br i1 %loop.cond, label %bb, label %exit @@ -264,7 +274,7 @@ exit: ; bug 28550 ; CHECK-LABEL: {{^}}phi_use_def_before_kill: ; CHECK: v_cndmask_b32_e64 [[PHIREG:v[0-9]+]], 0, -1.0, -; CHECK: v_cmpx_le_f32_e32 vcc, 0, +; CHECK: v_cmpx_lt_f32_e32 vcc, 0, ; CHECK-NEXT: s_cbranch_execnz [[BB4:BB[0-9]+_[0-9]+]] ; CHECK: exp @@ -288,7 +298,8 @@ bb: %tmp = fadd float %x, 1.000000e+00 %tmp1 = fcmp olt float 0.000000e+00, %tmp %tmp2 = select i1 %tmp1, float -1.000000e+00, float 0.000000e+00 - call void @llvm.AMDGPU.kill(float %tmp2) + %cmp.tmp2 = fcmp olt float %tmp2, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp.tmp2) br i1 undef, label %phibb, label %bb8 phibb: @@ -335,7 +346,7 @@ bb5: ; preds = %bb4, %bb3 unreachable bb6: ; preds = %bb - call void @llvm.AMDGPU.kill(float -1.000000e+00) + call void @llvm.amdgcn.kill(i1 false) unreachable bb7: ; preds = %bb4 @@ -348,7 +359,7 @@ bb7: ; preds = %bb4 ; CHECK: s_xor_b64 ; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]] -; CHECK: v_cmpx_le_f32_e32 vcc, 0, +; CHECK: v_cmpx_gt_f32_e32 vcc, 0, ; CHECK: [[BB4]]: ; CHECK: s_or_b64 exec, exec ; CHECK: image_sample_c @@ -369,7 +380,8 @@ bb: br i1 %tmp, label %bb3, label %bb4 bb3: ; preds = %bb - call void @llvm.AMDGPU.kill(float %arg) + %cmp.arg = fcmp olt float %arg, 0.0 + call void @llvm.amdgcn.kill(i1 %cmp.arg) br label %bb4 bb4: ; preds = %bb3, %bb @@ -387,7 +399,7 @@ bb9: ; preds = %bb4 } declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare void @llvm.AMDGPU.kill(float) #0 +declare void @llvm.amdgcn.kill(i1) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/smed3.ll b/test/CodeGen/AMDGPU/smed3.ll index 22f11da4648b5cc2ede211559003c2c1a2a1047c..22c3b2d42f35af1822894ebb5336c0180c02f67c 100644 --- a/test/CodeGen/AMDGPU/smed3.ll +++ b/test/CodeGen/AMDGPU/smed3.ll @@ -364,6 +364,211 @@ bb: ret void } +; 16 combinations + +; 16: min(max(x, y), max(min(x, y), z)) +; 17: min(max(x, y), max(min(y, x), z)) +; 18: min(max(x, y), max(z, min(x, y))) +; 19: min(max(x, y), max(z, min(y, x))) +; 20: min(max(y, x), max(min(x, y), z)) +; 21: min(max(y, x), max(min(y, x), z)) +; 22: min(max(y, x), max(z, min(x, y))) +; 23: min(max(y, x), max(z, min(y, x))) +; +; + commute outermost min + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_16: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_16(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %x, i32 %y) + %tmp1 = call i32 @smax(i32 %x, i32 %y) + %tmp2 = call i32 @smax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_17: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_17(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %y, i32 %x) + %tmp1 = call i32 @smax(i32 %x, i32 %y) + %tmp2 = call i32 @smax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_18: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_18(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %x, i32 %y) + %tmp1 = call i32 @smax(i32 %x, i32 %y) + %tmp2 = call i32 @smax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_19: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_19(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %y, i32 %x) + %tmp1 = call i32 @smax(i32 %x, i32 %y) + %tmp2 = call i32 @smax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_20: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_20(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %x, i32 %y) + %tmp1 = call i32 @smax(i32 %y, i32 %x) + %tmp2 = call i32 @smax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_21: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_21(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %y, i32 %x) + %tmp1 = call i32 @smax(i32 %y, i32 %x) + %tmp2 = call i32 @smax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_22: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_22(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %x, i32 %y) + %tmp1 = call i32 @smax(i32 %y, i32 %x) + %tmp2 = call i32 @smax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_23: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_23(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %y, i32 %x) + %tmp1 = call i32 @smax(i32 %y, i32 %x) + %tmp2 = call i32 @smax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_24: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_24(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %x, i32 %y) + %tmp1 = call i32 @smax(i32 %x, i32 %y) + %tmp2 = call i32 @smax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_25: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_25(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %y, i32 %x) + %tmp1 = call i32 @smax(i32 %x, i32 %y) + %tmp2 = call i32 @smax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_26: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_26(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %x, i32 %y) + %tmp1 = call i32 @smax(i32 %x, i32 %y) + %tmp2 = call i32 @smax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_27: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_27(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %y, i32 %x) + %tmp1 = call i32 @smax(i32 %x, i32 %y) + %tmp2 = call i32 @smax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_28: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_28(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %x, i32 %y) + %tmp1 = call i32 @smax(i32 %y, i32 %x) + %tmp2 = call i32 @smax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_29: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_29(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %y, i32 %x) + %tmp1 = call i32 @smax(i32 %y, i32 %x) + %tmp2 = call i32 @smax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_30: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_30(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %x, i32 %y) + %tmp1 = call i32 @smax(i32 %y, i32 %x) + %tmp2 = call i32 @smax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_smed3_i32_pat_31: +; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_smed3_i32_pat_31(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @smin(i32 %y, i32 %x) + %tmp1 = call i32 @smax(i32 %y, i32 %x) + %tmp2 = call i32 @smax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + ; FIXME: Should keep scalar or not promote ; GCN-LABEL: {{^}}s_test_smed3_i16_pat_0: ; GCN: s_sext_i32_i16 @@ -476,6 +681,28 @@ bb: ret void } +; GCN-LABEL: {{^}}v_test_smed3_i16_pat_1: +; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +define amdgpu_kernel void @v_test_smed3_i16_pat_1(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid + %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3 + %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8 + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %x = load i16, i16 addrspace(1)* %gep0 + %y = load i16, i16 addrspace(1)* %gep1 + %z = load i16, i16 addrspace(1)* %gep2 + + %tmp0 = call i16 @smin16(i16 %x, i16 %y) + %tmp1 = call i16 @smax16(i16 %x, i16 %y) + %tmp2 = call i16 @smax16(i16 %tmp0, i16 %z) + %tmp3 = call i16 @smin16(i16 %tmp1, i16 %tmp2) + store i16 %tmp3, i16 addrspace(1)* %out.gep + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } attributes #2 = { nounwind readnone alwaysinline } diff --git a/test/CodeGen/AMDGPU/smrd-fold-offset.mir b/test/CodeGen/AMDGPU/smrd-fold-offset.mir index 44954f0652313a4cf80f8790c248de28759c8972..10601ccaeb77dbcd1e4ad8bfeccce23a5f5cd39b 100644 --- a/test/CodeGen/AMDGPU/smrd-fold-offset.mir +++ b/test/CodeGen/AMDGPU/smrd-fold-offset.mir @@ -1,6 +1,8 @@ # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies -o - %s | FileCheck -check-prefix=GCN %s -# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095 +# GCN-LABEL: name: smrd_vgpr_offset_imm +# GCN: V_READFIRSTLANE_B32 +# GCN: S_BUFFER_LOAD_DWORD_SGPR --- name: smrd_vgpr_offset_imm body: | @@ -22,7 +24,9 @@ body: | SI_RETURN_TO_EPILOG $vgpr0 ... -# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095 +# GCN-LABEL: name: smrd_vgpr_offset_imm_add_u32 +# GCN: V_READFIRSTLANE_B32 +# GCN: S_BUFFER_LOAD_DWORD_SGPR --- name: smrd_vgpr_offset_imm_add_u32 body: | diff --git a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll index 3e0fd566cc7bb47cd37afc4ee86f28533c8b12c6..823785dc752dc17809e381543219fd47c42963b2 100644 --- a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -5,7 +5,7 @@ ; GCN-FUNC: {{^}}vccz_workaround: ; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0 ; GCN: v_cmp_neq_f32_e64 {{[^,]*}}, s{{[0-9]+}}, 0{{$}} -; VCCZ-BUG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VCCZ-BUG: s_waitcnt lgkmcnt(0) ; VCCZ-BUG: s_mov_b64 vcc, vcc ; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc ; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]] diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll index c87145a1a5ba7a740dcc8be4a03ad6ad7893bcfe..22ee62ef427606039e07d2bc6a026cccd485b881 100644 --- a/test/CodeGen/AMDGPU/smrd.ll +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -292,18 +292,19 @@ main_body: ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm: ; GCN-NEXT: %bb. -; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ; +; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ; define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 { main_body: - %off = add i32 %offset, 4095 + %off = add i32 %offset, 4092 %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off) ret float %r } ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large: ; GCN-NEXT: %bb. -; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0 -; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ; +; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0 +; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ; +; VIGFX9-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ; define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 { main_body: %off = add i32 %offset, 4096 @@ -332,7 +333,7 @@ main_body: ; GCN-LABEL: {{^}}smrd_imm_merge_m0: ; -; SICIVI: s_buffer_load_dwordx2 +; GCN: s_buffer_load_dwordx2 ; SICIVI: s_mov_b32 m0 ; SICIVI_DAG: v_interp_p1_f32 ; SICIVI_DAG: v_interp_p1_f32 @@ -340,13 +341,15 @@ main_body: ; SICIVI_DAG: v_interp_p2_f32 ; SICIVI_DAG: v_interp_p2_f32 ; SICIVI_DAG: v_interp_p2_f32 -; SICIVI: s_mov_b32 m0 -; SICIVI: v_movrels_b32_e32 +; +; extractelement does not result in movrels anymore for vectors gitting 8 dwords +; SICIVI-NOT: s_mov_b32 m0 +; SICIVI-NOT: v_movrels_b32_e32 +; v_cndmask_b32_e32 +; v_cndmask_b32_e32 ; ; Merging is still thwarted on GFX9 due to s_set_gpr_idx ; -; GFX9: s_buffer_load_dword -; GFX9: s_buffer_load_dword define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 { main_body: %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0) @@ -510,12 +513,15 @@ main_body: } ; GCN-LABEL: {{^}}smrd_load_nonconst4: -; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ; -; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0xff8, v0 ; -; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ; -; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ; -; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ; -; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ; +; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ; +; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ; +; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ; +; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ; +; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ; +; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ; +; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ; +; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ; +; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ; ; GCN: ; return to shader part epilog define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 { main_body: @@ -526,12 +532,16 @@ main_body: } ; GCN-LABEL: {{^}}smrd_load_nonconst5: -; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0x1004, v0 -; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x1004, v0 -; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ; -; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ; -; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ; -; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ; +; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0 +; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ; +; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ; +; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ; +; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ; +; VIGFX9: s_movk_i32 s4, 0xfc0 +; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ; +; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ; +; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ; +; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ; ; GCN: ; return to shader part epilog define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 { main_body: @@ -559,9 +569,10 @@ main_body: ; GCN-LABEL: {{^}}smrd_uniform_loop: ; -; TODO: this should use an s_buffer_load +; TODO: we should keep the loop counter in an SGPR ; -; GCN: buffer_load_dword +; GCN: v_readfirstlane_b32 +; GCN: s_buffer_load_dword define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 { main_body: br label %loop @@ -614,6 +625,68 @@ exit: ret float %sum.next } +; This test checks that the load after some control flow with an offset based +; on a divergent shader input is correctly recognized as divergent. This was +; reduced from an actual regression. Yes, the %unused argument matters, as +; well as the fact that %arg4 is a vector. +; +; GCN-LABEL: {{^}}arg_divergence: +; GCN: buffer_load_dword v0, v0, +; GCN-NEXT: s_waitcnt +; GCN-NEXT: ; return to shader part epilog +define amdgpu_cs float @arg_divergence(i32 inreg %unused, <3 x i32> %arg4) #0 { +main_body: + br i1 undef, label %if1, label %endif1 + +if1: ; preds = %main_body + store i32 0, i32 addrspace(3)* undef, align 4 + br label %endif1 + +endif1: ; preds = %if1, %main_body + %tmp13 = extractelement <3 x i32> %arg4, i32 0 + %tmp97 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 %tmp13) + ret float %tmp97 +} + +; GCN-LABEL: {{^}}s_buffer_load_f32: +; GCN: s_buffer_load_dword s0, s[0:3], s4 +define amdgpu_ps void @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { + %sgpr = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + call void asm sideeffect "; use $0", "s"(float %sgpr) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_v2f32: +; GCN: s_buffer_load_dwordx2 s[0:1], s[0:3], s4 +define amdgpu_ps void @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { + %sgpr = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %offset, i32 0) + call void asm sideeffect "; use $0", "s"(<2 x float> %sgpr) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_v4f32: +; GCN: s_buffer_load_dwordx4 s[0:3], s[0:3], s4 +define amdgpu_ps void @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { + %sgpr = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %offset, i32 0) + call void asm sideeffect "; use $0", "s"(<4 x float> %sgpr) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_v8f32: +; GCN: s_buffer_load_dwordx8 s[0:7], s[0:3], s4 +define amdgpu_ps void @s_buffer_load_v8f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { + %sgpr = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %offset, i32 0) + call void asm sideeffect "; use $0", "s"(<8 x float> %sgpr) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_v16f32: +; GCN: s_buffer_load_dwordx16 s[0:15], s[0:3], s4 +define amdgpu_ps void @s_buffer_load_v16f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { + %sgpr = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %offset, i32 0) + call void asm sideeffect "; use $0", "s"(<16 x float> %sgpr) + ret void +} declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 @@ -625,6 +698,12 @@ declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32) declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32) +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) +declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32) +declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) +declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32) +declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32) + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind readnone speculatable } diff --git a/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/test/CodeGen/AMDGPU/spill-empty-live-interval.mir index 32b4f158a011c31fc792444ab0ebc25f0e60e0b4..384042f5a6351f80ed1987bdbd0e68accb3fb501 100644 --- a/test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ b/test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa-opencl -verify-machineinstrs -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy -o - %s | FileCheck %s # https://bugs.llvm.org/show_bug.cgi?id=33620 --- @@ -10,7 +10,7 @@ # CHECK: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $exec # CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) # CHECK-NEXT: undef %5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec -# CHECK-NEXT: dead %2:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $exec +# CHECK-NEXT: dead %3:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $exec # CHECK: S_NOP 0, implicit %6.sub1 # CHECK-NEXT: %8:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) @@ -19,20 +19,16 @@ name: expecting_non_empty_interval tracksRegLiveness: true -registers: - - { id: 0, class: vreg_64, preferred-register: '' } - - { id: 1, class: vgpr_32, preferred-register: '' } - - { id: 2, class: vgpr_32, preferred-register: '' } - - { id: 3, class: vreg_64, preferred-register: '' } body: | bb.0: successors: %bb.1 - undef %0.sub1 = V_MAC_F32_e32 0, undef %1, undef %0.sub1, implicit $exec - undef %3.sub1 = V_MOV_B32_e32 1786773504, implicit $exec - dead %2 = V_MUL_F32_e32 0, %3.sub1, implicit $exec + + undef %0.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %0.sub1, implicit $exec + undef %2.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec + dead %3:vgpr_32 = V_MUL_F32_e32 0, %2.sub1, implicit $exec bb.1: - S_NOP 0, implicit %3.sub1 + S_NOP 0, implicit %2.sub1 S_NOP 0, implicit %0.sub1 S_NOP 0, implicit undef %0.sub0 @@ -44,29 +40,24 @@ body: | # CHECK-LABEL: name: rematerialize_empty_interval_has_reference # CHECK-NOT: MOV -# CHECK: undef %3.sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec +# CHECK: undef %1.sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec # CHECK: bb.1: -# CHECK-NEXT: S_NOP 0, implicit %3.sub2 -# CHECK-NEXT: S_NOP 0, implicit undef %6.sub0 -# CHECK-NEXT: undef %4.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec -# CHECK-NEXT: S_NOP 0, implicit %4.sub2 +# CHECK-NEXT: S_NOP 0, implicit %1.sub2 +# CHECK-NEXT: S_NOP 0, implicit undef %4.sub0 +# CHECK-NEXT: undef %2.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: S_NOP 0, implicit %2.sub2 name: rematerialize_empty_interval_has_reference tracksRegLiveness: true -registers: - - { id: 0, class: vreg_128, preferred-register: '' } - - { id: 1, class: vgpr_32, preferred-register: '' } - - { id: 2, class: vgpr_32, preferred-register: '' } - - { id: 3, class: vreg_128, preferred-register: '' } body: | bb.0: successors: %bb.1 - undef %0.sub2 = V_MOV_B32_e32 0, implicit $exec - undef %3.sub2 = V_MOV_B32_e32 1786773504, implicit $exec + undef %0.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec + undef %1.sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec bb.1: - S_NOP 0, implicit %3.sub2 + S_NOP 0, implicit %1.sub2 S_NOP 0, implicit undef %0.sub0 S_NOP 0, implicit %0.sub2 diff --git a/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir b/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir index 68c8b31a95f5578f7b12ea0fe3588d1b5f942631..559c0d438ef00e1f27cd1fcd80b48c399a65ce9d 100644 --- a/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir +++ b/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir @@ -17,18 +17,12 @@ name: no_merge_sgpr_vgpr_spill_slot tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32 } - - { id: 1, class: sreg_32_xm0_xexec } - - { id: 2, class: vgpr_32 } - - { id: 3, class: sreg_32_xm0_xexec } - body: | bb.0: - %0 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, implicit $flat_scr, implicit $exec - %2 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, implicit $flat_scr, implicit $exec + %0:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, implicit $flat_scr, implicit $exec + %2:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, implicit $flat_scr, implicit $exec S_NOP 0, implicit %0 - %1 = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0 - %3 = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0 + %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0 S_NOP 0, implicit %1 ... diff --git a/test/CodeGen/AMDGPU/store-global.ll b/test/CodeGen/AMDGPU/store-global.ll index 5040ab3bc29a9719349b6b38720951c82ac5bae2..50f6b81d7ac71cadbc229b1fd9b9fea9b15cf5a8 100644 --- a/test/CodeGen/AMDGPU/store-global.ll +++ b/test/CodeGen/AMDGPU/store-global.ll @@ -273,8 +273,7 @@ entry: } ; FUNC-LABEL: {{^}}store_v3i32: -; SIVI-DAG: buffer_store_dwordx2 -; SIVI-DAG: buffer_store_dword v +; SIVI-DAG: buffer_store_dwordx3 ; GFX9-DAG: global_store_dwordx2 ; GFX9-DAG: global_store_dword v diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll index da905b82ab4baf636243e2484a4875673a4aea82..534347f3fae96b0674e938d22151ce9044c5884d 100644 --- a/test/CodeGen/AMDGPU/store-v3i64.ll +++ b/test/CodeGen/AMDGPU/store-v3i64.ll @@ -89,8 +89,7 @@ define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* % } ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32: -; GCN-DAG: buffer_store_dwordx2 -; GCN-DAG: buffer_store_dword v +; GCN-DAG: buffer_store_dwordx3 define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i32> store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/store-weird-sizes.ll b/test/CodeGen/AMDGPU/store-weird-sizes.ll index 5bcf78f11ac5724075a365168b2e7f4edb148b04..13380e03e32f1ba5fc497b960262928b9e851101 100644 --- a/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -1,64 +1,235 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,HAWAII %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,FIJI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -; GCN-LABEL: {{^}}local_store_i56: -; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6 -; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4 -; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}} - -; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6 -; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4 -; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}} - - define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { +; CIVI-LABEL: local_store_i56: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 +; CIVI-NEXT: ds_write_b32 v0, v1 +; CIVI-NEXT: ds_write_b8 v0, v3 offset:6 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: local_store_i56: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i56 %arg, i56 addrspace(3)* %ptr, align 8 ret void } -; GCN-LABEL: {{^}}local_store_i55: -; CIVI-DAG: ds_write_b8 v{{[0-9]+}}, v{{[0-9]+}} offset:6 -; CIVI-DAG: ds_write_b16 v{{[0-9]+}}, v{{[0-9]+}} offset:4 -; CIVI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+$}} - -; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6 -; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4 -; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}} define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 { +; HAWAII-LABEL: local_store_i55: +; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_add_u32 s0, s4, 14 +; HAWAII-NEXT: s_addc_u32 s1, s5, 0 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v1, s0 +; HAWAII-NEXT: v_mov_b32_e32 v3, s1 +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 +; HAWAII-NEXT: ds_write_b32 v1, v3 +; HAWAII-NEXT: s_endpgm +; +; FIJI-LABEL: local_store_i55: +; FIJI: ; %bb.0: +; FIJI-NEXT: s_add_u32 s0, s4, 14 +; FIJI-NEXT: s_addc_u32 s1, s5, 0 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: flat_load_ubyte v0, v[0:1] +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v1, s0 +; FIJI-NEXT: v_mov_b32_e32 v3, s1 +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 +; FIJI-NEXT: ds_write_b32 v1, v3 +; FIJI-NEXT: s_endpgm +; +; GFX9-LABEL: local_store_i55: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: global_load_ubyte_d16_hi v0, v[0:1], off offset:14 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_write_b16 v1, v2 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0x7f0000, v0 +; GFX9-NEXT: ds_write_b8_d16_hi v1, v0 offset:6 +; GFX9-NEXT: ds_write_b32 v1, v3 +; GFX9-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void } -; GCN-LABEL: {{^}}local_store_i48: -; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4 -; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}} define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 { +; HAWAII-LABEL: local_store_i48: +; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: ds_write_b16 v0, v2 offset:4 +; HAWAII-NEXT: ds_write_b32 v0, v1 +; HAWAII-NEXT: s_endpgm +; +; FIJI-LABEL: local_store_i48: +; FIJI: ; %bb.0: +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: ds_write_b16 v0, v2 offset:4 +; FIJI-NEXT: ds_write_b32 v0, v1 +; FIJI-NEXT: s_endpgm +; +; GFX9-LABEL: local_store_i48: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: s_endpgm store i48 %arg, i48 addrspace(3)* %ptr, align 8 ret void } -; GCN-LABEL: {{^}}local_store_i65: -; GCN-DAG: ds_write_b8 v{{[0-9]+}}, v{{[0-9]+}} offset:8 -; GCN-DAG: ds_write_b64 define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 { +; HAWAII-LABEL: local_store_i65: +; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: s_and_b32 s0, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v3, s0 +; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 +; HAWAII-NEXT: ds_write_b64 v2, v[0:1] +; HAWAII-NEXT: s_endpgm +; +; FIJI-LABEL: local_store_i65: +; FIJI: ; %bb.0: +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: s_and_b32 s0, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v3, s0 +; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 +; FIJI-NEXT: ds_write_b64 v2, v[0:1] +; FIJI-NEXT: s_endpgm +; +; GFX9-LABEL: local_store_i65: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_and_b32 s0, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: ds_write_b8 v2, v3 offset:8 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm store i65 %arg, i65 addrspace(3)* %ptr, align 8 ret void } -; GCN-LABEL: {{^}}local_store_i13: -; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x1fff, v1 -; GCN: ds_write_b16 v0, [[TRUNC]] define void @local_store_i13(i13 addrspace(3)* %ptr, i13 %arg) #0 { +; CIVI-LABEL: local_store_i13: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b16 v0, v1 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: local_store_i13: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i13 %arg, i13 addrspace(3)* %ptr, align 8 ret void } -; GCN-LABEL: {{^}}local_store_i17: -; GCN: ds_write_b16 v0 -; CIVI: ds_write_b8 v0, v{{[0-9]+}} offset:2 -; GFX9: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:2 define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 { +; CIVI-LABEL: local_store_i17: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b16 v0, v1 +; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: local_store_i17: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void } diff --git a/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir index 4458a049618e723ea44da615544865c8b4d84ddf..26228dfb1c34e200d1cb2f3dd8f3a3fd439ee5fb 100644 --- a/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir +++ b/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir @@ -102,6 +102,13 @@ body: | bb.7: successors: %bb.13(0x80000000) + + ; In reality we are checking that this code doesn't assert when splitting + ; and inserting a spill. Here we just check that the point where the error + ; occurs we see a correctly generated spill. + ; GCN-LABEL: bb.7: + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec + undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 %15.sub2:vreg_128 = COPY %15.sub0 @@ -114,6 +121,10 @@ body: | bb.9: successors: %bb.12(0x80000000) + + ; GCN-LABEL: bb.9: + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec + undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 %15.sub2:vreg_128 = COPY %15.sub0 @@ -121,6 +132,10 @@ body: | bb.10: successors: %bb.12(0x80000000) + + ; GCN-LABEL: bb.10: + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec + undef %15.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 %15.sub2:vreg_128 = COPY %15.sub0 @@ -143,12 +158,6 @@ body: | bb.13: successors: %bb.15(0x40000000), %bb.14(0x40000000) - ; In reality we are checking that this code doesn't assert when splitting - ; and inserting a spill. Here we just check that the point where the error - ; occurs we see a correctly generated spill. - ; GCN-LABEL: bb.13: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec - %18:vgpr_32 = V_MAD_F32 0, %10.sub0, 0, target-flags(amdgpu-gotprel) 1073741824, 0, -1082130432, 0, 0, implicit $exec %19:vgpr_32 = V_MAD_F32 0, %12.sub0, 0, target-flags(amdgpu-gotprel) 0, 0, 0, 0, 0, implicit $exec %20:sreg_128 = S_BUFFER_LOAD_DWORDX4_IMM undef %21:sreg_128, 1040, 0 :: (dereferenceable invariant load 16) diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll index 0241490de7a5a51411a1adf87d0fa312c5835c1d..1c92f74ec4bdf6224e333d6db0701e96535d06a3 100644 --- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll +++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll @@ -51,8 +51,7 @@ define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x ; t21: v2i32,ch = load t12, t10, undef:i64 ; t23: i64 = bitcast t21 ; t30: i16 = truncate t23 -; SI: buffer_load_dword v[[VAL:[0-9]+]] -; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]] +; GCN: buffer_load_dword v[[VAL:[0-9]+]] ; GCN: buffer_store_short v[[VAL]], off define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/umed3.ll b/test/CodeGen/AMDGPU/umed3.ll index 071fc07ea597db0eb2d325f683c11d50020cfd3b..2f6685921df84d40444bffefb2bc9624b2393f7f 100644 --- a/test/CodeGen/AMDGPU/umed3.ll +++ b/test/CodeGen/AMDGPU/umed3.ll @@ -363,6 +363,212 @@ bb: ret void } +; 16 combinations + +; 16: min(max(x, y), max(min(x, y), z)) +; 17: min(max(x, y), max(min(y, x), z)) +; 18: min(max(x, y), max(z, min(x, y))) +; 19: min(max(x, y), max(z, min(y, x))) +; 20: min(max(y, x), max(min(x, y), z)) +; 21: min(max(y, x), max(min(y, x), z)) +; 22: min(max(y, x), max(z, min(x, y))) +; 23: min(max(y, x), max(z, min(y, x))) +; +; + commute outermost min + + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_16: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_16(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %x, i32 %y) + %tmp1 = call i32 @umax(i32 %x, i32 %y) + %tmp2 = call i32 @umax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_17: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_17(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %y, i32 %x) + %tmp1 = call i32 @umax(i32 %x, i32 %y) + %tmp2 = call i32 @umax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_18: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_18(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %x, i32 %y) + %tmp1 = call i32 @umax(i32 %x, i32 %y) + %tmp2 = call i32 @umax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_19: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_19(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %y, i32 %x) + %tmp1 = call i32 @umax(i32 %x, i32 %y) + %tmp2 = call i32 @umax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_20: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_20(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %x, i32 %y) + %tmp1 = call i32 @umax(i32 %y, i32 %x) + %tmp2 = call i32 @umax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_21: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_21(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %y, i32 %x) + %tmp1 = call i32 @umax(i32 %y, i32 %x) + %tmp2 = call i32 @umax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_22: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_22(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %x, i32 %y) + %tmp1 = call i32 @umax(i32 %y, i32 %x) + %tmp2 = call i32 @umax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_23: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_23(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %y, i32 %x) + %tmp1 = call i32 @umax(i32 %y, i32 %x) + %tmp2 = call i32 @umax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_24: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_24(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %x, i32 %y) + %tmp1 = call i32 @umax(i32 %x, i32 %y) + %tmp2 = call i32 @umax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_25: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_25(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %y, i32 %x) + %tmp1 = call i32 @umax(i32 %x, i32 %y) + %tmp2 = call i32 @umax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_26: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_26(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %x, i32 %y) + %tmp1 = call i32 @umax(i32 %x, i32 %y) + %tmp2 = call i32 @umax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_27: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_27(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %y, i32 %x) + %tmp1 = call i32 @umax(i32 %x, i32 %y) + %tmp2 = call i32 @umax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_28: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_28(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %x, i32 %y) + %tmp1 = call i32 @umax(i32 %y, i32 %x) + %tmp2 = call i32 @umax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_29: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_29(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %y, i32 %x) + %tmp1 = call i32 @umax(i32 %y, i32 %x) + %tmp2 = call i32 @umax(i32 %tmp0, i32 %z) + %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_30: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_30(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %x, i32 %y) + %tmp1 = call i32 @umax(i32 %y, i32 %x) + %tmp2 = call i32 @umax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}s_test_umed3_i32_pat_31: +; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_umed3_i32_pat_31(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +bb: + %tmp0 = call i32 @umin(i32 %y, i32 %x) + %tmp1 = call i32 @umax(i32 %y, i32 %x) + %tmp2 = call i32 @umax(i32 %z, i32 %tmp0) + %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1) + store i32 %tmp3, i32 addrspace(1)* %arg + ret void +} + ; GCN-LABEL: {{^}}s_test_umed3_i16_pat_0: ; GCN: s_and_b32 ; GCN: s_and_b32 @@ -510,6 +716,27 @@ bb: ret void } +; GCN-LABEL: {{^}}v_test_umed3_i16_pat_1: +; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_test_umed3_i16_pat_1(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid + %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3 + %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8 + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %x = load i16, i16 addrspace(1)* %gep0 + %y = load i16, i16 addrspace(1)* %gep1 + %z = load i16, i16 addrspace(1)* %gep2 + + %tmp0 = call i16 @umin16(i16 %x, i16 %y) + %tmp1 = call i16 @umax16(i16 %x, i16 %y) + %tmp2 = call i16 @umax16(i16 %tmp0, i16 %z) + %tmp3 = call i16 @umin16(i16 %tmp1, i16 %tmp2) + store i16 %tmp3, i16 addrspace(1)* %out.gep + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } attributes #2 = { nounwind readnone alwaysinline } diff --git a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 3dfc2a39cbcee90bdbaab3379248f43c5b600989..edcdf4cb62923503515ccb41845b2aff9bc9b14d 100644 --- a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -47,7 +47,7 @@ --- # CHECK-LABEL: name: vccz_corrupt_workaround # CHECK: $vcc = V_CMP_EQ_F32 -# CHECK-NEXT: S_WAITCNT 0 +# CHECK-NEXT: S_WAITCNT 127 # CHECK-NEXT: $vcc = S_MOV_B64 $vcc # CHECK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit killed $vcc diff --git a/test/CodeGen/AMDGPU/vector-extract-insert.ll b/test/CodeGen/AMDGPU/vector-extract-insert.ll index e3027d09309988246bd074f3c61b5146dbc2461d..5bb4159ee9f845e90ca2ccef8dfe5bb3d62da9fe 100644 --- a/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -27,8 +27,13 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %o ; GCN-LABEL: {{^}}extract_insert_different_dynelt_v4i32: ; GCN: buffer_load_dwordx4 -; GCN: v_movreld_b32 -; GCN: v_movrels_b32 +; GCN: v_cndmask_b32 +; GCN: v_cndmask_b32 +; GCN: v_cndmask_b32 +; GCN: v_cndmask_b32 +; GCN: v_cndmask_b32 +; GCN: v_cndmask_b32 +; GCN: v_cndmask_b32 ; GCN: buffer_store_dword v define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/verifier-pseudo-terminators.mir b/test/CodeGen/AMDGPU/verifier-pseudo-terminators.mir new file mode 100644 index 0000000000000000000000000000000000000000..d9e8aa71f32aebf95ada14bf04e0f76f85e85861 --- /dev/null +++ b/test/CodeGen/AMDGPU/verifier-pseudo-terminators.mir @@ -0,0 +1,23 @@ +# RUN: not llc -march=amdgcn -run-pass=verify %s 2>&1 | FileCheck %s +# Make sure that mismatched successors are caught when a _term +# instruction is used + +# CHECK: *** Bad machine code: MBB exits via unconditional branch but the CFG successor doesn't match the actual successor! *** + +--- +name: verifier_pseudo_terminators +body: | + bb.0: + successors: %bb.1 + + %0:sreg_64 = S_XOR_B64_term undef %1:sreg_64, undef %2:sreg_64, implicit-def $scc + $exec = S_MOV_B64_term %0 + S_BRANCH %bb.2 + + bb.1: + S_SETPC_B64_return undef $sgpr30_sgpr31 + + bb.2: + S_SETPC_B64_return undef $sgpr30_sgpr31 + +... diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index 32607c75e67db09d5c52dc0537e0276765f309ee..bcebc9f2d8b3c82c59bc3ff636d7af531169c901 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s -; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s -; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s +; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s +; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s ; This ends up using all 256 registers and requires register ; scavenging which will fail to find an unsued register. diff --git a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll index 8d66c346ed5b8314c64488ef5eb08f2817f77bfe..d6b3f035a3e3b97534adff38440efb033466c491 100644 --- a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll +++ b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll @@ -17,7 +17,7 @@ attributes #1 = { nounwind } !llvm.module.flags = !{!2, !3} !0 = distinct !DICompileUnit(language: DW_LANG_OpenCL, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) -!1 = !DIFile(filename: "foo.cl", directory: "/dev/null") +!1 = !DIFile(filename: "foo.cl", directory: ".") !2 = !{i32 2, !"Dwarf Version", i32 4} !3 = !{i32 2, !"Debug Info Version", i32 3} !4 = !DILocation(line: 1, column: 42, scope: !5) diff --git a/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir b/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir new file mode 100644 index 0000000000000000000000000000000000000000..b0ca67febbf8ac384d012b1251cd14f098bb9e87 --- /dev/null +++ b/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir @@ -0,0 +1,47 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN %s + +# GCN-LABEL: name: irreducible_loop{{$}} +# GCN: S_LOAD_DWORDX4_IMM +# GCN: S_WAITCNT 127{{$}} +# GCN: S_BUFFER_LOAD_DWORD_IMM +# GCN: S_WAITCNT 127{{$}} +# GCN: S_CMP_GE_I32 +--- | + + define amdgpu_ps void @irreducible_loop() { + main: + ret void + } + +... +--- +name: irreducible_loop +body: | + bb.0: + successors: %bb.3, %bb.2 + + S_CBRANCH_VCCZ %bb.2, implicit $vcc + S_BRANCH %bb.3 + + bb.1: + successors: %bb.3, %bb.2 + + S_CBRANCH_VCCNZ %bb.3, implicit $vcc + + bb.2: + successors: %bb.3 + + renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 + renamable $sgpr3 = S_BUFFER_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0 + + bb.3: + successors: %bb.1, %bb.4 + + S_CMP_GE_I32 renamable $sgpr2, renamable $sgpr3, implicit-def $scc + S_CBRANCH_SCC0 %bb.1, implicit killed $scc + + bb.4: + + S_ENDPGM + +... diff --git a/test/CodeGen/AMDGPU/waitcnt-preexisting.mir b/test/CodeGen/AMDGPU/waitcnt-preexisting.mir new file mode 100644 index 0000000000000000000000000000000000000000..211e8bce939353abc04c02cfce7e8e26db7ab37f --- /dev/null +++ b/test/CodeGen/AMDGPU/waitcnt-preexisting.mir @@ -0,0 +1,37 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN %s + +# GCN-LABEL: name: test{{$}} +# GCN: S_WAITCNT -16257 +# GGN: DS_READ2_B32 +# GGN: DS_READ2_B32 +# GCN: S_WAITCNT 383{{$}} +# GCN-NEXT: $vgpr1 = V_ADD_U32_e32 1, killed $vgpr1, implicit $exec +# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec +# GCN-NEXT: S_WAITCNT 127{{$}} +# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec +--- | + define amdgpu_cs void @test() { + ret void + } +... +--- +name: test +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0 + + renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 480, 0 + renamable $vgpr13 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec + S_WAITCNT -16257 + renamable $vgpr0_vgpr1 = DS_READ2_B32 renamable $vgpr13, 0, 1, 0, implicit $m0, implicit $exec + renamable $vgpr2_vgpr3 = DS_READ2_B32 renamable $vgpr13, 2, 3, 0, implicit $m0, implicit $exec + renamable $vgpr1 = V_ADD_U32_e32 1, killed $vgpr1, implicit $exec + renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec + renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr3, killed $vgpr1, implicit $exec + $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec + $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec + IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec + S_ENDPGM +... diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll index 81232df43d176ddd083073cfcc59d5315dccf077..3d1818368487d44978863c15a8575119eaa27fca 100644 --- a/test/CodeGen/AMDGPU/wqm.ll +++ b/test/CodeGen/AMDGPU/wqm.ll @@ -586,7 +586,8 @@ main_body: %data.0 = extractelement <2 x float> %data, i32 0 call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0) - call void @llvm.AMDGPU.kill(float %z) + %z.cmp = fcmp olt float %z, 0.0 + call void @llvm.amdgcn.kill(i1 %z.cmp) %idx.1 = extractelement <2 x i32> %idx, i32 1 %data.1 = extractelement <2 x float> %data, i32 1 @@ -619,7 +620,8 @@ main_body: call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) - call void @llvm.AMDGPU.kill(float %z) + %z.cmp = fcmp olt float %z, 0.0 + call void @llvm.amdgcn.kill(i1 %z.cmp) ret <4 x float> %dtex } @@ -826,7 +828,7 @@ declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i3 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 -declare void @llvm.AMDGPU.kill(float) #1 +declare void @llvm.amdgcn.kill(i1) #1 declare float @llvm.amdgcn.wqm.f32(float) #3 declare i32 @llvm.amdgcn.wqm.i32(i32) #3 declare float @llvm.amdgcn.wwm.f32(float) #3 diff --git a/test/CodeGen/AMDGPU/xnor.ll b/test/CodeGen/AMDGPU/xnor.ll index 0371cc68f040956f0ad41f5fc5a1fa2fa3c4fda6..716bcb5922e9a52d49ad62843d573ddeb34cd5d8 100644 --- a/test/CodeGen/AMDGPU/xnor.ll +++ b/test/CodeGen/AMDGPU/xnor.ll @@ -61,8 +61,8 @@ entry: ; GCN-LABEL: {{^}}vector_xnor_i32_one_use ; GCN-NOT: s_xnor_b32 -; GCN: v_xor_b32 ; GCN: v_not_b32 +; GCN: v_xor_b32 ; GCN-DL: v_xnor_b32 define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) { entry: @@ -73,10 +73,10 @@ entry: ; GCN-LABEL: {{^}}vector_xnor_i64_one_use ; GCN-NOT: s_xnor_b64 -; GCN: v_xor_b32 -; GCN: v_xor_b32 ; GCN: v_not_b32 ; GCN: v_not_b32 +; GCN: v_xor_b32 +; GCN: v_xor_b32 ; GCN-DL: v_xnor_b32 ; GCN-DL: v_xnor_b32 define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) { @@ -85,3 +85,114 @@ entry: %r = xor i64 %xor, -1 ret i64 %r } + +; GCN-LABEL: {{^}}xnor_s_v_i32_one_use +; GCN-NOT: s_xnor_b32 +; GCN: s_not_b32 +; GCN: v_xor_b32 +define amdgpu_kernel void @xnor_s_v_i32_one_use(i32 addrspace(1)* %out, i32 %s) { + %v = call i32 @llvm.amdgcn.workitem.id.x() #1 + %xor = xor i32 %s, %v + %d = xor i32 %xor, -1 + store i32 %d, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}xnor_v_s_i32_one_use +; GCN-NOT: s_xnor_b32 +; GCN: s_not_b32 +; GCN: v_xor_b32 +define amdgpu_kernel void @xnor_v_s_i32_one_use(i32 addrspace(1)* %out, i32 %s) { + %v = call i32 @llvm.amdgcn.workitem.id.x() #1 + %xor = xor i32 %v, %s + %d = xor i32 %xor, -1 + store i32 %d, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}xnor_i64_s_v_one_use +; GCN-NOT: s_xnor_b64 +; GCN: s_not_b64 +; GCN: v_xor_b32 +; GCN: v_xor_b32 +; GCN-DL: v_xnor_b32 +; GCN-DL: v_xnor_b32 +define amdgpu_kernel void @xnor_i64_s_v_one_use( + i64 addrspace(1)* %r0, i64 %a) { +entry: + %b32 = call i32 @llvm.amdgcn.workitem.id.x() #1 + %b64 = zext i32 %b32 to i64 + %b = shl i64 %b64, 29 + %xor = xor i64 %a, %b + %r0.val = xor i64 %xor, -1 + store i64 %r0.val, i64 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}xnor_i64_v_s_one_use +; GCN-NOT: s_xnor_b64 +; GCN: s_not_b64 +; GCN: v_xor_b32 +; GCN: v_xor_b32 +; GCN-DL: v_xnor_b32 +; GCN-DL: v_xnor_b32 +define amdgpu_kernel void @xnor_i64_v_s_one_use( + i64 addrspace(1)* %r0, i64 %a) { +entry: + %b32 = call i32 @llvm.amdgcn.workitem.id.x() #1 + %b64 = zext i32 %b32 to i64 + %b = shl i64 %b64, 29 + %xor = xor i64 %b, %a + %r0.val = xor i64 %xor, -1 + store i64 %r0.val, i64 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}vector_xor_na_b_i32_one_use +; GCN-NOT: s_xnor_b32 +; GCN: v_not_b32 +; GCN: v_xor_b32 +; GCN-DL: v_xnor_b32 +define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) { +entry: + %na = xor i32 %a, -1 + %r = xor i32 %na, %b + ret i32 %r +} + +; GCN-LABEL: {{^}}vector_xor_a_nb_i32_one_use +; GCN-NOT: s_xnor_b32 +; GCN: v_not_b32 +; GCN: v_xor_b32 +; GCN-DL: v_xnor_b32 +define i32 @vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) { +entry: + %nb = xor i32 %b, -1 + %r = xor i32 %a, %nb + ret i32 %r +} + +; GCN-LABEL: {{^}}scalar_xor_a_nb_i64_one_use +; GCN: s_xnor_b64 +define amdgpu_kernel void @scalar_xor_a_nb_i64_one_use( + i64 addrspace(1)* %r0, i64 %a, i64 %b) { +entry: + %nb = xor i64 %b, -1 + %r0.val = xor i64 %a, %nb + store i64 %r0.val, i64 addrspace(1)* %r0 + ret void +} + +; GCN-LABEL: {{^}}scalar_xor_na_b_i64_one_use +; GCN: s_xnor_b64 +define amdgpu_kernel void @scalar_xor_na_b_i64_one_use( + i64 addrspace(1)* %r0, i64 %a, i64 %b) { +entry: + %na = xor i64 %a, -1 + %r0.val = xor i64 %na, %b + store i64 %r0.val, i64 addrspace(1)* %r0 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/test/CodeGen/AMDGPU/xor_add.ll b/test/CodeGen/AMDGPU/xor_add.ll new file mode 100644 index 0000000000000000000000000000000000000000..1a13fc51d44a3fa6ecef31385e281642b5d15719 --- /dev/null +++ b/test/CodeGen/AMDGPU/xor_add.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s + +; =================================================================================== +; V_XAD_U32 +; =================================================================================== + +define amdgpu_ps float @xor_add(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: xor_add: +; VI: ; %bb.0: +; VI-NEXT: v_xor_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: xor_add: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_xad_u32 v0, v0, v1, v2 +; GFX9-NEXT: ; return to shader part epilog + %x = xor i32 %a, %b + %result = add i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +; ThreeOp instruction variant not used due to Constant Bus Limitations +define amdgpu_ps float @xor_add_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) { +; VI-LABEL: xor_add_vgpr_a: +; VI: ; %bb.0: +; VI-NEXT: v_xor_b32_e32 v0, s2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s3, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: xor_add_vgpr_a: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s3, v0 +; GFX9-NEXT: ; return to shader part epilog + %x = xor i32 %a, %b + %result = add i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @xor_add_vgpr_all(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: xor_add_vgpr_all: +; VI: ; %bb.0: +; VI-NEXT: v_xor_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: xor_add_vgpr_all: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_xad_u32 v0, v0, v1, v2 +; GFX9-NEXT: ; return to shader part epilog + %x = xor i32 %a, %b + %result = add i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @xor_add_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) { +; VI-LABEL: xor_add_vgpr_ab: +; VI: ; %bb.0: +; VI-NEXT: v_xor_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: xor_add_vgpr_ab: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_xad_u32 v0, v0, v1, s2 +; GFX9-NEXT: ; return to shader part epilog + %x = xor i32 %a, %b + %result = add i32 %x, %c + %bc = bitcast i32 %result to float + ret float %bc +} + +define amdgpu_ps float @xor_add_vgpr_const(i32 %a, i32 %b) { +; VI-LABEL: xor_add_vgpr_const: +; VI: ; %bb.0: +; VI-NEXT: v_xor_b32_e32 v0, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: xor_add_vgpr_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_xad_u32 v0, v0, 3, v1 +; GFX9-NEXT: ; return to shader part epilog + %x = xor i32 %a, 3 + %result = add i32 %x, %b + %bc = bitcast i32 %result to float + ret float %bc +} diff --git a/test/CodeGen/ARM/CGP/arm-cgp-calls.ll b/test/CodeGen/ARM/CGP/arm-cgp-calls.ll index 10cd6671ffc470844b8afdaee8665a874195775b..4c95ade534dc070a17f6e6b619f231e015d1360b 100644 --- a/test/CodeGen/ARM/CGP/arm-cgp-calls.ll +++ b/test/CodeGen/ARM/CGP/arm-cgp-calls.ll @@ -22,12 +22,10 @@ define i16 @test_call(i8 zeroext %arg) { ret i16 %conv } -; Test that the transformation bails when it finds that i16 is larger than i8. -; TODO: We should be able to remove the uxtb in these cases. ; CHECK-LABEL: promote_i8_sink_i16_1 ; CHECK: bl dummy_i8 ; CHECK: add{{.*}} r0, #1 -; CHECK: uxtb r0, r0 +; CHECK-NOT: uxt ; CHECK: cmp r0 define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroext %arg2) { %call = tail call zeroext i8 @dummy_i8(i8 %arg0) @@ -179,10 +177,46 @@ entry: ret i1 %tobool } +; CHECK-LABEL: i1_zeroext_call +; CHECK: uxt +define i1 @i1_zeroext_call(i16* %ts, i32 %a, i16* %b, i8* %c) { +entry: + %0 = load i16, i16* %ts, align 2 + %conv.i860 = trunc i32 %a to i16 + store i16 %conv.i860, i16* %b, align 2 + %call.i848 = call zeroext i1 @i1_zeroext(i8* %c, i32 64, i16 zeroext %conv.i860) + br i1 %call.i848, label %if.then223, label %if.else227 + +if.then223: + %cmp235 = icmp eq i16 %0, %conv.i860 + br label %exit + +if.else227: + %cmp236 = icmp ult i16 %0, %conv.i860 + br label %exit + +exit: + %retval = phi i1 [ %cmp235, %if.then223 ], [ %cmp236, %if.else227 ] + ret i1 %retval +} + +; CHECK-LABEL: promote_arg_pass_to_call +; CHECK-NOT: uxt +define i16 @promote_arg_pass_to_call(i16 zeroext %arg1, i16 zeroext %arg2) { + %conv = add nuw i16 %arg1, 15 + %mul = mul nuw nsw i16 %conv, 3 + %cmp = icmp ult i16 %mul, %arg2 + %trunc = trunc i16 %arg1 to i8 + %res = call zeroext i16 @dummy4(i1 %cmp, i8 %trunc, i16 %arg1) + ret i16 %res +} + + declare i32 @assert(...) declare i8 @dummy_i8(i8) declare i8 @dummy2(i8*, i8, i8) declare i16 @dummy3(i16) +declare i16 @dummy4(i1, i8, i16) declare dso_local i32 @e(...) local_unnamed_addr #1 declare dso_local zeroext i16 @f(...) local_unnamed_addr #1 @@ -193,3 +227,4 @@ declare fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %si2) declare dso_local fastcc i64 @safe_sub_func_int64_t_s_s(i64, i64) declare dso_local fastcc zeroext i8 @safe_lshift_func(i8 zeroext, i32) declare dso_local fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 returned zeroext) +declare i1 @i1_zeroext(i8*, i32, i16 zeroext) diff --git a/test/CodeGen/ARM/CGP/arm-cgp-casts.ll b/test/CodeGen/ARM/CGP/arm-cgp-casts.ll index 431846482c60c9622ef96250294841ed5296009f..273fea370a1f647dbf05e639b660bdb9228d1f24 100644 --- a/test/CodeGen/ARM/CGP/arm-cgp-casts.ll +++ b/test/CodeGen/ARM/CGP/arm-cgp-casts.ll @@ -104,9 +104,7 @@ entry: ; CHECK-COMMON-LABEL: or_icmp_ugt: ; CHECK-COMMON: ldrb -; CHECK-COMMON: sub.w -; CHECK-COMMON-NOT: uxt -; CHECK-COMMON: cmp.w +; CHECK-COMMON: subs.w ; CHECK-COMMON-NOT: uxt ; CHECK-COMMON: cmp define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) { @@ -122,36 +120,6 @@ entry: ret i1 %or } -; CHECK-COMMON-LABEL: icmp_switch_trunc: -; CHECK-COMMON-NOT: uxt -define i16 @icmp_switch_trunc(i16 zeroext %arg) { -entry: - %conv = add nuw i16 %arg, 15 - %mul = mul nuw nsw i16 %conv, 3 - %trunc = trunc i16 %arg to i3 - switch i3 %trunc, label %default [ - i3 0, label %sw.bb - i3 1, label %sw.bb.i - ] - -sw.bb: - %cmp0 = icmp ult i16 %mul, 127 - %select = select i1 %cmp0, i16 %mul, i16 127 - br label %exit - -sw.bb.i: - %cmp1 = icmp ugt i16 %mul, 34 - %select.i = select i1 %cmp1, i16 %mul, i16 34 - br label %exit - -default: - br label %exit - -exit: - %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ] - ret i16 %res -} - ; We currently only handle truncs as sinks, so a uxt will still be needed for ; the icmp ugt instruction. ; CHECK-COMMON-LABEL: urem_trunc_icmps @@ -187,47 +155,6 @@ exit: ret void } -; CHECK-COMMON-LABEL: phi_feeding_switch -; CHECK-COMMON: ldrb -; CHECK-COMMON: uxtb -define void @phi_feeding_switch(i8* %memblock, i8* %store, i16 %arg) { -entry: - %pre = load i8, i8* %memblock, align 1 - %conv = trunc i16 %arg to i8 - br label %header - -header: - %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ] - %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ] - %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ] - switch i8 %phi.0, label %default [ - i8 43, label %for.inc.i - i8 45, label %for.inc.i.i - ] - -for.inc.i: - %xor = xor i8 %phi.1, 1 - br label %latch - -for.inc.i.i: - %and = and i8 %phi.1, 3 - br label %latch - -default: - %sub = sub i8 %phi.0, 1 - %cmp2 = icmp ugt i8 %sub, 4 - br i1 %cmp2, label %latch, label %exit - -latch: - %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ] - %count = add nuw i8 %phi.2, 1 - store i8 %count, i8* %store, align 1 - br label %header - -exit: - ret void -} - ; Check that %exp requires uxth in all cases, and will also be required to ; promote %1 for the call - unless we can generate a uadd16. ; CHECK-COMMON-LABEL: zext_load_sink_call: @@ -254,40 +181,6 @@ exit: ret i32 %exitval } -%class.ae = type { i8 } -%class.x = type { i8 } -%class.v = type { %class.q } -%class.q = type { i16 } - -; CHECK-COMMON-LABEL: trunc_i16_i9_switch -; CHECK-COMMON-NOT: uxt -define i32 @trunc_i16_i9_switch(%class.ae* %this) { -entry: - %call = tail call %class.x* @_ZNK2ae2afEv(%class.ae* %this) - %call2 = tail call %class.v* @_ZN1x2acEv(%class.x* %call) - %0 = getelementptr inbounds %class.v, %class.v* %call2, i32 0, i32 0, i32 0 - %1 = load i16, i16* %0, align 2 - %2 = trunc i16 %1 to i9 - %trunc = and i9 %2, -64 - switch i9 %trunc, label %cleanup.fold.split [ - i9 0, label %cleanup - i9 -256, label %if.then7 - ] - -if.then7: - %3 = and i16 %1, 7 - %tobool = icmp eq i16 %3, 0 - %cond = select i1 %tobool, i32 2, i32 1 - br label %cleanup - -cleanup.fold.split: - br label %cleanup - -cleanup: - %retval.0 = phi i32 [ %cond, %if.then7 ], [ 0, %entry ], [ 2, %cleanup.fold.split ] - ret i32 %retval.0 -} - ; CHECK-COMMON-LABEL: bitcast_i16 ; CHECK-COMMON-NOT: uxt define i16 @bitcast_i16(i16 zeroext %arg0, i16 zeroext %arg1) { @@ -332,16 +225,14 @@ entry: ret i8 %res } -declare %class.x* @_ZNK2ae2afEv(%class.ae*) local_unnamed_addr -declare %class.v* @_ZN1x2acEv(%class.x*) local_unnamed_addr declare i32 @dummy(i32, i32) @d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1 @sh1 = hidden local_unnamed_addr global i16 0, align 2 @d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2 -; CHECK-LABEL: two_stage_zext_trunc_mix -; CHECK-NOT: uxt +; CHECK-COMMON-LABEL: two_stage_zext_trunc_mix +; CHECK-COMMON-NOT: uxt define i8* @two_stage_zext_trunc_mix(i32* %this, i32 %__pos1, i32 %__n1, i32** %__str, i32 %__pos2, i32 %__n2) { entry: %__size_.i.i.i.i = bitcast i32** %__str to i8* @@ -363,3 +254,337 @@ entry: %res = select i1 %tobool.i.i.i.i.i, i8* %7, i8* %8 ret i8* %res } + +; CHECK-COMMON-LABEL: search_through_zext_1 +; CHECK-COMMON-NOT: uxt +define i8 @search_through_zext_1(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c) { +entry: + %add = add nuw i8 %a, %b + %conv = zext i8 %add to i16 + %cmp = icmp ult i16 %conv, %c + br i1 %cmp, label %if.then, label %if.end + +if.then: + %sub = sub nuw i8 %b, %a + %conv2 = zext i8 %sub to i16 + %cmp2 = icmp ugt i16 %conv2, %c + %res = select i1 %cmp2, i8 %a, i8 %b + br label %if.end + +if.end: + %retval = phi i8 [ 0, %entry ], [ %res, %if.then ] + ret i8 %retval +} + +; TODO: We should be able to remove the uxtb here. The transform fails because +; the icmp ugt uses an i32, which is too large... but this doesn't matter +; because it won't be writing a large value to a register as a result. +; CHECK-COMMON-LABEL: search_through_zext_2 +; CHECK-COMMON: uxtb +; CHECK-COMMON: uxtb +define i8 @search_through_zext_2(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c, i32 %d) { +entry: + %add = add nuw i8 %a, %b + %conv = zext i8 %add to i16 + %cmp = icmp ult i16 %conv, %c + br i1 %cmp, label %if.then, label %if.end + +if.then: + %sub = sub nuw i8 %b, %a + %conv2 = zext i8 %sub to i32 + %cmp2 = icmp ugt i32 %conv2, %d + %res = select i1 %cmp2, i8 %a, i8 %b + br label %if.end + +if.end: + %retval = phi i8 [ 0, %entry ], [ %res, %if.then ] + ret i8 %retval +} + +; TODO: We should be able to remove the uxtb here as all the calculations are +; performed on i8s. The promotion of i8 to i16 and then the later truncation +; results in the uxtb. +; CHECK-COMMON-LABEL: search_through_zext_3 +; CHECK-COMMON: uxtb +; CHECK-COMMON: uxtb +define i8 @search_through_zext_3(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c, i32 %d) { +entry: + %add = add nuw i8 %a, %b + %conv = zext i8 %add to i16 + %cmp = icmp ult i16 %conv, %c + br i1 %cmp, label %if.then, label %if.end + +if.then: + %trunc = trunc i16 %conv to i8 + %sub = sub nuw i8 %b, %trunc + %conv2 = zext i8 %sub to i32 + %cmp2 = icmp ugt i32 %conv2, %d + %res = select i1 %cmp2, i8 %a, i8 %b + br label %if.end + +if.end: + %retval = phi i8 [ 0, %entry ], [ %res, %if.then ] + ret i8 %retval +} + +; TODO: We should be able to remove the uxt that gets introduced for %conv2 +; CHECK-COMMON-LABEL: search_through_zext_cmp +; CHECK-COMMON: uxt +define i8 @search_through_zext_cmp(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c) { +entry: + %cmp = icmp ne i8 %a, %b + %conv = zext i1 %cmp to i16 + %cmp1 = icmp ult i16 %conv, %c + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %sub = sub nuw i8 %b, %a + %conv2 = zext i8 %sub to i16 + %cmp3 = icmp ugt i16 %conv2, %c + %res = select i1 %cmp3, i8 %a, i8 %b + br label %if.end + +if.end: + %retval = phi i8 [ 0, %entry ], [ %res, %if.then ] + ret i8 %retval +} + +; CHECK-COMMON-LABEL: search_through_zext_load +; CHECK-COMMON-NOT: uxt +define i8 @search_through_zext_load(i8* %a, i8 zeroext %b, i16 zeroext %c) { +entry: + %load = load i8, i8* %a + %conv = zext i8 %load to i16 + %cmp1 = icmp ult i16 %conv, %c + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %sub = sub nuw i8 %b, %load + %conv2 = zext i8 %sub to i16 + %cmp3 = icmp ugt i16 %conv2, %c + %res = select i1 %cmp3, i8 %load, i8 %b + br label %if.end + +if.end: + %retval = phi i8 [ 0, %entry ], [ %res, %if.then ] + ret i8 %retval +} + +; CHECK-COMMON-LABEL: trunc_sink_less_than +; CHECK-COMMON-NOT: uxth +; CHECK-COMMON: cmp +; CHECK-COMMON: uxtb +define i16 @trunc_sink_less_than_cmp(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d) { +entry: + %sub = sub nuw i16 %b, %a + %cmp = icmp ult i16 %sub, %c + br i1 %cmp, label %if.then, label %if.end + +if.then: + %trunc = trunc i16 %sub to i8 + %add = add nuw i8 %d, 1 + %cmp2 = icmp ugt i8 %trunc, %add + %res = select i1 %cmp2, i16 %a, i16 %b + br label %if.end + +if.end: + %retval = phi i16 [ 0, %entry ], [ %res, %if.then ] + ret i16 %retval +} + +; TODO: We should be able to remove the uxth introduced to handle %sub +; CHECK-COMMON-LABEL: trunc_sink_less_than_arith +; CHECK-COMMON: uxth +; CHECK-COMMON: cmp +; CHECK-COMMON: uxtb +define i16 @trunc_sink_less_than_arith(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) { +entry: + %sub = sub nuw i16 %b, %a + %cmp = icmp ult i16 %sub, %c + br i1 %cmp, label %if.then, label %if.end + +if.then: + %trunc = trunc i16 %sub to i8 + %add = add nuw i8 %d, %trunc + %cmp2 = icmp ugt i8 %e, %add + %res = select i1 %cmp2, i16 %a, i16 %b + br label %if.end + +if.end: + %retval = phi i16 [ 0, %entry ], [ %res, %if.then ] + ret i16 %retval +} + +; CHECK-COMMON-LABEL: trunc_sink_less_than_store +; CHECK-COMMON-NOT: uxt +; CHECK-COMMON: cmp +; CHECK-COMMON-NOT: uxt +define i16 @trunc_sink_less_than_store(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8* %e) { +entry: + %sub = sub nuw i16 %b, %a + %cmp = icmp ult i16 %sub, %c + br i1 %cmp, label %if.then, label %if.end + +if.then: + %trunc = trunc i16 %sub to i8 + %add = add nuw i8 %d, %trunc + store i8 %add, i8* %e + br label %if.end + +if.end: + %retval = phi i16 [ 0, %entry ], [ %sub, %if.then ] + ret i16 %retval +} + +; CHECK-COMMON-LABEL: trunc_sink_less_than_ret +; CHECK-COMMON-NOT: uxt +define i8 @trunc_sink_less_than_ret(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) { +entry: + %sub = sub nuw i16 %b, %a + %cmp = icmp ult i16 %sub, %c + br i1 %cmp, label %if.then, label %if.end + +if.then: + %trunc = trunc i16 %sub to i8 + %add = add nuw i8 %d, %trunc + br label %if.end + +if.end: + %retval = phi i8 [ 0, %entry ], [ %add, %if.then ] + ret i8 %retval +} + +; CHECK-COMMON-LABEL: trunc_sink_less_than_zext_ret +; CHECK-COMMON-NOT: uxth +; CHECK-COMMON: sub +; CHECK-COMMON: uxtb +define zeroext i8 @trunc_sink_less_than_zext_ret(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) { +entry: + %sub = sub nuw i16 %b, %a + %cmp = icmp ult i16 %sub, %c + br i1 %cmp, label %if.then, label %if.end + +if.then: + %trunc = trunc i16 %sub to i8 + %add = add nuw i8 %d, %trunc + br label %if.end + +if.end: + %retval = phi i8 [ 0, %entry ], [ %add, %if.then ] + ret i8 %retval +} + +; CHECK-COMMON-LABEL: bitcast_i1 +; CHECK-COMMON-NOT: uxt +define i32 @bitcast_i1(i16 zeroext %a, i32 %b, i32 %c) { +entry: + %0 = bitcast i1 1 to i1 + %1 = trunc i16 %a to i1 + %cmp = icmp eq i1 %1, %0 + br i1 %cmp, label %if.then, label %exit + +if.then: + %conv = zext i1 %0 to i16 + %conv1 = zext i1 %1 to i16 + %cmp1 = icmp uge i16 %conv, %conv1 + %select = select i1 %cmp1, i32 %b, i32 %c + br label %exit + +exit: + %retval = phi i32 [ %select, %if.then ], [ 0, %entry ] + ret i32 %retval +} + +; CHECK-COMMON-LABEL: search_back_through_trunc +; CHECK-COMMON-NOT: uxt +; CHECK-COMMON: cmp +; CHECK-COMMON: strb +; CHECK-COMMON: strb +define void @search_back_through_trunc(i8* %a, i8* %b, i8* %c, i8* %d, i16* %e) { +entry: + %0 = load i8, i8* %a, align 1 + %conv106 = zext i8 %0 to i16 + %shl = shl nuw i16 %conv106, 8 + %1 = load i8, i8* %b, align 1 + %conv108 = zext i8 %1 to i16 + %or109 = or i16 %shl, %conv108 + %2 = load i8, i8* %c, align 1 + %conv119 = zext i8 %2 to i16 + %shl120 = shl nuw i16 %conv119, 8 + %3 = load i8, i8* %d, align 1 + %conv122 = zext i8 %3 to i16 + %or123 = or i16 %shl120, %conv122 + %cmp133 = icmp eq i16 %or109, %or123 + br i1 %cmp133, label %if.end183, label %if.else136 + +if.else136: + %4 = load i16, i16* %e, align 2 + %extract.t854 = trunc i16 %4 to i8 + %extract856 = lshr i16 %4, 8 + %extract.t857 = trunc i16 %extract856 to i8 + br label %if.end183 + +if.end183: + %w.0.off0 = phi i8 [ %extract.t854, %if.else136 ], [ %1, %entry ] + %w.0.off8 = phi i8 [ %extract.t857, %if.else136 ], [ %2, %entry ] + store i8 %w.0.off8, i8* %c, align 1 + store i8 %w.0.off0, i8* %d, align 1 + ret void +} + +@c = common dso_local local_unnamed_addr global i16 0, align 2 +@b = common dso_local local_unnamed_addr global i16 0, align 2 +@f = common dso_local local_unnamed_addr global i32 0, align 4 +@e = common dso_local local_unnamed_addr global i8 0, align 1 +@a = common dso_local local_unnamed_addr global i8 0, align 1 +@d = common dso_local local_unnamed_addr global i32 0, align 4 + +; CHECK-LABEL: and_trunc +; CHECK: ldrh +; CHECK: sxth +; CHECK: uxtb +define void @and_trunc_two_zext() { +entry: + %0 = load i16, i16* @c, align 2 + %1 = load i16, i16* @b, align 2 + %conv = sext i16 %1 to i32 + store i32 %conv, i32* @f, align 4 + %2 = trunc i16 %1 to i8 + %conv1 = and i8 %2, 1 + store i8 %conv1, i8* @e, align 1 + %3 = load i8, i8* @a, align 1 + %narrow = mul nuw i8 %3, %conv1 + %mul = zext i8 %narrow to i32 + store i32 %mul, i32* @d, align 4 + %4 = zext i8 %narrow to i16 + %conv5 = or i16 %0, %4 + %tobool = icmp eq i16 %conv5, 0 + br i1 %tobool, label %if.end, label %for.cond + +for.cond: + br label %for.cond + +if.end: + ret void +} + +; CHECK-LABEL: zext_urem_trunc +; CHECK-NOT: uxt +define void @zext_urem_trunc() { +entry: + %0 = load i16, i16* @c, align 2 + %cmp = icmp eq i16 %0, 0 + %1 = load i8, i8* @e, align 1 + br i1 %cmp, label %cond.end, label %cond.false + +cond.false: + %rem.lhs.trunc = zext i8 %1 to i16 + %rem7 = urem i16 %rem.lhs.trunc, %0 + %rem.zext = trunc i16 %rem7 to i8 + br label %cond.end + +cond.end: + %cond = phi i8 [ %rem.zext, %cond.false ], [ %1, %entry ] + store i8 %cond, i8* @a, align 1 + ret void +} diff --git a/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll b/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll index 65588b84f7ccc6cab0e80c2759d33f919658f380..e7adc5ae2491b2517c5109cd670081e13f63bb59 100644 --- a/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll +++ b/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll @@ -1,7 +1,7 @@ -; RUN: llc -mtriple=thumbv7m -arm-disable-cgp=false %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP -; RUN: llc -mtriple=thumbv8m.main -arm-disable-cgp=false %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP -; RUN: llc -mtriple=thumbv8m.main -arm-disable-cgp=false -arm-enable-scalar-dsp=true -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP -; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM +; RUN: llc -mtriple=thumbv7m -arm-disable-cgp=false %s -o - | FileCheck %s --check-prefix=CHECK-COMMON +; RUN: llc -mtriple=thumbv8m.main -arm-disable-cgp=false %s -o - | FileCheck %s --check-prefix=CHECK-COMMON +; RUN: llc -mtriple=thumbv8m.main -arm-disable-cgp=false -arm-enable-scalar-dsp=true -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON +; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON ; Test that ARMCodeGenPrepare can handle: ; - loops @@ -172,3 +172,15 @@ if.end: exit: ret i16 %unrelated } + +; CHECK-COMMON-LABEL: promote_arg_return +; CHECK-COMMON-NOT: uxt +; CHECK-COMMON: strb +define i16 @promote_arg_return(i16 zeroext %arg1, i16 zeroext %arg2, i8* %res) { + %add = add nuw i16 %arg1, 15 + %mul = mul nuw nsw i16 %add, 3 + %cmp = icmp ult i16 %mul, %arg2 + %conv = zext i1 %cmp to i8 + store i8 %conv, i8* %res + ret i16 %arg1 +} diff --git a/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll b/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll index 85a10ba80809b491eb7bf9ae8f9b6f25a8446257..98794f500d4c9bb76d6c8ed4482a311657be7f97 100644 --- a/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll +++ b/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll @@ -11,18 +11,17 @@ ; CHECK-NODSP: sxtb ; CHECK-NODSP: cmp -; CHECK-DSP: add -; CHECK-DSP: uxtb +; CHECK-DSP: uadd8 +; CHECK-DSP: sub ; CHECK-DSP: cmp ; CHECK-DSP: sxtb -; CHECK-DSP: sub ; CHECK-DSP: sxtb ; CHECK-DSP: cmp ; CHECK-DSP-IMM: uadd8 [[ADD:r[0-9]+]], ; CHECK-DSP-IMM: cmp [[ADD]], +; CHECK-DSP-IMM: subs [[SUB:r[0-9]+]], ; CHECK-DSP-IMM: sxtb [[SEXT0:r[0-9]+]], [[ADD]] -; CHECK-DSP-IMM: usub8 [[SUB:r[0-9]+]], ; CHECK-DSP-IMM: sxtb [[SEXT1:r[0-9]+]], [[SUB]] ; CHECK-DSP-IMM: cmp [[SEXT1]], [[SEXT0]] define i8 @eq_sgt(i8* %x, i8 *%y, i8 zeroext %z) { diff --git a/test/CodeGen/ARM/CGP/arm-cgp-switch.ll b/test/CodeGen/ARM/CGP/arm-cgp-switch.ll new file mode 100644 index 0000000000000000000000000000000000000000..29c35fbc96e001da000d00d392328ff718adacb4 --- /dev/null +++ b/test/CodeGen/ARM/CGP/arm-cgp-switch.ll @@ -0,0 +1,168 @@ +; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7-linux-android %s -arm-disable-cgp=false -o - | FileCheck %s + +; CHECK-LABEL: truncate_source_phi_switch +; CHECK: ldrb +; CHECK: uxtb +define void @truncate_source_phi_switch(i8* %memblock, i8* %store, i16 %arg) { +entry: + %pre = load i8, i8* %memblock, align 1 + %conv = trunc i16 %arg to i8 + br label %header + +header: + %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ] + %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ] + %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ] + switch i8 %phi.0, label %default [ + i8 43, label %for.inc.i + i8 45, label %for.inc.i.i + ] + +for.inc.i: + %xor = xor i8 %phi.1, 1 + br label %latch + +for.inc.i.i: + %and = and i8 %phi.1, 3 + br label %latch + +default: + %sub = sub i8 %phi.0, 1 + %cmp2 = icmp ugt i8 %sub, 4 + br i1 %cmp2, label %latch, label %exit + +latch: + %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ] + %count = add nuw i8 %phi.2, 1 + store i8 %count, i8* %store, align 1 + br label %header + +exit: + ret void +} + +; CHECK-LABEL: icmp_switch_source: +; CHECK-NOT: uxt +define i16 @icmp_switch_source(i16 zeroext %arg) { +entry: + %conv = add nuw i16 %arg, 15 + %mul = mul nuw nsw i16 %conv, 3 + switch i16 %arg, label %default [ + i16 0, label %sw.bb + i16 1, label %sw.bb.i + ] + +sw.bb: + %cmp0 = icmp ult i16 %mul, 127 + %select = select i1 %cmp0, i16 %mul, i16 127 + br label %exit + +sw.bb.i: + %cmp1 = icmp ugt i16 %mul, 34 + %select.i = select i1 %cmp1, i16 %mul, i16 34 + br label %exit + +default: + br label %exit + +exit: + %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ] + ret i16 %res +} + +; CHECK-LABEL: icmp_switch_narrow_source: +; CHECK-NOT: uxt +define i16 @icmp_switch_narrow_source(i8 zeroext %arg) { +entry: + %conv = zext i8 %arg to i16 + %add = add nuw i16 %conv, 15 + %mul = mul nuw nsw i16 %add, 3 + switch i8 %arg, label %default [ + i8 0, label %sw.bb + i8 1, label %sw.bb.i + ] + +sw.bb: + %cmp0 = icmp ult i16 %mul, 127 + %select = select i1 %cmp0, i16 %mul, i16 127 + br label %exit + +sw.bb.i: + %cmp1 = icmp ugt i16 %mul, 34 + %select.i = select i1 %cmp1, i16 %mul, i16 34 + br label %exit + +default: + br label %exit + +exit: + %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ] + ret i16 %res +} + +; CHECK-LABEL: icmp_switch_trunc: +; CHECK-NOT: uxt +define i16 @icmp_switch_trunc(i16 zeroext %arg) { +entry: + %conv = add nuw i16 %arg, 15 + %mul = mul nuw nsw i16 %conv, 3 + %trunc = trunc i16 %arg to i3 + switch i3 %trunc, label %default [ + i3 0, label %sw.bb + i3 1, label %sw.bb.i + ] + +sw.bb: + %cmp0 = icmp ult i16 %mul, 127 + %select = select i1 %cmp0, i16 %mul, i16 127 + br label %exit + +sw.bb.i: + %cmp1 = icmp ugt i16 %mul, 34 + %select.i = select i1 %cmp1, i16 %mul, i16 34 + br label %exit + +default: + br label %exit + +exit: + %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ] + ret i16 %res +} + +%class.ae = type { i8 } +%class.x = type { i8 } +%class.v = type { %class.q } +%class.q = type { i16 } +declare %class.x* @_ZNK2ae2afEv(%class.ae*) local_unnamed_addr +declare %class.v* @_ZN1x2acEv(%class.x*) local_unnamed_addr + +; CHECK-LABEL: trunc_i16_i9_switch +; CHECK-NOT: uxt +define i32 @trunc_i16_i9_switch(%class.ae* %this) { +entry: + %call = tail call %class.x* @_ZNK2ae2afEv(%class.ae* %this) + %call2 = tail call %class.v* @_ZN1x2acEv(%class.x* %call) + %0 = getelementptr inbounds %class.v, %class.v* %call2, i32 0, i32 0, i32 0 + %1 = load i16, i16* %0, align 2 + %2 = trunc i16 %1 to i9 + %trunc = and i9 %2, -64 + switch i9 %trunc, label %cleanup.fold.split [ + i9 0, label %cleanup + i9 -256, label %if.then7 + ] + +if.then7: + %3 = and i16 %1, 7 + %tobool = icmp eq i16 %3, 0 + %cond = select i1 %tobool, i32 2, i32 1 + br label %cleanup + +cleanup.fold.split: + br label %cleanup + +cleanup: + %retval.0 = phi i32 [ %cond, %if.then7 ], [ 0, %entry ], [ 2, %cleanup.fold.split ] + ret i32 %retval.0 +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-call-lowering.ll b/test/CodeGen/ARM/GlobalISel/arm-call-lowering.ll index d667f1756eb083d2b9f5306201fc7944d9452b95..84602a3c4738c05e619853eff4a322f791a76d3c 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-call-lowering.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-call-lowering.ll @@ -1,13 +1,16 @@ -; RUN: llc -mtriple arm-unknown -mattr=-v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,NOV4T -; RUN: llc -mtriple arm-unknown -mattr=+v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,V4T -; RUN: llc -mtriple arm-unknown -mattr=+v5t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,V5T +; RUN: llc -mtriple arm-unknown -mattr=-v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,NOV4T,ARM +; RUN: llc -mtriple arm-unknown -mattr=+v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,V4T,ARM +; RUN: llc -mtriple arm-unknown -mattr=+v5t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,V5T,ARM +; RUN: llc -mtriple thumb-unknown -mattr=+v6t2 -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,THUMB define arm_aapcscc void @test_indirect_call(void() *%fptr) { ; CHECK-LABEL: name: test_indirect_call +; THUMB: %[[FPTR:[0-9]+]]:gpr(p0) = COPY $r0 ; V5T: %[[FPTR:[0-9]+]]:gpr(p0) = COPY $r0 ; V4T: %[[FPTR:[0-9]+]]:tgpr(p0) = COPY $r0 ; NOV4T: %[[FPTR:[0-9]+]]:tgpr(p0) = COPY $r0 ; CHECK: ADJCALLSTACKDOWN 0, 0, 14, $noreg, implicit-def $sp, implicit $sp +; THUMB: tBLXr 14, $noreg, %[[FPTR]](p0), csr_aapcs, implicit-def $lr, implicit $sp ; V5T: BLX %[[FPTR]](p0), csr_aapcs, implicit-def $lr, implicit $sp ; V4T: BX_CALL %[[FPTR]](p0), csr_aapcs, implicit-def $lr, implicit $sp ; NOV4T: BMOVPCRX_CALL %[[FPTR]](p0), csr_aapcs, implicit-def $lr, implicit $sp @@ -22,7 +25,8 @@ declare arm_aapcscc void @call_target() define arm_aapcscc void @test_direct_call() { ; CHECK-LABEL: name: test_direct_call ; CHECK: ADJCALLSTACKDOWN 0, 0, 14, $noreg, implicit-def $sp, implicit $sp -; CHECK: BL @call_target, csr_aapcs, implicit-def $lr, implicit $sp +; THUMB: tBL 14, $noreg, @call_target, csr_aapcs, implicit-def $lr, implicit $sp +; ARM: BL @call_target, csr_aapcs, implicit-def $lr, implicit $sp ; CHECK: ADJCALLSTACKUP 0, 0, 14, $noreg, implicit-def $sp, implicit $sp entry: notail call arm_aapcscc void @call_target() diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir index b361023067de68049ab33e8f825e340a14645ac4..d3e9796b72f5282270359acbc532cc2584fee714 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir @@ -42,6 +42,9 @@ define void @test_vfnmss() #4 { ret void } + define void @test_bfc() #2 { ret void } + define void @test_no_bfc_bad_mask() #2 { ret void } + attributes #0 = { "target-features"="+v6" } attributes #1 = { "target-features"="-v6" } attributes #2 = { "target-features"="+v6t2" } @@ -1142,3 +1145,57 @@ body: | BX_RET 14, $noreg, implicit $s0 ; CHECK: BX_RET 14, $noreg, implicit $s0 ... +--- +name: test_bfc +# CHECK-LABEL: name: test_bfc +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + %1(s32) = G_CONSTANT i32 -65529 ; 0xFFFF0007 + %2(s32) = G_AND %0, %1 + ; CHECK: [[RS:%[0-9]+]]:gpr = COPY $r0 + ; CHECK: [[RD:%[0-9]+]]:gpr = BFC [[RS]], -65529, 14, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[RD]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_no_bfc_bad_mask +# CHECK-LABEL: name: test_no_bfc_bad_mask +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + %1(s32) = G_CONSTANT i32 6 ; 0x00000006 + %2(s32) = G_AND %0, %1 + ; CHECK: [[RS:%[0-9]+]]:gpr = COPY $r0 + ; CHECK-NOT: BFC + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[RD]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll index 95c454cdc1545beffb6827e258fdbddfecbcd664..175673740cddc73252be7c88e8b09029d5de9d7b 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll @@ -440,7 +440,7 @@ define i32 @test_shufflevector_s32_v2s32(i32 %arg) { ; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $r0 ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 -; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32) +; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C0]](s32), [[C0]](s32) ; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], [[MASK]](<2 x s32>) ; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>) %vec = insertelement <1 x i32> undef, i32 %arg, i32 0 @@ -456,7 +456,7 @@ define i32 @test_shufflevector_v2s32_v3s32(i32 %arg1, i32 %arg2) { ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 -; CHECK-DAG: [[MASK:%[0-9]+]]:_(<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32), [[C1]](s32) +; CHECK-DAG: [[MASK:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C0]](s32), [[C1]](s32) ; CHECK-DAG: [[V1:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32) ; CHECK-DAG: [[V2:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32) ; CHECK: [[VEC:%[0-9]+]]:_(<3 x s32>) = G_SHUFFLE_VECTOR [[V2]](<2 x s32>), [[UNDEF]], [[MASK]](<3 x s32>) @@ -476,7 +476,7 @@ define i32 @test_shufflevector_v2s32_v4s32(i32 %arg1, i32 %arg2) { ; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF ; CHECK-DAG: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 -; CHECK-DAG: [[MASK:%[0-9]+]]:_(<4 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32), [[C0]](s32), [[C0]](s32) +; CHECK-DAG: [[MASK:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C0]](s32), [[C0]](s32), [[C0]](s32), [[C0]](s32) ; CHECK-DAG: [[V1:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32) ; CHECK-DAG: [[V2:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32) ; CHECK: [[VEC:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[V2]](<2 x s32>), [[UNDEF]], [[MASK]](<4 x s32>) @@ -499,7 +499,7 @@ define i32 @test_shufflevector_v4s32_v2s32(i32 %arg1, i32 %arg2, i32 %arg3, i32 ; CHECK-DAG: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-DAG: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK-DAG: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 -; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C3]](s32) +; CHECK-DAG: [[MASK:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C3]](s32) ; CHECK-DAG: [[V1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32) ; CHECK-DAG: [[V2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32) ; CHECK-DAG: [[V3:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[V2]], [[ARG3]](s32), [[C2]](s32) @@ -521,7 +521,7 @@ define i32 @test_constantstruct_v2s32() { ; CHECK-LABEL: name: test_constantstruct_v2s32 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 -; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32) +; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32) ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>), [[C3]] %vec = extractvalue %struct.v2s32 {<2 x i32>}, 0 @@ -535,7 +535,7 @@ define i32 @test_constantstruct_v2s32_s32_s32() { ; CHECK-LABEL: name: test_constantstruct_v2s32_s32_s32 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 -; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32) +; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32) ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-binops.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-binops.mir new file mode 100644 index 0000000000000000000000000000000000000000..51783caee3c66948147adc9e079568995e240b19 --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-binops.mir @@ -0,0 +1,561 @@ +# RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple thumb-- -mattr=+v6t2 -run-pass=legalizer %s -o - | FileCheck %s +--- | + define void @test_add_s8() { ret void } + define void @test_add_s16() { ret void } + define void @test_add_s32() { ret void } + + define void @test_sub_s8() { ret void } + define void @test_sub_s16() { ret void } + define void @test_sub_s32() { ret void } + + define void @test_mul_s8() { ret void } + define void @test_mul_s16() { ret void } + define void @test_mul_s32() { ret void } + + define void @test_and_s8() { ret void } + define void @test_and_s16() { ret void } + define void @test_and_s32() { ret void } + + define void @test_or_s8() { ret void } + define void @test_or_s16() { ret void } + define void @test_or_s32() { ret void } + + define void @test_xor_s8() { ret void } + define void @test_xor_s16() { ret void } + define void @test_xor_s32() { ret void } +... +--- +name: test_add_s8 +# CHECK-LABEL: name: test_add_s8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s8) = G_LOAD %0 :: (load 1) + %2(p0) = COPY $r0 + %3(s8) = G_LOAD %2 :: (load 1) + %4(s8) = G_ADD %1, %3 + ; G_ADD with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_ADD {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_ADD {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s8) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_add_s16 +# CHECK-LABEL: name: test_add_s16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s16) = G_LOAD %0 :: (load 2) + %2(p0) = COPY $r0 + %3(s16) = G_LOAD %2 :: (load 2) + %4(s16) = G_ADD %1, %3 + ; G_ADD with s16 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_ADD {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_ADD {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s16) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_add_s32 +# CHECK-LABEL: name: test_add_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(s32) = COPY $r0 + %1(s32) = COPY $r1 + %2(s32) = G_ADD %0, %1 + ; G_ADD with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}} + $r0 = COPY %2(s32) + BX_RET 14, $noreg, implicit $r0 + +... +--- +name: test_sub_s8 +# CHECK-LABEL: name: test_sub_s8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s8) = G_LOAD %0 :: (load 1) + %2(p0) = COPY $r0 + %3(s8) = G_LOAD %2 :: (load 1) + %4(s8) = G_SUB %1, %3 + ; G_SUB with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_SUB {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_SUB {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s8) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_sub_s16 +# CHECK-LABEL: name: test_sub_s16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s16) = G_LOAD %0 :: (load 2) + %2(p0) = COPY $r0 + %3(s16) = G_LOAD %2 :: (load 2) + %4(s16) = G_SUB %1, %3 + ; G_SUB with s16 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_SUB {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_SUB {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s16) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_sub_s32 +# CHECK-LABEL: name: test_sub_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(s32) = COPY $r0 + %1(s32) = COPY $r1 + %2(s32) = G_SUB %0, %1 + ; G_SUB with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}} + $r0 = COPY %2(s32) + BX_RET 14, $noreg, implicit $r0 + +... +--- +name: test_mul_s8 +# CHECK-LABEL: name: test_mul_s8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s8) = G_LOAD %0 :: (load 1) + %2(p0) = COPY $r0 + %3(s8) = G_LOAD %2 :: (load 1) + %4(s8) = G_MUL %1, %3 + ; G_MUL with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_MUL {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_MUL {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s8) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_mul_s16 +# CHECK-LABEL: name: test_mul_s16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s16) = G_LOAD %0 :: (load 2) + %2(p0) = COPY $r0 + %3(s16) = G_LOAD %2 :: (load 2) + %4(s16) = G_MUL %1, %3 + ; G_MUL with s16 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_MUL {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_MUL {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s16) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_mul_s32 +# CHECK-LABEL: name: test_mul_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(s32) = COPY $r0 + %1(s32) = COPY $r1 + %2(s32) = G_MUL %0, %1 + ; G_MUL with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}} + $r0 = COPY %2(s32) + BX_RET 14, $noreg, implicit $r0 + +... +--- +name: test_and_s8 +# CHECK-LABEL: name: test_and_s8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s8) = G_LOAD %0 :: (load 1) + %2(p0) = COPY $r0 + %3(s8) = G_LOAD %2 :: (load 1) + %4(s8) = G_AND %1, %3 + ; G_AND with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_AND {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_AND {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s8) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_and_s16 +# CHECK-LABEL: name: test_and_s16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s16) = G_LOAD %0 :: (load 2) + %2(p0) = COPY $r0 + %3(s16) = G_LOAD %2 :: (load 2) + %4(s16) = G_AND %1, %3 + ; G_AND with s16 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_AND {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_AND {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s16) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_and_s32 +# CHECK-LABEL: name: test_and_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(s32) = COPY $r0 + %1(s32) = COPY $r1 + %2(s32) = G_AND %0, %1 + ; G_AND with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}} + $r0 = COPY %2(s32) + BX_RET 14, $noreg, implicit $r0 + +... +--- +name: test_or_s8 +# CHECK-LABEL: name: test_or_s8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s8) = G_LOAD %0 :: (load 1) + %2(p0) = COPY $r0 + %3(s8) = G_LOAD %2 :: (load 1) + %4(s8) = G_OR %1, %3 + ; G_OR with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_OR {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_OR {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s8) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_or_s16 +# CHECK-LABEL: name: test_or_s16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s16) = G_LOAD %0 :: (load 2) + %2(p0) = COPY $r0 + %3(s16) = G_LOAD %2 :: (load 2) + %4(s16) = G_OR %1, %3 + ; G_OR with s16 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_OR {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_OR {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s16) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_or_s32 +# CHECK-LABEL: name: test_or_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(s32) = COPY $r0 + %1(s32) = COPY $r1 + %2(s32) = G_OR %0, %1 + ; G_OR with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}} + $r0 = COPY %2(s32) + BX_RET 14, $noreg, implicit $r0 + +... +--- +name: test_xor_s8 +# CHECK-LABEL: name: test_xor_s8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s8) = G_LOAD %0 :: (load 1) + %2(p0) = COPY $r0 + %3(s8) = G_LOAD %2 :: (load 1) + %4(s8) = G_XOR %1, %3 + ; G_XOR with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_XOR {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_XOR {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s8) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_xor_s16 +# CHECK-LABEL: name: test_xor_s16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(p0) = COPY $r0 + %1(s16) = G_LOAD %0 :: (load 2) + %2(p0) = COPY $r0 + %3(s16) = G_LOAD %2 :: (load 2) + %4(s16) = G_XOR %1, %3 + ; G_XOR with s16 should widen + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_XOR {{%[0-9]+, %[0-9]+}} + ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_XOR {{%[0-9]+, %[0-9]+}} + %5(s32) = G_SEXT %4(s16) + $r0 = COPY %5(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_xor_s32 +# CHECK-LABEL: name: test_xor_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: $r0, $r1 + + %0(s32) = COPY $r0 + %1(s32) = COPY $r1 + %2(s32) = G_XOR %0, %1 + ; G_XOR with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}} + $r0 = COPY %2(s32) + BX_RET 14, $noreg, implicit $r0 + +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir new file mode 100644 index 0000000000000000000000000000000000000000..c9323eedced43e8ab5d91f7c3761296933cb8a6a --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir @@ -0,0 +1,177 @@ +# RUN: llc -mtriple arm-linux-gnueabi -mattr=+v5t -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,CLZ +# RUN: llc -mtriple arm-linux-gnueabi -mattr=-v5t -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,LIBCALLS +--- | + define void @test_ctlz_s32() { ret void } + define void @test_ctlz_zero_undef_s32() { ret void } + + ; same as above but with extensions + define void @test_ctlz_s16() { ret void } + define void @test_ctlz_zero_undef_s8() { ret void } +... +--- +name: test_ctlz_s32 +# CHECK-LABEL: name: test_ctlz_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: $r0 + + ; CHECK: [[X:%[0-9]+]]:_(s32) = COPY $r0 + %0(s32) = COPY $r0 + + ; CLZ: [[R:%[0-9]+]]:_(s32) = G_CTLZ [[X]] + ; LIBCALLS-NOT: G_CTLZ + ; LIBCALLS: ADJCALLSTACKDOWN + ; LIBCALLS: $r0 = COPY [[X]] + ; LIBCALLS: BL &__clzsi2, {{.*}}, implicit $r0, implicit-def $r0 + ; LIBCALLS: [[COUNT:%[0-9]+]]:_(s32) = COPY $r0 + ; LIBCALLS: ADJCALLSTACKUP + ; LIBCALLS-NOT: G_CTLZ + ; LIBCALLS: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; LIBCALLS: [[BITS:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; LIBCALLS: [[CMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[X]](s32), [[ZERO]] + ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = G_SELECT [[CMP]](s1), [[BITS]], [[COUNT]] + ; LIBCALLS-NOT: G_CTLZ + %1(s32) = G_CTLZ %0 + + ; CHECK: $r0 = COPY [[R]] + $r0 = COPY %1(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_ctlz_zero_undef_s32 +# CHECK-LABEL: name: test_ctlz_zero_undef_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: $r0 + + ; CHECK: [[X:%[0-9]+]]:_(s32) = COPY $r0 + %0(s32) = COPY $r0 + + ; CLZ: [[R:%[0-9]+]]:_(s32) = G_CTLZ [[X]] + ; LIBCALLS-NOT: G_CTLZ + ; LIBCALLS: ADJCALLSTACKDOWN + ; LIBCALLS: $r0 = COPY [[X]] + ; LIBCALLS: BL &__clzsi2, {{.*}}, implicit $r0, implicit-def $r0 + ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = COPY $r0 + ; LIBCALLS: ADJCALLSTACKUP + ; LIBCALLS-NOT: G_CTLZ + %1(s32) = G_CTLZ_ZERO_UNDEF %0 + + ; CHECK: $r0 = COPY [[R]] + $r0 = COPY %1(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_ctlz_s16 +# CHECK-LABEL: name: test_ctlz_s16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: $r0 + + ; CHECK: [[X:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK: [[BITMASK:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[XAGAIN:%[0-9]+]]:_(s32) = COPY [[X]] + ; CHECK: [[X32:%[0-9]+]]:_(s32) = G_AND [[XAGAIN]], [[BITMASK]] + %0(s32) = COPY $r0 + %1(s16) = G_TRUNC %0(s32) + + ; Check that the operation is performed for 32 bits + ; CLZ: [[COUNT:%[0-9]+]]:_(s32) = G_CTLZ [[X32]] + ; LIBCALLS-NOT: G_CTLZ + ; LIBCALLS: ADJCALLSTACKDOWN + ; LIBCALLS: $r0 = COPY [[X32]] + ; LIBCALLS: BL &__clzsi2, {{.*}}, implicit $r0, implicit-def $r0 + ; LIBCALLS: [[UNDEFCOUNT:%[0-9]+]]:_(s32) = COPY $r0 + ; LIBCALLS: ADJCALLSTACKUP + ; LIBCALLS-NOT: G_CTLZ + ; LIBCALLS: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; LIBCALLS: [[BITS:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; LIBCALLS: [[CMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), {{%[0-9]+}}(s32), [[ZERO]] + ; LIBCALLS: [[COUNT:%[0-9]+]]:_(s32) = G_SELECT [[CMP]](s1), [[BITS]], [[UNDEFCOUNT]] + ; LIBCALLS-NOT: G_CTLZ + ; CHECK: [[BITDIFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SUB [[COUNT]], [[BITDIFF]] + %2(s16) = G_CTLZ %1 + + ; CHECK: [[BITDIFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[RAGAIN:%[0-9]+]]:_(s32) = COPY [[R32]] + ; CHECK: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[RAGAIN]], [[BITDIFF]] + ; CHECK: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]] + ; CHECK: $r0 = COPY [[R]] + %3(s32) = G_SEXT %2(s16) + $r0 = COPY %3(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_ctlz_zero_undef_s8 +# CHECK-LABEL: name: test_ctlz_zero_undef_s8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: $r0 + + ; CHECK: [[X:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK: [[BITMASK:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[XAGAIN:%[0-9]+]]:_(s32) = COPY [[X]] + ; CHECK: [[X32:%[0-9]+]]:_(s32) = G_AND [[XAGAIN]], [[BITMASK]] + %0(s32) = COPY $r0 + %1(s8) = G_TRUNC %0(s32) + + ; Check that the operation is performed for 32 bits + ; CLZ: [[COUNT:%[0-9]+]]:_(s32) = G_CTLZ + ; CLZ-NOT: G_CTLZ_ZERO_UNDEF + ; LIBCALLS-NOT: G_CTLZ + ; LIBCALLS: ADJCALLSTACKDOWN + ; LIBCALLS: $r0 = COPY [[X32]] + ; LIBCALLS: BL &__clzsi2, {{.*}}, implicit $r0, implicit-def $r0 + ; LIBCALLS: [[COUNT:%[0-9]+]]:_(s32) = COPY $r0 + ; LIBCALLS: ADJCALLSTACKUP + ; LIBCALLS-NOT: G_CTLZ + ; CHECK: [[BITDIFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SUB [[COUNT]], [[BITDIFF]] + %2(s8) = G_CTLZ_ZERO_UNDEF %1 + + ; CHECK: [[BITDIFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[RAGAIN:%[0-9]+]]:_(s32) = COPY [[R32]] + ; CHECK: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[RAGAIN]], [[BITDIFF]] + ; CHECK: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]] + ; CHECK: $r0 = COPY [[R]] + %3(s32) = G_SEXT %2(s8) + $r0 = COPY %3(s32) + BX_RET 14, $noreg, implicit $r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-casts.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-casts.mir new file mode 100644 index 0000000000000000000000000000000000000000..c45bc3d5385d7573ecdfc171e8a34b566234fdf7 --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-casts.mir @@ -0,0 +1,50 @@ +# RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple thumb-- -mattr=+v6t2 -run-pass=legalizer %s -o - | FileCheck %s +--- | + define void @test_inttoptr_s32() { ret void } + define void @test_ptrtoint_s32() { ret void } +... +--- +name: test_inttoptr_s32 +# CHECK-LABEL: name: test_inttoptr_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + %1(p0) = G_INTTOPTR %0(s32) + ; G_INTTOPTR with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(p0) = G_INTTOPTR {{%[0-9]+}} + $r0 = COPY %1(p0) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_ptrtoint_s32 +# CHECK-LABEL: name: test_ptrtoint_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: $r0 + + %0(p0) = COPY $r0 + %1(s32) = G_PTRTOINT %0(p0) + ; G_PTRTOINT with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(s32) = G_PTRTOINT {{%[0-9]+}} + $r0 = COPY %1(s32) + BX_RET 14, $noreg, implicit $r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-consts.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-consts.mir new file mode 100644 index 0000000000000000000000000000000000000000..d3248fe63a024864bb4f33ccc9cc650b7b651218 --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-consts.mir @@ -0,0 +1,57 @@ +# RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple thumb-- -mattr=+v6t2 -run-pass=legalizer %s -o - | FileCheck %s +--- | + define void @test_constants() { ret void } +... +--- +name: test_constants +# CHECK-LABEL: name: test_constants +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +body: | + bb.0: + liveins: $r0 + + %4(p0) = COPY $r0 + + %0(s32) = G_CONSTANT 42 + ; CHECK: {{%[0-9]+}}:_(s32) = G_CONSTANT 42 + + %1(s16) = G_CONSTANT i16 21 + G_STORE %1(s16), %4(p0) :: (store 2) + ; CHECK-NOT: G_CONSTANT i16 + ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 21 + ; CHECK: {{%[0-9]+}}:_(s16) = G_TRUNC [[EXT]](s32) + ; CHECK-NOT: G_CONSTANT i16 + + %2(s8) = G_CONSTANT i8 10 + G_STORE %2(s8), %4(p0) :: (store 1) + ; CHECK-NOT: G_CONSTANT i8 + ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK: {{%[0-9]+}}:_(s8) = G_TRUNC [[EXT]](s32) + ; CHECK-NOT: G_CONSTANT i8 + + %3(s1) = G_CONSTANT i1 1 + G_STORE %3(s1), %4(p0) :: (store 1) + ; CHECK-NOT: G_CONSTANT i1 + ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK: {{%[0-9]+}}:_(s1) = G_TRUNC [[EXT]](s32) + ; CHECK-NOT: G_CONSTANT i1 + + %5(p0) = G_CONSTANT 0 + G_STORE %5(p0), %4(p0) :: (store 4) + ; CHECK: {{%[0-9]+}}:_(p0) = G_CONSTANT 0 + + $r0 = COPY %0(s32) + BX_RET 14, $noreg, implicit $r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-exts.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-exts.mir new file mode 100644 index 0000000000000000000000000000000000000000..672b42671a154d0bbcc4b718ca1d7d2a3055656e --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-exts.mir @@ -0,0 +1,79 @@ +# RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple thumb-- -mattr=+v6t2 -run-pass=legalizer %s -o - | FileCheck %s +--- | + define void @test_zext_s16() { ret void } + define void @test_sext_s8() { ret void } + define void @test_anyext_s1() { ret void } +... +--- +name: test_zext_s16 +# CHECK-LABEL: name: test_zext_s16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: $r0 + + %0(p0) = COPY $r0 + %1(s16) = G_LOAD %0 :: (load 2) + %2(s32) = G_ZEXT %1 + ; G_ZEXT with s16 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(s32) = G_ZEXT {{%[0-9]+}} + $r0 = COPY %2(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_sext_s8 +# CHECK-LABEL: name: test_sext_s8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: $r0 + + %0(p0) = COPY $r0 + %1(s8) = G_LOAD %0(p0) :: (load 1) + %2(s32) = G_SEXT %1 + ; G_SEXT with s8 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(s32) = G_SEXT {{%[0-9]+}} + $r0 = COPY %2(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_anyext_s1 +# CHECK-LABEL: name: test_anyext_s1 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: $r0 + + %0(p0) = COPY $r0 + %1(s1) = G_LOAD %0(p0) :: (load 1) + %2(s32) = G_ANYEXT %1 + ; G_ANYEXT with s1 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(s32) = G_ANYEXT {{%[0-9]+}} + $r0 = COPY %2(s32) + BX_RET 14, $noreg, implicit $r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-load-store.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-load-store.mir new file mode 100644 index 0000000000000000000000000000000000000000..a955af9a97ffbe27fc3f54c88c7578d3d0e70f33 --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-load-store.mir @@ -0,0 +1,49 @@ +# RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple thumb-- -mattr=+v6t2 -run-pass=legalizer %s -o - | FileCheck %s +--- | + define void @test_legal_loads_stores() { ret void } +... +--- +name: test_legal_loads_stores +# CHECK-LABEL: name: test_legal_loads_stores +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } +body: | + bb.0: + liveins: $r0 + + ; These are all legal, so we should find them unchanged in the output + ; CHECK-DAG: G_STORE {{%[0-9]+}}(s32), %0(p0) + ; CHECK-DAG: G_STORE {{%[0-9]+}}(s16), %0(p0) + ; CHECK-DAG: G_STORE {{%[0-9]+}}(s8), %0(p0) + ; CHECK-DAG: G_STORE {{%[0-9]+}}(s1), %0(p0) + ; CHECK-DAG: G_STORE {{%[0-9]+}}(p0), %0(p0) + ; CHECK-DAG: {{%[0-9]+}}:_(s32) = G_LOAD %0(p0) + ; CHECK-DAG: {{%[0-9]+}}:_(s16) = G_LOAD %0(p0) + ; CHECK-DAG: {{%[0-9]+}}:_(s8) = G_LOAD %0(p0) + ; CHECK-DAG: {{%[0-9]+}}:_(s1) = G_LOAD %0(p0) + ; CHECK-DAG: {{%[0-9]+}}:_(p0) = G_LOAD %0(p0) + %0(p0) = COPY $r0 + %2(s32) = G_LOAD %0(p0) :: (load 4) + G_STORE %2(s32), %0(p0) :: (store 4) + %3(s16) = G_LOAD %0(p0) :: (load 2) + G_STORE %3(s16), %0(p0) :: (store 2) + %4(s8) = G_LOAD %0(p0) :: (load 1) + G_STORE %4(s8), %0(p0) :: (store 1) + %5(s1) = G_LOAD %0(p0) :: (load 1) + G_STORE %5(s1), %0(p0) :: (store 1) + %6(p0) = G_LOAD %0(p0) :: (load 4) + G_STORE %6(p0), %0(p0) :: (store 4) + BX_RET 14, $noreg +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index 4d61593b47a16b07d830e38b8b3a3c010107d7e2..bee8b7bbbabd2b801c2ae01c8357f4fec3ab56f0 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -1,45 +1,15 @@ # RUN: llc -mtriple arm-- -run-pass=legalizer %s -o - | FileCheck %s --- | - define void @test_sext_s8() { ret void } - define void @test_zext_s16() { ret void } - - define void @test_inttoptr_s32() { ret void } - define void @test_ptrtoint_s32() { ret void } - - define void @test_add_s8() { ret void } - define void @test_add_s16() { ret void } - define void @test_add_s32() { ret void } - - define void @test_sub_s8() { ret void } - define void @test_sub_s16() { ret void } - define void @test_sub_s32() { ret void } - - define void @test_mul_s8() { ret void } - define void @test_mul_s16() { ret void } - define void @test_mul_s32() { ret void } - - define void @test_and_s8() { ret void } - define void @test_and_s16() { ret void } - define void @test_and_s32() { ret void } - - define void @test_or_s8() { ret void } - define void @test_or_s16() { ret void } - define void @test_or_s32() { ret void } - - define void @test_xor_s8() { ret void } - define void @test_xor_s16() { ret void } - define void @test_xor_s32() { ret void } - define void @test_lshr_s32() { ret void } define void @test_ashr_s32() { ret void } define void @test_shl_s32() { ret void } define void @test_load_from_stack() { ret void } - define void @test_legal_loads_stores() #0 { ret void } + define void @test_load_store_64() #0 { ret void } define void @test_gep() { ret void } - define void @test_constants() { ret void } + define void @test_constants_s64() { ret void } define void @test_icmp_s8() { ret void } define void @test_icmp_s16() { ret void } @@ -61,632 +31,6 @@ attributes #0 = { "target-features"="+vfp2" } ... --- -name: test_sext_s8 -# CHECK-LABEL: name: test_sext_s8 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } -body: | - bb.0: - liveins: $r0 - - %0(p0) = COPY $r0 - %1(s8) = G_LOAD %0(p0) :: (load 1) - %2(s32) = G_SEXT %1 - ; G_SEXT with s8 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}:_(s32) = G_SEXT {{%[0-9]+}} - $r0 = COPY %2(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_zext_s16 -# CHECK-LABEL: name: test_zext_s16 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } -body: | - bb.0: - liveins: $r0 - - %0(p0) = COPY $r0 - %1(s16) = G_LOAD %0 :: (load 2) - %2(s32) = G_ZEXT %1 - ; G_ZEXT with s16 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}:_(s32) = G_ZEXT {{%[0-9]+}} - $r0 = COPY %2(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_inttoptr_s32 -# CHECK-LABEL: name: test_inttoptr_s32 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } -body: | - bb.0: - liveins: $r0 - - %0(s32) = COPY $r0 - %1(p0) = G_INTTOPTR %0(s32) - ; G_INTTOPTR with s32 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}:_(p0) = G_INTTOPTR {{%[0-9]+}} - $r0 = COPY %1(p0) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_ptrtoint_s32 -# CHECK-LABEL: name: test_ptrtoint_s32 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } -body: | - bb.0: - liveins: $r0 - - %0(p0) = COPY $r0 - %1(s32) = G_PTRTOINT %0(p0) - ; G_PTRTOINT with s32 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}:_(s32) = G_PTRTOINT {{%[0-9]+}} - $r0 = COPY %1(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_add_s8 -# CHECK-LABEL: name: test_add_s8 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s8) = G_LOAD %0 :: (load 1) - %2(p0) = COPY $r0 - %3(s8) = G_LOAD %2 :: (load 1) - %4(s8) = G_ADD %1, %3 - ; G_ADD with s8 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_ADD {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_ADD {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s8) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_add_s16 -# CHECK-LABEL: name: test_add_s16 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s16) = G_LOAD %0 :: (load 2) - %2(p0) = COPY $r0 - %3(s16) = G_LOAD %2 :: (load 2) - %4(s16) = G_ADD %1, %3 - ; G_ADD with s16 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_ADD {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_ADD {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s16) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_add_s32 -# CHECK-LABEL: name: test_add_s32 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(s32) = COPY $r0 - %1(s32) = COPY $r1 - %2(s32) = G_ADD %0, %1 - ; G_ADD with s32 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}:_(s32) = G_ADD {{%[0-9]+, %[0-9]+}} - $r0 = COPY %2(s32) - BX_RET 14, $noreg, implicit $r0 - -... ---- -name: test_sub_s8 -# CHECK-LABEL: name: test_sub_s8 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s8) = G_LOAD %0 :: (load 1) - %2(p0) = COPY $r0 - %3(s8) = G_LOAD %2 :: (load 1) - %4(s8) = G_SUB %1, %3 - ; G_SUB with s8 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_SUB {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_SUB {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s8) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_sub_s16 -# CHECK-LABEL: name: test_sub_s16 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s16) = G_LOAD %0 :: (load 2) - %2(p0) = COPY $r0 - %3(s16) = G_LOAD %2 :: (load 2) - %4(s16) = G_SUB %1, %3 - ; G_SUB with s16 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_SUB {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_SUB {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s16) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_sub_s32 -# CHECK-LABEL: name: test_sub_s32 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(s32) = COPY $r0 - %1(s32) = COPY $r1 - %2(s32) = G_SUB %0, %1 - ; G_SUB with s32 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}:_(s32) = G_SUB {{%[0-9]+, %[0-9]+}} - $r0 = COPY %2(s32) - BX_RET 14, $noreg, implicit $r0 - -... ---- -name: test_mul_s8 -# CHECK-LABEL: name: test_mul_s8 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s8) = G_LOAD %0 :: (load 1) - %2(p0) = COPY $r0 - %3(s8) = G_LOAD %2 :: (load 1) - %4(s8) = G_MUL %1, %3 - ; G_MUL with s8 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_MUL {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_MUL {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s8) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_mul_s16 -# CHECK-LABEL: name: test_mul_s16 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s16) = G_LOAD %0 :: (load 2) - %2(p0) = COPY $r0 - %3(s16) = G_LOAD %2 :: (load 2) - %4(s16) = G_MUL %1, %3 - ; G_MUL with s16 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_MUL {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_MUL {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s16) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_mul_s32 -# CHECK-LABEL: name: test_mul_s32 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(s32) = COPY $r0 - %1(s32) = COPY $r1 - %2(s32) = G_MUL %0, %1 - ; G_MUL with s32 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}:_(s32) = G_MUL {{%[0-9]+, %[0-9]+}} - $r0 = COPY %2(s32) - BX_RET 14, $noreg, implicit $r0 - -... ---- -name: test_and_s8 -# CHECK-LABEL: name: test_and_s8 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s8) = G_LOAD %0 :: (load 1) - %2(p0) = COPY $r0 - %3(s8) = G_LOAD %2 :: (load 1) - %4(s8) = G_AND %1, %3 - ; G_AND with s8 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_AND {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_AND {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s8) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_and_s16 -# CHECK-LABEL: name: test_and_s16 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s16) = G_LOAD %0 :: (load 2) - %2(p0) = COPY $r0 - %3(s16) = G_LOAD %2 :: (load 2) - %4(s16) = G_AND %1, %3 - ; G_AND with s16 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_AND {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_AND {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s16) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_and_s32 -# CHECK-LABEL: name: test_and_s32 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(s32) = COPY $r0 - %1(s32) = COPY $r1 - %2(s32) = G_AND %0, %1 - ; G_AND with s32 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}:_(s32) = G_AND {{%[0-9]+, %[0-9]+}} - $r0 = COPY %2(s32) - BX_RET 14, $noreg, implicit $r0 - -... ---- -name: test_or_s8 -# CHECK-LABEL: name: test_or_s8 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s8) = G_LOAD %0 :: (load 1) - %2(p0) = COPY $r0 - %3(s8) = G_LOAD %2 :: (load 1) - %4(s8) = G_OR %1, %3 - ; G_OR with s8 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_OR {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_OR {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s8) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_or_s16 -# CHECK-LABEL: name: test_or_s16 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s16) = G_LOAD %0 :: (load 2) - %2(p0) = COPY $r0 - %3(s16) = G_LOAD %2 :: (load 2) - %4(s16) = G_OR %1, %3 - ; G_OR with s16 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_OR {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_OR {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s16) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_or_s32 -# CHECK-LABEL: name: test_or_s32 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(s32) = COPY $r0 - %1(s32) = COPY $r1 - %2(s32) = G_OR %0, %1 - ; G_OR with s32 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}:_(s32) = G_OR {{%[0-9]+, %[0-9]+}} - $r0 = COPY %2(s32) - BX_RET 14, $noreg, implicit $r0 - -... ---- -name: test_xor_s8 -# CHECK-LABEL: name: test_xor_s8 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s8) = G_LOAD %0 :: (load 1) - %2(p0) = COPY $r0 - %3(s8) = G_LOAD %2 :: (load 1) - %4(s8) = G_XOR %1, %3 - ; G_XOR with s8 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_XOR {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s8) = G_XOR {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s8) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_xor_s16 -# CHECK-LABEL: name: test_xor_s16 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(p0) = COPY $r0 - %1(s16) = G_LOAD %0 :: (load 2) - %2(p0) = COPY $r0 - %3(s16) = G_LOAD %2 :: (load 2) - %4(s16) = G_XOR %1, %3 - ; G_XOR with s16 should widen - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_XOR {{%[0-9]+, %[0-9]+}} - ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}} - ; CHECK-NOT: {{%[0-9]+}}:_(s16) = G_XOR {{%[0-9]+, %[0-9]+}} - %5(s32) = G_SEXT %4(s16) - $r0 = COPY %5(s32) - BX_RET 14, $noreg, implicit $r0 -... ---- -name: test_xor_s32 -# CHECK-LABEL: name: test_xor_s32 -legalized: false -# CHECK: legalized: true -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } -body: | - bb.0: - liveins: $r0, $r1 - - %0(s32) = COPY $r0 - %1(s32) = COPY $r1 - %2(s32) = G_XOR %0, %1 - ; G_XOR with s32 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}:_(s32) = G_XOR {{%[0-9]+, %[0-9]+}} - $r0 = COPY %2(s32) - BX_RET 14, $noreg, implicit $r0 - -... ---- name: test_lshr_s32 # CHECK-LABEL: name: test_lshr_s32 legalized: false @@ -792,8 +136,8 @@ body: | BX_RET 14, $noreg, implicit $r0 ... --- -name: test_legal_loads_stores -# CHECK-LABEL: name: test_legal_loads_stores +name: test_load_store_64 +# CHECK-LABEL: name: test_load_store_64 legalized: false # CHECK: legalized: true regBankSelected: false @@ -811,32 +155,12 @@ body: | bb.0: liveins: $r0 - ; These are all legal, so we should find them unchanged in the output + ; These are legal, so we should find them unchanged in the output ; CHECK-DAG: G_STORE {{%[0-9]+}}(s64), %0(p0) - ; CHECK-DAG: G_STORE {{%[0-9]+}}(s32), %0(p0) - ; CHECK-DAG: G_STORE {{%[0-9]+}}(s16), %0(p0) - ; CHECK-DAG: G_STORE {{%[0-9]+}}(s8), %0(p0) - ; CHECK-DAG: G_STORE {{%[0-9]+}}(s1), %0(p0) - ; CHECK-DAG: G_STORE {{%[0-9]+}}(p0), %0(p0) ; CHECK-DAG: {{%[0-9]+}}:_(s64) = G_LOAD %0(p0) - ; CHECK-DAG: {{%[0-9]+}}:_(s32) = G_LOAD %0(p0) - ; CHECK-DAG: {{%[0-9]+}}:_(s16) = G_LOAD %0(p0) - ; CHECK-DAG: {{%[0-9]+}}:_(s8) = G_LOAD %0(p0) - ; CHECK-DAG: {{%[0-9]+}}:_(s1) = G_LOAD %0(p0) - ; CHECK-DAG: {{%[0-9]+}}:_(p0) = G_LOAD %0(p0) %0(p0) = COPY $r0 %1(s64) = G_LOAD %0(p0) :: (load 8) G_STORE %1(s64), %0(p0) :: (store 8) - %2(s32) = G_LOAD %0(p0) :: (load 4) - G_STORE %2(s32), %0(p0) :: (store 4) - %3(s16) = G_LOAD %0(p0) :: (load 2) - G_STORE %3(s16), %0(p0) :: (store 2) - %4(s8) = G_LOAD %0(p0) :: (load 1) - G_STORE %4(s8), %0(p0) :: (store 1) - %5(s1) = G_LOAD %0(p0) :: (load 1) - G_STORE %5(s1), %0(p0) :: (store 1) - %6(p0) = G_LOAD %0(p0) :: (load 4) - G_STORE %6(p0), %0(p0) :: (store 4) BX_RET 14, $noreg ... --- @@ -865,8 +189,8 @@ body: | BX_RET 14, $noreg, implicit $r0 ... --- -name: test_constants -# CHECK-LABEL: name: test_constants +name: test_constants_s64 +# CHECK-LABEL: name: test_constants_s64 legalized: false # CHECK: legalized: true regBankSelected: false @@ -877,55 +201,21 @@ registers: - { id: 1, class: _ } - { id: 2, class: _ } - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } - - { id: 6, class: _ } - - { id: 7, class: _ } - - { id: 8, class: _ } body: | bb.0: liveins: $r0 - %4(p0) = COPY $r0 - - %0(s32) = G_CONSTANT 42 - ; CHECK: {{%[0-9]+}}:_(s32) = G_CONSTANT 42 - - %1(s16) = G_CONSTANT i16 21 - G_STORE %1(s16), %4(p0) :: (store 2) - ; CHECK-NOT: G_CONSTANT i16 - ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 21 - ; CHECK: {{%[0-9]+}}:_(s16) = G_TRUNC [[EXT]](s32) - ; CHECK-NOT: G_CONSTANT i16 - - %2(s8) = G_CONSTANT i8 10 - G_STORE %2(s8), %4(p0) :: (store 1) - ; CHECK-NOT: G_CONSTANT i8 - ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK: {{%[0-9]+}}:_(s8) = G_TRUNC [[EXT]](s32) - ; CHECK-NOT: G_CONSTANT i8 - - %3(s1) = G_CONSTANT i1 1 - G_STORE %3(s1), %4(p0) :: (store 1) - ; CHECK-NOT: G_CONSTANT i1 - ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; CHECK: {{%[0-9]+}}:_(s1) = G_TRUNC [[EXT]](s32) - ; CHECK-NOT: G_CONSTANT i1 - - %5(p0) = G_CONSTANT 0 - G_STORE %5(p0), %4(p0) :: (store 4) - ; CHECK: {{%[0-9]+}}:_(p0) = G_CONSTANT 0 + %0(p0) = COPY $r0 - %6(s64) = G_CONSTANT i64 17179869200 ; = 4 * 2 ^ 32 + 16 - %7(s32), %8(s32) = G_UNMERGE_VALUES %6(s64) - G_STORE %7(s32), %4(p0) :: (store 4) - G_STORE %8(s32), %4(p0) :: (store 4) + %1(s64) = G_CONSTANT i64 17179869200 ; = 4 * 2 ^ 32 + 16 + %2(s32), %3(s32) = G_UNMERGE_VALUES %1(s64) + G_STORE %2(s32), %0(p0) :: (store 4) + G_STORE %3(s32), %0(p0) :: (store 4) ; CHECK-DAG: {{%[0-9]+}}:_(s32) = G_CONSTANT i32 4 ; CHECK-DAG: {{%[0-9]+}}:_(s32) = G_CONSTANT i32 16 ; CHECK-NOT: G_CONSTANT i64 - $r0 = COPY %0(s32) - BX_RET 14, $noreg, implicit $r0 + BX_RET 14, $noreg ... --- name: test_icmp_s8 diff --git a/test/CodeGen/ARM/GlobalISel/arm-param-lowering.ll b/test/CodeGen/ARM/GlobalISel/arm-param-lowering.ll index 14ddeda39f01d7f966fb48948f5c29249a77c871..3c3da0edd53e7884dace11c30146034d1850b8b7 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-param-lowering.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-param-lowering.ll @@ -1,6 +1,7 @@ -; RUN: llc -mtriple arm-unknown -mattr=+vfp2,+v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LITTLE -; RUN: llc -mtriple armeb-unknown -mattr=+vfp2,+v4t -global-isel -global-isel-abort=0 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=BIG +; RUN: llc -mtriple arm-unknown -mattr=+vfp2,+v4t -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=ARM -check-prefix=LITTLE +; RUN: llc -mtriple armeb-unknown -mattr=+vfp2,+v4t -global-isel -global-isel-abort=0 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=ARM -check-prefix=BIG ; XFAIL: armeb +; RUN: llc -mtriple thumb-unknown -mattr=+vfp2,+v6t2 -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LITTLE -check-prefix=THUMB declare arm_aapcscc i32* @simple_reg_params_target(i32, i32*) @@ -11,11 +12,13 @@ define arm_aapcscc i32* @test_call_simple_reg_params(i32 *%a, i32 %b) { ; CHECK: ADJCALLSTACKDOWN 0, 0, 14, $noreg, implicit-def $sp, implicit $sp ; CHECK-DAG: $r0 = COPY [[BVREG]] ; CHECK-DAG: $r1 = COPY [[AVREG]] -; CHECK: BL @simple_reg_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0 +; ARM: BL @simple_reg_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0 +; THUMB: tBL 14, $noreg, @simple_reg_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0 ; CHECK: [[RVREG:%[0-9]+]]:_(p0) = COPY $r0 ; CHECK: ADJCALLSTACKUP 0, 0, 14, $noreg, implicit-def $sp, implicit $sp ; CHECK: $r0 = COPY [[RVREG]] -; CHECK: BX_RET 14, $noreg, implicit $r0 +; ARM: BX_RET 14, $noreg, implicit $r0 +; THUMB: tBX_RET 14, $noreg, implicit $r0 entry: %r = notail call arm_aapcscc i32 *@simple_reg_params_target(i32 %b, i32 *%a) ret i32 *%r @@ -40,11 +43,13 @@ define arm_aapcscc i32* @test_call_simple_stack_params(i32 *%a, i32 %b) { ; CHECK: [[OFF2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CHECK: [[FI2:%[0-9]+]]:_(p0) = G_GEP [[SP2]], [[OFF2]](s32) ; CHECK: G_STORE [[AVREG]](p0), [[FI2]](p0){{.*}}store 4 -; CHECK: BL @simple_stack_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0 +; ARM: BL @simple_stack_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0 +; THUMB: tBL 14, $noreg, @simple_stack_params_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0 ; CHECK: [[RVREG:%[0-9]+]]:_(p0) = COPY $r0 ; CHECK: ADJCALLSTACKUP 8, 0, 14, $noreg, implicit-def $sp, implicit $sp ; CHECK: $r0 = COPY [[RVREG]] -; CHECK: BX_RET 14, $noreg, implicit $r0 +; ARM: BX_RET 14, $noreg, implicit $r0 +; THUMB: tBX_RET 14, $noreg, implicit $r0 entry: %r = notail call arm_aapcscc i32 *@simple_stack_params_target(i32 %b, i32 *%a, i32 %b, i32 *%a, i32 %b, i32 *%a) ret i32 *%r @@ -94,13 +99,15 @@ define arm_aapcscc signext i16 @test_call_ext_params(i8 %a, i16 %b, i1 %c) { ; CHECK: [[FI5:%[0-9]+]]:_(p0) = G_GEP [[SP5]], [[OFF5]](s32) ; CHECK: [[ZEXTC:%[0-9]+]]:_(s32) = G_ZEXT [[CVREG]] ; CHECK: G_STORE [[ZEXTC]](s32), [[FI5]](p0){{.*}}store 4 -; CHECK: BL @ext_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0 +; ARM: BL @ext_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0 +; THUMB: tBL 14, $noreg, @ext_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0 ; CHECK: [[R0VREG:%[0-9]+]]:_(s32) = COPY $r0 ; CHECK: [[RVREG:%[0-9]+]]:_(s16) = G_TRUNC [[R0VREG]] ; CHECK: ADJCALLSTACKUP 20, 0, 14, $noreg, implicit-def $sp, implicit $sp ; CHECK: [[RExtVREG:%[0-9]+]]:_(s32) = G_SEXT [[RVREG]] ; CHECK: $r0 = COPY [[RExtVREG]] -; CHECK: BX_RET 14, $noreg, implicit $r0 +; ARM: BX_RET 14, $noreg, implicit $r0 +; THUMB: tBX_RET 14, $noreg, implicit $r0 entry: %r = notail call arm_aapcscc signext i16 @ext_target(i8 signext %a, i8 zeroext %a, i16 signext %b, i16 zeroext %b, i8 signext %a, i8 zeroext %a, i16 signext %b, i16 zeroext %b, i1 zeroext %c) ret i16 %r @@ -115,11 +122,13 @@ define arm_aapcs_vfpcc double @test_call_vfpcc_fp_params(double %a, float %b) { ; CHECK: ADJCALLSTACKDOWN 0, 0, 14, $noreg, implicit-def $sp, implicit $sp ; CHECK-DAG: $s0 = COPY [[BVREG]] ; CHECK-DAG: $d1 = COPY [[AVREG]] -; CHECK: BL @vfpcc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $d1, implicit-def $d0 +; ARM: BL @vfpcc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $d1, implicit-def $d0 +; THUMB: tBL 14, $noreg, @vfpcc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $d1, implicit-def $d0 ; CHECK: [[RVREG:%[0-9]+]]:_(s64) = COPY $d0 ; CHECK: ADJCALLSTACKUP 0, 0, 14, $noreg, implicit-def $sp, implicit $sp ; CHECK: $d0 = COPY [[RVREG]] -; CHECK: BX_RET 14, $noreg, implicit $d0 +; ARM: BX_RET 14, $noreg, implicit $d0 +; THUMB: tBX_RET 14, $noreg, implicit $d0 entry: %r = notail call arm_aapcs_vfpcc double @vfpcc_fp_target(float %b, double %a) ret double %r @@ -149,7 +158,8 @@ define arm_aapcscc double @test_call_aapcs_fp_params(double %a, float %b) { ; CHECK: [[OFF2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK: [[FI2:%[0-9]+]]:_(p0) = G_GEP [[SP2]], [[OFF2]](s32) ; CHECK: G_STORE [[AVREG]](s64), [[FI2]](p0){{.*}}store 8 -; CHECK: BL @aapcscc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1 +; ARM: BL @aapcscc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1 +; THUMB: tBL 14, $noreg, @aapcscc_fp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1 ; CHECK-DAG: [[R1:%[0-9]+]]:_(s32) = COPY $r0 ; CHECK-DAG: [[R2:%[0-9]+]]:_(s32) = COPY $r1 ; LITTLE: [[RVREG:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[R1]](s32), [[R2]](s32) @@ -160,7 +170,8 @@ define arm_aapcscc double @test_call_aapcs_fp_params(double %a, float %b) { ; LITTLE-DAG: $r1 = COPY [[R2]] ; BIG-DAG: $r0 = COPY [[R2]] ; BIG-DAG: $r1 = COPY [[R1]] -; CHECK: BX_RET 14, $noreg, implicit $r0, implicit $r1 +; ARM: BX_RET 14, $noreg, implicit $r0, implicit $r1 +; THUMB: tBX_RET 14, $noreg, implicit $r0, implicit $r1 entry: %r = notail call arm_aapcscc double @aapcscc_fp_target(float %b, double %a, float %b, double %a) ret double %r @@ -173,11 +184,13 @@ define arm_aapcs_vfpcc float @test_call_different_call_conv(float %x) { ; CHECK: [[X:%[0-9]+]]:_(s32) = COPY $s0 ; CHECK: ADJCALLSTACKDOWN 0, 0, 14, $noreg, implicit-def $sp, implicit $sp ; CHECK: $r0 = COPY [[X]] -; CHECK: BL @different_call_conv_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit-def $r0 +; ARM: BL @different_call_conv_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit-def $r0 +; THUMB: tBL 14, $noreg, @different_call_conv_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit-def $r0 ; CHECK: [[R:%[0-9]+]]:_(s32) = COPY $r0 ; CHECK: ADJCALLSTACKUP 0, 0, 14, $noreg, implicit-def $sp, implicit $sp ; CHECK: $s0 = COPY [[R]] -; CHECK: BX_RET 14, $noreg, implicit $s0 +; ARM: BX_RET 14, $noreg, implicit $s0 +; THUMB: tBX_RET 14, $noreg, implicit $s0 entry: %r = notail call arm_aapcscc float @different_call_conv_target(float %x) ret float %r @@ -200,7 +213,8 @@ define arm_aapcscc [3 x i32] @test_tiny_int_arrays([2 x i32] %arr) { ; CHECK: [[R0:%[0-9]+]]:_(s32), [[R1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INS2]](s64) ; CHECK: $r0 = COPY [[R0]] ; CHECK: $r1 = COPY [[R1]] -; CHECK: BL @tiny_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1 +; ARM: BL @tiny_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1 +; THUMB: tBL 14, $noreg, @tiny_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1 ; CHECK: [[R0:%[0-9]+]]:_(s32) = COPY $r0 ; CHECK: [[R1:%[0-9]+]]:_(s32) = COPY $r1 ; CHECK: [[R2:%[0-9]+]]:_(s32) = COPY $r2 @@ -215,7 +229,8 @@ define arm_aapcscc [3 x i32] @test_tiny_int_arrays([2 x i32] %arr) { ; CHECK: $r0 = COPY [[EXT3]] ; CHECK: $r1 = COPY [[EXT4]] ; CHECK: $r2 = COPY [[EXT5]] -; CHECK: BX_RET 14, $noreg, implicit $r0, implicit $r1, implicit $r2 +; ARM: BX_RET 14, $noreg, implicit $r0, implicit $r1, implicit $r2 +; THUMB: tBX_RET 14, $noreg, implicit $r0, implicit $r1, implicit $r2 entry: %r = notail call arm_aapcscc [3 x i32] @tiny_int_arrays_target([2 x i32] %arr) ret [3 x i32] %r @@ -249,9 +264,11 @@ define arm_aapcscc void @test_multiple_int_arrays([2 x i32] %arr0, [2 x i32] %ar ; CHECK: $r1 = COPY [[R1]] ; CHECK: $r2 = COPY [[R2]] ; CHECK: $r3 = COPY [[R3]] -; CHECK: BL @multiple_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3 +; ARM: BL @multiple_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3 +; THUMB: tBL 14, $noreg, @multiple_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3 ; CHECK: ADJCALLSTACKUP 0, 0, 14, $noreg, implicit-def $sp, implicit $sp -; CHECK: BX_RET 14, $noreg +; ARM: BX_RET 14, $noreg +; THUMB: tBX_RET 14, $noreg entry: notail call arm_aapcscc void @multiple_int_arrays_target([2 x i32] %arr0, [2 x i32] %arr1) ret void @@ -293,9 +310,11 @@ define arm_aapcscc void @test_large_int_arrays([20 x i32] %arr) { ; CHECK: [[OFF_LAST_ELEMENT:%[0-9]+]]:_(s32) = G_CONSTANT i32 60 ; CHECK: [[LAST_STACK_ARG_ADDR:%[0-9]+]]:_(p0) = G_GEP [[SP]], [[OFF_LAST_ELEMENT]](s32) ; CHECK: G_STORE [[LAST_STACK_ELEMENT]](s32), [[LAST_STACK_ARG_ADDR]]{{.*}}store 4 -; CHECK: BL @large_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3 +; ARM: BL @large_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3 +; THUMB: tBL 14, $noreg, @large_int_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3 ; CHECK: ADJCALLSTACKUP 64, 0, 14, $noreg, implicit-def $sp, implicit $sp -; CHECK: BX_RET 14, $noreg +; ARM: BX_RET 14, $noreg +; THUMB: tBX_RET 14, $noreg entry: notail call arm_aapcscc void @large_int_arrays_target([20 x i32] %arr) ret void @@ -342,7 +361,8 @@ define arm_aapcscc [2 x float] @test_fp_arrays_aapcs([3 x double] %arr) { ; CHECK: [[ARR2_OFFSET:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[ARR2_ADDR:%[0-9]+]]:_(p0) = G_GEP [[SP]], [[ARR2_OFFSET]](s32) ; CHECK: G_STORE [[ARR2]](s64), [[ARR2_ADDR]](p0){{.*}}store 8 -; CHECK: BL @fp_arrays_aapcs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1 +; ARM: BL @fp_arrays_aapcs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1 +; THUMB: tBL 14, $noreg, @fp_arrays_aapcs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1 ; CHECK: [[R0:%[0-9]+]]:_(s32) = COPY $r0 ; CHECK: [[R1:%[0-9]+]]:_(s32) = COPY $r1 ; CHECK: [[R_MERGED:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32) @@ -351,7 +371,8 @@ define arm_aapcscc [2 x float] @test_fp_arrays_aapcs([3 x double] %arr) { ; CHECK: [[EXT5:%[0-9]+]]:_(s32) = G_EXTRACT [[R_MERGED]](s64), 32 ; CHECK: $r0 = COPY [[EXT4]] ; CHECK: $r1 = COPY [[EXT5]] -; CHECK: BX_RET 14, $noreg, implicit $r0, implicit $r1 +; ARM: BX_RET 14, $noreg, implicit $r0, implicit $r1 +; THUMB: tBX_RET 14, $noreg, implicit $r0, implicit $r1 entry: %r = notail call arm_aapcscc [2 x float] @fp_arrays_aapcs_target([3 x double] %arr) ret [2 x float] %r @@ -433,7 +454,8 @@ define arm_aapcs_vfpcc [4 x float] @test_fp_arrays_aapcs_vfp([3 x double] %x, [3 ; CHECK: [[Z3_OFFSET:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK: [[Z3_ADDR:%[0-9]+]]:_(p0) = G_GEP [[SP]], [[Z3_OFFSET]](s32) ; CHECK: G_STORE [[Z3]](s64), [[Z3_ADDR]](p0){{.*}}store 8 -; CHECK: BL @fp_arrays_aapcs_vfp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $d1, implicit $d2, implicit $s6, implicit $s7, implicit $s8, implicit-def $s0, implicit-def $s1, implicit-def $s2, implicit-def $s3 +; ARM: BL @fp_arrays_aapcs_vfp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $d1, implicit $d2, implicit $s6, implicit $s7, implicit $s8, implicit-def $s0, implicit-def $s1, implicit-def $s2, implicit-def $s3 +; THUMB: tBL 14, $noreg, @fp_arrays_aapcs_vfp_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $d1, implicit $d2, implicit $s6, implicit $s7, implicit $s8, implicit-def $s0, implicit-def $s1, implicit-def $s2, implicit-def $s3 ; CHECK: [[R0:%[0-9]+]]:_(s32) = COPY $s0 ; CHECK: [[R1:%[0-9]+]]:_(s32) = COPY $s1 ; CHECK: [[R2:%[0-9]+]]:_(s32) = COPY $s2 @@ -448,7 +470,8 @@ define arm_aapcs_vfpcc [4 x float] @test_fp_arrays_aapcs_vfp([3 x double] %x, [3 ; CHECK: $s1 = COPY [[EXT12]] ; CHECK: $s2 = COPY [[EXT13]] ; CHECK: $s3 = COPY [[EXT14]] -; CHECK: BX_RET 14, $noreg, implicit $s0, implicit $s1, implicit $s2, implicit $s3 +; ARM: BX_RET 14, $noreg, implicit $s0, implicit $s1, implicit $s2, implicit $s3 +; THUMB: tBX_RET 14, $noreg, implicit $s0, implicit $s1, implicit $s2, implicit $s3 entry: %r = notail call arm_aapcs_vfpcc [4 x float] @fp_arrays_aapcs_vfp_target([3 x double] %x, [3 x float] %y, [4 x double] %z) ret [4 x float] %r @@ -490,7 +513,8 @@ define arm_aapcscc [2 x i32*] @test_tough_arrays([6 x [4 x i32]] %arr) { ; CHECK: [[OFF_LAST_ELEMENT:%[0-9]+]]:_(s32) = G_CONSTANT i32 76 ; CHECK: [[LAST_STACK_ARG_ADDR:%[0-9]+]]:_(p0) = G_GEP [[SP]], [[OFF_LAST_ELEMENT]](s32) ; CHECK: G_STORE [[LAST_STACK_ELEMENT]](s32), [[LAST_STACK_ARG_ADDR]]{{.*}}store 4 -; CHECK: BL @tough_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1 +; ARM: BL @tough_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1 +; THUMB: tBL 14, $noreg, @tough_arrays_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1 ; CHECK: [[R0:%[0-9]+]]:_(s32) = COPY $r0 ; CHECK: [[R1:%[0-9]+]]:_(s32) = COPY $r1 ; CHECK: [[RES_ARR:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32) @@ -499,7 +523,8 @@ define arm_aapcscc [2 x i32*] @test_tough_arrays([6 x [4 x i32]] %arr) { ; CHECK: [[EXT2:%[0-9]+]]:_(p0) = G_EXTRACT [[RES_ARR]](s64), 32 ; CHECK: $r0 = COPY [[EXT1]] ; CHECK: $r1 = COPY [[EXT2]] -; CHECK: BX_RET 14, $noreg, implicit $r0, implicit $r1 +; ARM: BX_RET 14, $noreg, implicit $r0, implicit $r1 +; THUMB: tBX_RET 14, $noreg, implicit $r0, implicit $r1 entry: %r = notail call arm_aapcscc [2 x i32*] @tough_arrays_target([6 x [4 x i32]] %arr) ret [2 x i32*] %r @@ -522,7 +547,8 @@ define arm_aapcscc {i32, i32} @test_structs({i32, i32} %x) { ; CHECK: [[X0:%[0-9]+]]:_(s32), [[X1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INS2]](s64) ; CHECK-DAG: $r0 = COPY [[X0]](s32) ; CHECK-DAG: $r1 = COPY [[X1]](s32) -; CHECK: BL @structs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1 +; ARM: BL @structs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1 +; THUMB: tBL 14, $noreg, @structs_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit-def $r0, implicit-def $r1 ; CHECK: [[R0:%[0-9]+]]:_(s32) = COPY $r0 ; CHECK: [[R1:%[0-9]+]]:_(s32) = COPY $r1 ; CHECK: [[R:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32) @@ -531,7 +557,8 @@ define arm_aapcscc {i32, i32} @test_structs({i32, i32} %x) { ; CHECK: [[EXT4:%[0-9]+]]:_(s32) = G_EXTRACT [[R]](s64), 32 ; CHECK: $r0 = COPY [[EXT3]](s32) ; CHECK: $r1 = COPY [[EXT4]](s32) -; CHECK: BX_RET 14, $noreg, implicit $r0, implicit $r1 +; ARM: BX_RET 14, $noreg, implicit $r0, implicit $r1 +; THUMB: tBX_RET 14, $noreg, implicit $r0, implicit $r1 %r = notail call arm_aapcscc {i32, i32} @structs_target({i32, i32} %x) ret {i32, i32} %r } diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index 4634e5a0d9dfa5095af4b30337a93d7664a6947c..281218192a18826c0b1322fbe3394089693c24ac 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -27,6 +27,8 @@ define void @test_inttoptr_s32() { ret void } define void @test_ptrtoint_s32() { ret void } + define void @test_ctlz_s32() #3 { ret void } + @a_global = global float 1.0 define void @test_globals() { ret void } @@ -83,6 +85,7 @@ attributes #0 = { "target-features"="+vfp2"} attributes #1 = { "target-features"="+hwdiv-arm" } attributes #2 = { "target-features"="+vfp4"} + attributes #3 = { "target-features"="+v5t"} ... --- name: test_add_s32 @@ -561,6 +564,25 @@ body: | BX_RET 14, $noreg, implicit $r0 ... --- +name: test_ctlz_s32 +# CHECK-LABEL: name: test_ctlz_s32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb, preferred-register: '' } +# CHECK: - { id: 1, class: gprb, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + %0(s32) = COPY $r0 + %1(s32) = G_CTLZ %0(s32) + $r0 = COPY %1(s32) + BX_RET 14, $noreg, implicit $r0 +... +--- name: test_globals # CHECK-LABEL: name: test_globals legalized: true diff --git a/test/CodeGen/ARM/GlobalISel/arm-select-globals-ropi-rwpi.mir b/test/CodeGen/ARM/GlobalISel/arm-select-globals-ropi-rwpi.mir index fa15f09497e51c3c50c6b9ffe80a8222176ccbd4..dc017cc2654e5d6a7735d6689a75c7fac6a7b545 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-select-globals-ropi-rwpi.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-select-globals-ropi-rwpi.mir @@ -1,9 +1,9 @@ -# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RW-DEFAULT-MOVT,RW-DEFAULT,ROPI-MOVT,ROPI -# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RW-DEFAULT-NOMOVT,RW-DEFAULT,ROPI-NOMOVT,ROPI -# RUN: llc -O0 -mtriple arm-linux -relocation-model=rwpi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-MOVT,RWPI,RO-DEFAULT-MOVT,RO-DEFAULT -# RUN: llc -O0 -mtriple arm-linux -relocation-model=rwpi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-NOMOVT,RWPI,RO-DEFAULT-NOMOVT,RO-DEFAULT -# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi-rwpi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-MOVT,RWPI,ROPI-MOVT,ROPI -# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi-rwpi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-NOMOVT,RWPI,ROPI-NOMOVT,ROPI +# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RW-DEFAULT-MOVT,ROPI-MOVT +# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RW-DEFAULT-NOMOVT,ROPI-NOMOVT +# RUN: llc -O0 -mtriple arm-linux -relocation-model=rwpi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-MOVT,RWPI,RO-DEFAULT-MOVT +# RUN: llc -O0 -mtriple arm-linux -relocation-model=rwpi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-NOMOVT,RWPI,RO-DEFAULT-NOMOVT +# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi-rwpi -mattr=-no-movt,+v8m -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-MOVT,RWPI,ROPI-MOVT +# RUN: llc -O0 -mtriple arm-linux -relocation-model=ropi-rwpi -mattr=+no-movt -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,RWPI-NOMOVT,RWPI,ROPI-NOMOVT --- | @internal_global = internal global i32 42 define void @test_internal_global() { ret void } diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-arithmetic-ops.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-arithmetic-ops.mir new file mode 100644 index 0000000000000000000000000000000000000000..cf42a815cd4b8b4668da0f1324b21865a09d1e4b --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/thumb-select-arithmetic-ops.mir @@ -0,0 +1,251 @@ +# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +--- | + define void @test_add_regs() { ret void } + define void @test_add_fold_imm() { ret void } + define void @test_add_fold_imm12() { ret void } + define void @test_add_no_fold_imm() { ret void } + + define void @test_sub_imm_lhs() { ret void } + define void @test_sub_imm_rhs() { ret void } + + define void @test_mul() { ret void } + define void @test_mla() { ret void } +... +--- +name: test_add_regs +# CHECK-LABEL: name: test_add_regs +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0, $r1 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY $r0 + + %1(s32) = COPY $r1 + ; CHECK: [[VREGY:%[0-9]+]]:rgpr = COPY $r1 + + %2(s32) = G_ADD %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]]:gprnopc = t2ADDrr [[VREGX]], [[VREGY]], 14, $noreg, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_add_fold_imm +# CHECK-LABEL: name: test_add_fold_imm +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY $r0 + + %1(s32) = G_CONSTANT i32 786444 ; 0x000c000c + %2(s32) = G_ADD %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]]:gprnopc = t2ADDri [[VREGX]], 786444, 14, $noreg, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_add_fold_imm12 +# CHECK-LABEL: name: test_add_fold_imm12 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:gpr = COPY $r0 + + %1(s32) = G_CONSTANT i32 4093 + %2(s32) = G_ADD %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]]:gprnopc = t2ADDri12 [[VREGX]], 4093, 14, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_add_no_fold_imm +# CHECK-LABEL: name: test_add_no_fold_imm +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY $r0 + + %1(s32) = G_CONSTANT i32 185470479 ; 0x0b0e0e0f + ; CHECK: [[VREGY:%[0-9]+]]:rgpr = t2MOVi32imm 185470479 + + %2(s32) = G_ADD %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]]:gprnopc = t2ADDrr [[VREGX]], [[VREGY]], 14, $noreg, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_sub_imm_lhs +# CHECK-LABEL: name: test_sub_imm_lhs +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0 + + %1(s32) = G_CONSTANT i32 786444 ; 0x000c000c + %2(s32) = G_SUB %1, %0 + ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2RSBri [[VREGX]], 786444, 14, $noreg, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_sub_imm_rhs +# CHECK-LABEL: name: test_sub_imm_rhs +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY $r0 + + %1(s32) = G_CONSTANT i32 786444 ; 0x000c000c + %2(s32) = G_SUB %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]]:gprnopc = t2SUBri [[VREGX]], 786444, 14, $noreg, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_mul +# CHECK-LABEL: name: test_mul +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0, $r1 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0 + + %1(s32) = COPY $r1 + ; CHECK: [[VREGY:%[0-9]+]]:rgpr = COPY $r1 + + %2(s32) = G_MUL %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2MUL [[VREGX]], [[VREGY]], 14, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_mla +# CHECK-LABEL: name: test_mla +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } +body: | + bb.0: + liveins: $r0, $r1, $r2 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0 + + %1(s32) = COPY $r1 + ; CHECK: [[VREGY:%[0-9]+]]:rgpr = COPY $r1 + + %2(s32) = COPY $r2 + ; CHECK: [[VREGZ:%[0-9]+]]:rgpr = COPY $r2 + + %3(s32) = G_MUL %0, %1 + %4(s32) = G_ADD %3, %2 + ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2MLA [[VREGX]], [[VREGY]], [[VREGZ]], 14, $noreg + + $r0 = COPY %4(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-casts.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-casts.mir new file mode 100644 index 0000000000000000000000000000000000000000..df5417e7a917ef76d126383acd1bc8bd2e039e1a --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/thumb-select-casts.mir @@ -0,0 +1,51 @@ +# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +--- | + define void @test_inttoptr_s32() { ret void } + define void @test_ptrtoint_s32() { ret void } +... +--- +name: test_inttoptr_s32 +# CHECK-LABEL: name: test_inttoptr_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + %1(p0) = G_INTTOPTR %0(s32) + ; CHECK: [[INT:%[0-9]+]]:gpr = COPY $r0 + + $r0 = COPY %1(p0) + ; CHECK: $r0 = COPY [[INT]] + + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_ptrtoint_s32 +# CHECK-LABEL: name: test_ptrtoint_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(p0) = COPY $r0 + %1(s32) = G_PTRTOINT %0(p0) + ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY $r0 + + $r0 = COPY %1(s32) + ; CHECK: $r0 = COPY [[PTR]] + + BX_RET 14, $noreg, implicit $r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-exts.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-exts.mir new file mode 100644 index 0000000000000000000000000000000000000000..365a614d6ec397b84286deefc26339aa5fff2bd0 --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/thumb-select-exts.mir @@ -0,0 +1,288 @@ +# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +--- | + define void @test_trunc_and_zext_s1() { ret void } + define void @test_trunc_and_sext_s1() { ret void } + define void @test_trunc_and_anyext_s1() { ret void } + + define void @test_trunc_and_zext_s8() { ret void } + define void @test_trunc_and_sext_s8() { ret void } + define void @test_trunc_and_anyext_s8() { ret void } + + define void @test_trunc_and_zext_s16() { ret void } + define void @test_trunc_and_sext_s16() { ret void } + define void @test_trunc_and_anyext_s16() { ret void } +... +--- +name: test_trunc_and_zext_s1 +# CHECK-LABEL: name: test_trunc_and_zext_s1 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0 + + %1(s1) = G_TRUNC %0(s32) + + %2(s32) = G_ZEXT %1(s1) + ; CHECK: [[RVREG:%[0-9]+]]:rgpr = COPY [[VREG]] + ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2ANDri [[RVREG]], 1, 14, $noreg, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGEXT]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_trunc_and_sext_s1 +# CHECK-LABEL: name: test_trunc_and_sext_s1 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0 + + %1(s1) = G_TRUNC %0(s32) + + %2(s32) = G_SEXT %1(s1) + ; CHECK: [[RVREG:%[0-9]+]]:rgpr = COPY [[VREG]] + ; CHECK: [[VREGAND:%[0-9]+]]:rgpr = t2ANDri [[RVREG]], 1, 14, $noreg, $noreg + ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2RSBri [[VREGAND]], 0, 14, $noreg, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGEXT]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_trunc_and_anyext_s1 +# CHECK-LABEL: name: test_trunc_and_anyext_s1 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0 + + %1(s1) = G_TRUNC %0(s32) + + %2(s32) = G_ANYEXT %1(s1) + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREG]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_trunc_and_zext_s8 +# CHECK-LABEL: name: test_trunc_and_zext_s8 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0 + + %1(s8) = G_TRUNC %0(s32) + ; CHECK: [[VREGTRUNC:%[0-9]+]]:rgpr = COPY [[VREG]] + + %2(s32) = G_ZEXT %1(s8) + ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2UXTB [[VREGTRUNC]], 0, 14, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGEXT]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_trunc_and_sext_s8 +# CHECK-LABEL: name: test_trunc_and_sext_s8 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0 + + %1(s8) = G_TRUNC %0(s32) + ; CHECK: [[VREGTRUNC:%[0-9]+]]:rgpr = COPY [[VREG]] + + %2(s32) = G_SEXT %1(s8) + ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2SXTB [[VREGTRUNC]], 0, 14, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGEXT]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_trunc_and_anyext_s8 +# CHECK-LABEL: name: test_trunc_and_anyext_s8 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0 + + %1(s8) = G_TRUNC %0(s32) + + %2(s32) = G_ANYEXT %1(s8) + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREG]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_trunc_and_zext_s16 +# CHECK-LABEL: name: test_trunc_and_zext_s16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0 + + %1(s16) = G_TRUNC %0(s32) + ; CHECK: [[VREGTRUNC:%[0-9]+]]:rgpr = COPY [[VREG]] + + %2(s32) = G_ZEXT %1(s16) + ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2UXTH [[VREGTRUNC]], 0, 14, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGEXT]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_trunc_and_sext_s16 +# CHECK-LABEL: name: test_trunc_and_sext_s16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0 + + %1(s16) = G_TRUNC %0(s32) + ; CHECK: [[VREGTRUNC:%[0-9]+]]:rgpr = COPY [[VREG]] + + %2(s32) = G_SEXT %1(s16) + ; CHECK: [[VREGEXT:%[0-9]+]]:rgpr = t2SXTH [[VREGTRUNC]], 0, 14, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGEXT]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_trunc_and_anyext_s16 +# CHECK-LABEL: name: test_trunc_and_anyext_s16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREG:%[0-9]+]]:gpr = COPY $r0 + + %1(s16) = G_TRUNC %0(s32) + + %2(s32) = G_ANYEXT %1(s16) + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREG]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-imm.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-imm.mir new file mode 100644 index 0000000000000000000000000000000000000000..4979491aca03e89d641bc0ec03972cb0bd99e446 --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/thumb-select-imm.mir @@ -0,0 +1,66 @@ +# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +--- | + define void @test_movi() { ret void } + define void @test_movi16() { ret void } + define void @test_movi32() { ret void } +... +--- +name: test_movi +# CHECK-LABEL: name: test_movi +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } +body: | + bb.0: + %0(s32) = G_CONSTANT i32 786444 ; 0x000c000c + ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2MOVi 786444, 14, $noreg, $noreg + + $r0 = COPY %0(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_movi16 +# CHECK-LABEL: name: test_movi16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } +body: | + bb.0: + %0(s32) = G_CONSTANT i32 65533 + ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2MOVi16 65533, 14, $noreg + + $r0 = COPY %0(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_movi32 +# CHECK-LABEL: name: test_movi32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } +body: | + bb.0: + %0(s32) = G_CONSTANT i32 185470479 ; 0x0b0e0e0f + ; CHECK: [[VREGY:%[0-9]+]]:rgpr = t2MOVi32imm 185470479 + + $r0 = COPY %0(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-load-store.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-load-store.mir new file mode 100644 index 0000000000000000000000000000000000000000..170c26e996cb512731422e4a85431360416a807b --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/thumb-select-load-store.mir @@ -0,0 +1,84 @@ +# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +--- | + define void @test_s8() { ret void } + define void @test_s16() { ret void } + define void @test_s32() { ret void } +... +--- +name: test_s8 +# CHECK-LABEL: name: test_s8 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(p0) = COPY $r0 + ; CHECK: %[[P:[0-9]+]]:gpr = COPY $r0 + + %1(s8) = G_LOAD %0(p0) :: (load 1) + ; CHECK: %[[V:[0-9]+]]:rgpr = t2LDRBi12 %[[P]], 0, 14, $noreg :: (load 1) + + G_STORE %1(s8), %0(p0) :: (store 1) + ; CHECK: t2STRBi12 %[[V]], %[[P]], 0, 14, $noreg :: (store 1) + + BX_RET 14, $noreg + ; CHECK: BX_RET 14, $noreg +... +--- +name: test_s16 +# CHECK-LABEL: name: test_s16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(p0) = COPY $r0 + ; CHECK: %[[P:[0-9]+]]:gpr = COPY $r0 + + %1(s16) = G_LOAD %0(p0) :: (load 2) + ; CHECK: %[[V:[0-9]+]]:rgpr = t2LDRHi12 %[[P]], 0, 14, $noreg :: (load 2) + + G_STORE %1(s16), %0(p0) :: (store 2) + ; CHECK: t2STRHi12 %[[V]], %[[P]], 0, 14, $noreg :: (store 2) + + BX_RET 14, $noreg + ; CHECK: BX_RET 14, $noreg +... +--- +name: test_s32 +# CHECK-LABEL: name: test_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(p0) = COPY $r0 + ; CHECK: %[[P:[0-9]+]]:gpr = COPY $r0 + + %1(s32) = G_LOAD %0(p0) :: (load 4) + ; CHECK: %[[V:[0-9]+]]:gpr = t2LDRi12 %[[P]], 0, 14, $noreg :: (load 4) + + G_STORE %1(s32), %0(p0) :: (store 4) + ; CHECK: t2STRi12 %[[V]], %[[P]], 0, 14, $noreg :: (store 4) + + BX_RET 14, $noreg + ; CHECK: BX_RET 14, $noreg +... diff --git a/test/CodeGen/ARM/GlobalISel/thumb-select-logical-ops.mir b/test/CodeGen/ARM/GlobalISel/thumb-select-logical-ops.mir new file mode 100644 index 0000000000000000000000000000000000000000..d63c59942b784ab7c681f577270a00d0df2202be --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/thumb-select-logical-ops.mir @@ -0,0 +1,219 @@ +# RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +--- | + define void @test_and_regs() { ret void } + define void @test_and_imm() { ret void } + + define void @test_bfc() { ret void } + define void @test_no_bfc_bad_mask() { ret void } + + define void @test_mvn() { ret void } + define void @test_bic() { ret void } + define void @test_orn() { ret void } +... +--- +name: test_and_regs +# CHECK-LABEL: name: test_and_regs +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0, $r1 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0 + + %1(s32) = COPY $r1 + ; CHECK: [[VREGY:%[0-9]+]]:rgpr = COPY $r1 + + %2(s32) = G_AND %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2ANDrr [[VREGX]], [[VREGY]], 14, $noreg, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_and_imm +# CHECK-LABEL: name: test_and_imm +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0 + + %1(s32) = G_CONSTANT i32 786444 ; 0x000c000c + %2(s32) = G_AND %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2ANDri [[VREGX]], 786444, 14, $noreg, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_bfc +# CHECK-LABEL: name: test_bfc +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0 + + %1(s32) = G_CONSTANT i32 -65529 ; 0xFFFF0007 + %2(s32) = G_AND %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2BFC [[VREGX]], -65529, 14, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_no_bfc_bad_mask +# CHECK-LABEL: name: test_no_bfc_bad_mask +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + + %1(s32) = G_CONSTANT i32 786444 ; 0x000c000c + %2(s32) = G_AND %0, %1 + ; CHECK-NOT: t2BFC + + $r0 = COPY %2(s32) + + BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_mvn +# CHECK-LABEL: name: test_mvn +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: $r0 + + %0(s32) = COPY $r0 + ; CHECK: [[VREGX:%[0-9]+]]:rgpr = COPY $r0 + + %1(s32) = G_CONSTANT i32 -1 + %2(s32) = G_XOR %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2MVNr [[VREGX]], 14, $noreg, $noreg + + $r0 = COPY %2(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_bic +# CHECK-LABEL: name: test_bic +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } +body: | + bb.0: + liveins: $r0, $r1 + + %0(s32) = COPY $r0 + %1(s32) = COPY $r1 + ; CHECK-DAG: [[VREGX:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK-DAG: [[VREGY:%[0-9]+]]:rgpr = COPY $r1 + + %2(s32) = G_CONSTANT i32 -1 + %3(s32) = G_XOR %1, %2 + + %4(s32) = G_AND %0, %3 + ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2BICrr [[VREGX]], [[VREGY]], 14, $noreg, $noreg + + $r0 = COPY %4(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... +--- +name: test_orn +# CHECK-LABEL: name: test_orn +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } +body: | + bb.0: + liveins: $r0, $r1 + + %0(s32) = COPY $r0 + %1(s32) = COPY $r1 + ; CHECK-DAG: [[VREGX:%[0-9]+]]:rgpr = COPY $r0 + ; CHECK-DAG: [[VREGY:%[0-9]+]]:rgpr = COPY $r1 + + %2(s32) = G_CONSTANT i32 -1 + %3(s32) = G_XOR %1, %2 + + %4(s32) = G_OR %0, %3 + ; CHECK: [[VREGRES:%[0-9]+]]:rgpr = t2ORNrr [[VREGX]], [[VREGY]], 14, $noreg, $noreg + + $r0 = COPY %4(s32) + ; CHECK: $r0 = COPY [[VREGRES]] + + BX_RET 14, $noreg, implicit $r0 + ; CHECK: BX_RET 14, $noreg, implicit $r0 +... diff --git a/test/CodeGen/ARM/alloca-align.ll b/test/CodeGen/ARM/alloca-align.ll index 6186d137ef7fdd64421a03db1b8cffa603b70fc0..3326d361c07f07111e522e48d10dc655a1402df5 100644 --- a/test/CodeGen/ARM/alloca-align.ll +++ b/test/CodeGen/ARM/alloca-align.ll @@ -12,8 +12,7 @@ declare void @bar(i32*, [20000 x i8]* byval) ; And a base pointer getting used. ; CHECK: mov r6, sp ; Which is passed to the call -; CHECK: add [[REG:r[0-9]+|lr]], r6, #19456 -; CHECK: add r0, [[REG]], #536 +; CHECK: mov r0, r6 ; CHECK: bl bar define void @foo([20000 x i8]* %addr) { %tmp = alloca [4 x i32], align 32 diff --git a/test/CodeGen/ARM/arm-storebytesmerge.ll b/test/CodeGen/ARM/arm-storebytesmerge.ll index edc25302f7c5701a8798e67594539d9be80de224..00c5914b34b8bd6aca2652a34c1eafd0d90597a3 100644 --- a/test/CodeGen/ARM/arm-storebytesmerge.ll +++ b/test/CodeGen/ARM/arm-storebytesmerge.ll @@ -8,101 +8,95 @@ target triple = "thumbv7em-arm-none-eabi" define arm_aapcs_vfpcc void @test(i8* %v50) #0 { ; CHECK-LABEL: test: ; CHECK: @ %bb.0: -; CHECK-NEXT: movw r1, #35722 -; CHECK-NEXT: movt r1, #36236 -; CHECK-NEXT: str.w r1, [r0, #394] -; CHECK-NEXT: movw r1, #36750 -; CHECK-NEXT: movt r1, #37264 -; CHECK-NEXT: str.w r1, [r0, #398] -; CHECK-NEXT: movw r1, #37778 -; CHECK-NEXT: movt r1, #38292 -; CHECK-NEXT: str.w r1, [r0, #402] -; CHECK-NEXT: movw r1, #38806 -; CHECK-NEXT: movt r1, #39320 -; CHECK-NEXT: str.w r1, [r0, #406] -; CHECK-NEXT: movw r1, #39834 -; CHECK-NEXT: strh.w r1, [r0, #410] -; CHECK-NEXT: movw r1, #40348 -; CHECK-NEXT: movt r1, #40862 -; CHECK-NEXT: str.w r1, [r0, #412] -; CHECK-NEXT: movw r1, #41376 -; CHECK-NEXT: movt r1, #41890 -; CHECK-NEXT: str.w r1, [r0, #416] -; CHECK-NEXT: movw r1, #42404 -; CHECK-NEXT: movt r1, #42918 -; CHECK-NEXT: str.w r1, [r0, #420] -; CHECK-NEXT: movw r1, #43432 -; CHECK-NEXT: movt r1, #43946 -; CHECK-NEXT: str.w r1, [r0, #424] -; CHECK-NEXT: movw r1, #44460 -; CHECK-NEXT: movt r1, #44974 -; CHECK-NEXT: str.w r1, [r0, #428] -; CHECK-NEXT: movw r1, #45488 -; CHECK-NEXT: strh.w r1, [r0, #432] +; CHECK-NEXT: movw r1, #65534 +; CHECK-NEXT: strh.w r1, [r0, #510] +; CHECK-NEXT: movw r1, #64506 +; CHECK-NEXT: movt r1, #65020 +; CHECK-NEXT: str.w r1, [r0, #506] +; CHECK-NEXT: movw r1, #63478 +; CHECK-NEXT: movt r1, #63992 +; CHECK-NEXT: str.w r1, [r0, #502] +; CHECK-NEXT: movw r1, #62450 +; CHECK-NEXT: movt r1, #62964 +; CHECK-NEXT: str.w r1, [r0, #498] +; CHECK-NEXT: movw r1, #61422 +; CHECK-NEXT: movt r1, #61936 +; CHECK-NEXT: str.w r1, [r0, #494] +; CHECK-NEXT: movw r1, #60394 +; CHECK-NEXT: movt r1, #60908 +; CHECK-NEXT: str.w r1, [r0, #490] +; CHECK-NEXT: movw r1, #59366 +; CHECK-NEXT: movt r1, #59880 +; CHECK-NEXT: str.w r1, [r0, #486] +; CHECK-NEXT: movw r1, #58338 +; CHECK-NEXT: movt r1, #58852 +; CHECK-NEXT: str.w r1, [r0, #482] +; CHECK-NEXT: movw r1, #57310 +; CHECK-NEXT: movt r1, #57824 +; CHECK-NEXT: str.w r1, [r0, #478] +; CHECK-NEXT: movw r1, #56282 +; CHECK-NEXT: movt r1, #56796 +; CHECK-NEXT: str.w r1, [r0, #474] +; CHECK-NEXT: movw r1, #55254 +; CHECK-NEXT: movt r1, #55768 +; CHECK-NEXT: str.w r1, [r0, #470] +; CHECK-NEXT: movw r1, #54226 +; CHECK-NEXT: movt r1, #54740 +; CHECK-NEXT: str.w r1, [r0, #466] +; CHECK-NEXT: movw r1, #53198 +; CHECK-NEXT: movt r1, #53712 +; CHECK-NEXT: str.w r1, [r0, #462] +; CHECK-NEXT: movw r1, #52170 +; CHECK-NEXT: movt r1, #52684 +; CHECK-NEXT: str.w r1, [r0, #458] +; CHECK-NEXT: movw r1, #51142 +; CHECK-NEXT: movt r1, #51656 +; CHECK-NEXT: str.w r1, [r0, #454] +; CHECK-NEXT: movw r1, #50114 +; CHECK-NEXT: movt r1, #50628 +; CHECK-NEXT: str.w r1, [r0, #450] +; CHECK-NEXT: movw r1, #49086 +; CHECK-NEXT: movt r1, #49600 +; CHECK-NEXT: str.w r1, [r0, #446] +; CHECK-NEXT: movw r1, #48058 +; CHECK-NEXT: movt r1, #48572 +; CHECK-NEXT: str.w r1, [r0, #442] +; CHECK-NEXT: movw r1, #47030 +; CHECK-NEXT: movt r1, #47544 +; CHECK-NEXT: str.w r1, [r0, #438] ; CHECK-NEXT: movw r1, #46002 ; CHECK-NEXT: movt r1, #46516 ; CHECK-NEXT: str.w r1, [r0, #434] -; CHECK-NEXT: movw r1, #47030 -; CHECK-NEXT: strh.w r1, [r0, #438] -; CHECK-NEXT: movw r1, #47544 -; CHECK-NEXT: movt r1, #48058 -; CHECK-NEXT: str.w r1, [r0, #440] -; CHECK-NEXT: movw r1, #48572 -; CHECK-NEXT: movt r1, #49086 -; CHECK-NEXT: str.w r1, [r0, #444] -; CHECK-NEXT: movw r1, #49600 -; CHECK-NEXT: strh.w r1, [r0, #448] -; CHECK-NEXT: movs r1, #194 -; CHECK-NEXT: strb.w r1, [r0, #450] -; CHECK-NEXT: movw r1, #50371 -; CHECK-NEXT: movt r1, #50885 -; CHECK-NEXT: str.w r1, [r0, #451] -; CHECK-NEXT: movw r1, #51399 -; CHECK-NEXT: movt r1, #51913 -; CHECK-NEXT: str.w r1, [r0, #455] -; CHECK-NEXT: movw r1, #52427 -; CHECK-NEXT: movt r1, #52941 -; CHECK-NEXT: str.w r1, [r0, #459] -; CHECK-NEXT: movw r1, #53455 -; CHECK-NEXT: movt r1, #53969 -; CHECK-NEXT: str.w r1, [r0, #463] -; CHECK-NEXT: movw r1, #54483 -; CHECK-NEXT: strh.w r1, [r0, #467] -; CHECK-NEXT: movw r1, #54997 -; CHECK-NEXT: movt r1, #55511 -; CHECK-NEXT: str.w r1, [r0, #469] -; CHECK-NEXT: movw r1, #56025 -; CHECK-NEXT: movt r1, #56539 -; CHECK-NEXT: str.w r1, [r0, #473] -; CHECK-NEXT: movw r1, #57053 -; CHECK-NEXT: movt r1, #57567 -; CHECK-NEXT: str.w r1, [r0, #477] -; CHECK-NEXT: movw r1, #58081 -; CHECK-NEXT: movt r1, #58595 -; CHECK-NEXT: str.w r1, [r0, #481] -; CHECK-NEXT: movw r1, #59109 -; CHECK-NEXT: movt r1, #59623 -; CHECK-NEXT: str.w r1, [r0, #485] -; CHECK-NEXT: movw r1, #60137 -; CHECK-NEXT: strh.w r1, [r0, #489] -; CHECK-NEXT: movw r1, #60651 -; CHECK-NEXT: movt r1, #61165 -; CHECK-NEXT: str.w r1, [r0, #491] -; CHECK-NEXT: movw r1, #61679 -; CHECK-NEXT: strh.w r1, [r0, #495] -; CHECK-NEXT: movw r1, #62193 -; CHECK-NEXT: movt r1, #62707 -; CHECK-NEXT: str.w r1, [r0, #497] -; CHECK-NEXT: movw r1, #63221 -; CHECK-NEXT: movt r1, #63735 -; CHECK-NEXT: str.w r1, [r0, #501] -; CHECK-NEXT: movw r1, #64249 -; CHECK-NEXT: strh.w r1, [r0, #505] -; CHECK-NEXT: movs r1, #251 -; CHECK-NEXT: strb.w r1, [r0, #507] -; CHECK-NEXT: movw r1, #65020 -; CHECK-NEXT: movt r1, #65534 -; CHECK-NEXT: str.w r1, [r0, #508] +; CHECK-NEXT: movw r1, #44974 +; CHECK-NEXT: movt r1, #45488 +; CHECK-NEXT: str.w r1, [r0, #430] +; CHECK-NEXT: movw r1, #43946 +; CHECK-NEXT: movt r1, #44460 +; CHECK-NEXT: str.w r1, [r0, #426] +; CHECK-NEXT: movw r1, #42918 +; CHECK-NEXT: movt r1, #43432 +; CHECK-NEXT: str.w r1, [r0, #422] +; CHECK-NEXT: movw r1, #41890 +; CHECK-NEXT: movt r1, #42404 +; CHECK-NEXT: str.w r1, [r0, #418] +; CHECK-NEXT: movw r1, #40862 +; CHECK-NEXT: movt r1, #41376 +; CHECK-NEXT: str.w r1, [r0, #414] +; CHECK-NEXT: movw r1, #39834 +; CHECK-NEXT: movt r1, #40348 +; CHECK-NEXT: str.w r1, [r0, #410] +; CHECK-NEXT: movw r1, #38806 +; CHECK-NEXT: movt r1, #39320 +; CHECK-NEXT: str.w r1, [r0, #406] +; CHECK-NEXT: movw r1, #37778 +; CHECK-NEXT: movt r1, #38292 +; CHECK-NEXT: str.w r1, [r0, #402] +; CHECK-NEXT: movw r1, #36750 +; CHECK-NEXT: movt r1, #37264 +; CHECK-NEXT: str.w r1, [r0, #398] +; CHECK-NEXT: movw r1, #35722 +; CHECK-NEXT: movt r1, #36236 +; CHECK-NEXT: str.w r1, [r0, #394] ; CHECK-NEXT: bx lr %v190 = getelementptr inbounds i8, i8* %v50, i32 394 store i8 -118, i8* %v190, align 1 diff --git a/test/CodeGen/ARM/atomic-ops-m33.ll b/test/CodeGen/ARM/atomic-ops-m33.ll new file mode 100644 index 0000000000000000000000000000000000000000..474ad8960cf5b1860690ce851ecab5462d711b64 --- /dev/null +++ b/test/CodeGen/ARM/atomic-ops-m33.ll @@ -0,0 +1,140 @@ +; RUN: llc -mtriple=thumbv7-none-eabi -mcpu=cortex-m33 -verify-machineinstrs -o - %s | FileCheck %s + +define i8 @test_atomic_load_add_i8(i8 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_add_i8: + %old = atomicrmw add i8* @var8, i8 %offset seq_cst +; CHECK-NOT: dmb +; CHECK-NOT: mcr +; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8 +; CHECK: movt r[[ADDR]], :upper16:var8 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]] + ; r0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0 +; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 +; CHECK-NOT: dmb +; CHECK-NOT: mcr + +; CHECK: mov r0, r[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_load_add_i16(i16 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_add_i16: + %old = atomicrmw add i16* @var16, i16 %offset acquire +; CHECK-NOT: dmb +; CHECK-NOT: mcr +; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16 +; CHECK: movt r[[ADDR]], :upper16:var16 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]] + ; r0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0 +; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 +; CHECK-NOT: dmb +; CHECK-NOT: mcr + +; CHECK: mov r0, r[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_load_add_i32(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_add_i32: + %old = atomicrmw add i32* @var32, i32 %offset release +; CHECK-NOT: dmb +; CHECK-NOT: mcr +; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32 +; CHECK: movt r[[ADDR]], :upper16:var32 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]] + ; r0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0 +; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 +; CHECK-NOT: dmb +; CHECK-NOT: mcr + +; CHECK: mov r0, r[[OLD]] + ret i32 %old +} + +define void @test_atomic_load_add_i64(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_add_i64: +; CHECK: bl __sync_fetch_and_add_8 + %old = atomicrmw add i64* @var64, i64 %offset monotonic + store i64 %old, i64* @var64 + ret void +} + +define i8 @test_load_acquire_i8(i8* %ptr) { +; CHECK-LABEL: test_load_acquire_i8: +; CHECK: ldab r0, [r0] + %val = load atomic i8, i8* %ptr seq_cst, align 1 + ret i8 %val +} + +define i16 @test_load_acquire_i16(i16* %ptr) { +; CHECK-LABEL: test_load_acquire_i16: +; CHECK: ldah r0, [r0] + %val = load atomic i16, i16* %ptr acquire, align 2 + ret i16 %val +} + +define i32 @test_load_acquire_i32(i32* %ptr) { +; CHECK-LABEL: test_load_acquire_i32: +; CHECK: lda r0, [r0] + %val = load atomic i32, i32* %ptr acquire, align 4 + ret i32 %val +} + +define i64 @test_load_acquire_i64(i64* %ptr) { +; CHECK-LABEL: test_load_acquire_i64: +; CHECK: bl __atomic_load + %val = load atomic i64, i64* %ptr acquire, align 4 + ret i64 %val +} + +define void @test_store_release_i8(i8 %val, i8* %ptr) { +; CHECK-LABEL: test_store_release_i8: +; CHECK: stlb r0, [r1] + store atomic i8 %val, i8* %ptr seq_cst, align 1 + ret void +} + +define void @test_store_release_i16(i16 %val, i16* %ptr) { +; CHECK-LABEL: test_store_release_i16: +; CHECK: stlh r0, [r1] + store atomic i16 %val, i16* %ptr release, align 2 + ret void +} + +define void @test_store_release_i32(i32 %val, i32* %ptr) { +; CHECK-LABEL: test_store_release_i32: +; CHECK: stl r0, [r1] + store atomic i32 %val, i32* %ptr seq_cst, align 4 + ret void +} + +define void @test_store_release_i64(i64 %val, i64* %ptr) { +; CHECK-LABEL: test_store_release_i64: +; CHECK: bl __atomic_store + store atomic i64 %val, i64* %ptr seq_cst, align 4 + ret void +} + + +@var8 = global i8 0 +@var16 = global i16 0 +@var32 = global i32 0 +@var64 = global i64 0 diff --git a/test/CodeGen/ARM/cmp.ll b/test/CodeGen/ARM/cmp.ll index 5c1630912579b08ed09353c5d22936a9dc1cf8e0..2e6b20cce7323d6410fb56dd98c5b5824fa0f12d 100644 --- a/test/CodeGen/ARM/cmp.ll +++ b/test/CodeGen/ARM/cmp.ll @@ -39,15 +39,11 @@ define i1 @f6(i32 %a, i32 %b) { define i1 @f7(i32 %a, i32 %b) { ; CHECK-LABEL: f7: -; CHECK: sub r2, r0, r1, lsr #6 -; CHECK: cmp r0, r1, lsr #6 -; CHECK: movwne r2, #1 -; CHECK: mov r0, r2 -; CHECK-T2: sub.w r2, r0, r1, lsr #6 -; CHECK-T2: cmp.w r0, r1, lsr #6 +; CHECK: subs r0, r0, r1, lsr #6 +; CHECK: movwne r0, #1 +; CHECK-T2: subs.w r0, r0, r1, lsr #6 ; CHECK-T2: it ne -; CHECK-T2: movne r2, #1 -; CHECK-T2: mov r0, r2 +; CHECK-T2: movne r0, #1 %tmp = lshr i32 %b, 6 %tmp1 = icmp ne i32 %a, %tmp ret i1 %tmp1 @@ -68,15 +64,11 @@ define i1 @f8(i32 %a, i32 %b) { define i1 @f9(i32 %a) { ; CHECK-LABEL: f9: -; CHECK: sub r1, r0, r0, ror #8 -; CHECK: cmp r0, r0, ror #8 -; CHECK: movwne r1, #1 -; CHECK: mov r0, r1 -; CHECK-T2: sub.w r1, r0, r0, ror #8 -; CHECK-T2: cmp.w r0, r0, ror #8 +; CHECK: subs r0, r0, r0, ror #8 +; CHECK: movwne r0, #1 +; CHECK-T2: subs.w r0, r0, r0, ror #8 ; CHECK-T2: it ne -; CHECK-T2: movne r1, #1 -; CHECK-T2: mov r0, r1 +; CHECK-T2: movne r0, #1 %l8 = shl i32 %a, 24 %r8 = lshr i32 %a, 8 %tmp = or i32 %l8, %r8 diff --git a/test/CodeGen/ARM/codemodel.ll b/test/CodeGen/ARM/codemodel.ll new file mode 100644 index 0000000000000000000000000000000000000000..ec9982faba12bfd536f6ba8992c48bfa19ecccd1 --- /dev/null +++ b/test/CodeGen/ARM/codemodel.ll @@ -0,0 +1,9 @@ +; RUN: not llc -verify-machineinstrs -o - -mtriple=arm-none-eabi -code-model=tiny < %s 2>&1 | FileCheck %s --check-prefix=TINY +; RUN: not llc -verify-machineinstrs -o - -mtriple=arm-none-eabi -code-model=kernel < %s 2>&1 | FileCheck %s --check-prefix=KERNEL + +; TINY: Target does not support the tiny CodeModel +; KERNEL: Target does not support the kernel CodeModel + +define void @foo() { + ret void +} diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll index fa4d79c604af2727ab3e52e44c688340c653b8f7..67628dd2a3a2dc106121befa490abaabb37eed92 100644 --- a/test/CodeGen/ARM/debug-info-qreg.ll +++ b/test/CodeGen/ARM/debug-info-qreg.ll @@ -17,12 +17,12 @@ target triple = "thumbv7-apple-macosx10.6.7" declare <4 x float> @test0001(float) nounwind readnone ssp -define i32 @main(i32 %argc, i8** nocapture %argv, <4 x float> %x) nounwind ssp !dbg !10 { +define i32 @main(i32 %argc, i8** nocapture %argv, <4 x float> %x, <4 x float> %y) nounwind ssp !dbg !10 { entry: br label %for.body9 for.body9: ; preds = %for.body9, %entry - %add19 = fadd <4 x float> %x, , !dbg !39 + %add19 = fadd <4 x float> %x, %y, !dbg !39 br i1 undef, label %for.end54, label %for.body9, !dbg !44 for.end54: ; preds = %for.body9 diff --git a/test/CodeGen/ARM/float-helpers.s b/test/CodeGen/ARM/float-helpers.s index 0a9f2490d20d01e016af2f860e4155f5d2004e89..42ab56084d45cdeb966add4f00931e693c5d88dc 100644 --- a/test/CodeGen/ARM/float-helpers.s +++ b/test/CodeGen/ARM/float-helpers.s @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -asm-verbose=false -mattr=-vfp2 -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-SOFT -; RUN: llc -asm-verbose=false -mattr=-vfp2 -mtriple=arm-eabi -meabi=gnu < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-SOFT -; RUN: llc -asm-verbose=false -mattr=+vfp3 -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-SOFTFP -; RUN: llc -asm-verbose=false -mattr=+vfp3 -meabi=gnu -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-SOFTFP -; RUN: llc -asm-verbose=false -mattr=+vfp3 -float-abi=hard -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-DP -; RUN: llc -asm-verbose=false -mattr=+vfp3 -float-abi=hard -meabi=gnu -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-DP -; RUN: llc -asm-verbose=false -mattr=+vfp3,+fp-only-sp -float-abi=hard -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-SPONLY -; RUN: llc -asm-verbose=false -mattr=+vfp3,+fp-only-sp -float-abi=hard -mtriple=arm-eabi -meabi=gnu < %s | FileCheck %s -check-prefix=CHECK-ALL -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-SPONLY +; RUN: llc -asm-verbose=false -mattr=-vfp2 -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-SOFT +; RUN: llc -asm-verbose=false -mattr=-vfp2 -mtriple=arm-eabi -meabi=gnu < %s | FileCheck %s -check-prefix=CHECK-SOFT +; RUN: llc -asm-verbose=false -mattr=+vfp3 -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-SOFTFP +; RUN: llc -asm-verbose=false -mattr=+vfp3 -meabi=gnu -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-SOFTFP +; RUN: llc -asm-verbose=false -mattr=+vfp3 -float-abi=hard -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-DP +; RUN: llc -asm-verbose=false -mattr=+vfp3 -float-abi=hard -meabi=gnu -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-DP +; RUN: llc -asm-verbose=false -mattr=+vfp3,+fp-only-sp -float-abi=hard -mtriple=arm-eabi < %s | FileCheck %s -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-SPONLY +; RUN: llc -asm-verbose=false -mattr=+vfp3,+fp-only-sp -float-abi=hard -mtriple=arm-eabi -meabi=gnu < %s | FileCheck %s -check-prefix=CHECK-HARDFP-SP -check-prefix=CHECK-HARDFP-SPONLY ; The Runtime ABI for the ARM Architecture IHI0043 section 4.1.2 The ; floating-point helper functions to always use the base AAPCS (soft-float) diff --git a/test/CodeGen/ARM/fp16-vld.ll b/test/CodeGen/ARM/fp16-vld.ll new file mode 100644 index 0000000000000000000000000000000000000000..5052b99e6c9dac2a1474a69c947e52506ee181a2 --- /dev/null +++ b/test/CodeGen/ARM/fp16-vld.ll @@ -0,0 +1,48 @@ +; RUN: llc -asm-verbose=false < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv8.2a-arm-unknown-eabihf" + +define dso_local void @vec8(half* nocapture readonly %V, i32 %N) local_unnamed_addr #0 { +; CHECK: .LBB0_1: +; CHECK-NEXT: vld1.16 {d16, d17}, [r0]! +; CHECK-NEXT: subs r1, r1, #8 +; CHECK-NEXT: bne .LBB0_1 +entry: + br label %vector.body + +vector.body: + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds half, half* %V, i32 %index + %1 = bitcast half* %0 to <8 x half>* + %wide.load = load volatile <8 x half>, <8 x half>* %1, align 2 + %index.next = add i32 %index, 8 + %cmp = icmp eq i32 %index.next, %N + br i1 %cmp, label %byeblock, label %vector.body + +byeblock: + ret void +} + +define dso_local void @vec4(half* nocapture readonly %V, i32 %N) local_unnamed_addr #0 { +; CHECK: .LBB1_1: +; CHECK-NEXT: vld1.16 {d16}, [r0]! +; CHECK-NEXT: subs r1, r1, #4 +; CHECK-NEXT: bne .LBB1_1 +entry: + br label %vector.body + +vector.body: + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds half, half* %V, i32 %index + %1 = bitcast half* %0 to <4 x half>* + %wide.load = load volatile <4 x half>, <4 x half>* %1, align 2 + %index.next = add i32 %index, 4 + %cmp = icmp eq i32 %index.next, %N + br i1 %cmp, label %byeblock, label %vector.body + +byeblock: + ret void +} + +attributes #0 = { norecurse nounwind readonly "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "target-cpu"="generic" "target-features"="+armv8.2-a,+fullfp16,+strict-align,-thumb-mode" "unsafe-fp-math"="true" "use-soft-float"="false" } diff --git a/test/CodeGen/ARM/ldrcppic.ll b/test/CodeGen/ARM/ldrcppic.ll new file mode 100644 index 0000000000000000000000000000000000000000..c7727290d842b74c90ab47b842a8d8e6aaed47b1 --- /dev/null +++ b/test/CodeGen/ARM/ldrcppic.ll @@ -0,0 +1,56 @@ +; The failure is caused by ARM LDRcp/PICADD pairs. In PIC mode, the constant pool +; need a label to do address computation. This label is emitted when backend emits +; PICADD. When the target becomes dead, PICADD will be deleted. without this patch +; LDRcp is dead but not being deleted. This will cause a dead contant pool entry +; using a non existing label. This will cause an error in MC object emitting pass. + +; RUN: llc -relocation-model=pic -mcpu=cortex-a53 %s -filetype=obj -o - | llvm-nm - | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv8-unknown-linux-android" + +@_ZN15UsecaseSelector25AllowedImplDefinedFormatsE = external dso_local unnamed_addr constant <{ i32, i32, i32, i32, [12 x i32] }>, align 4 + +; Function Attrs: noinline nounwind optnone sspstrong uwtable +define dso_local fastcc void @_ZN15UsecaseSelector26IsAllowedImplDefinedFormatE15ChiBufferFormatj() unnamed_addr #1 align 2 { + br label %1 + +;